From 055e82a6fee79d3e34524329c4149405d4d5536a Mon Sep 17 00:00:00 2001 From: tcezard Date: Thu, 6 Jun 2024 17:14:45 +0100 Subject: [PATCH] replace species folder with assembly folder --- .../analyze_vcf_validation_results.py | 15 +++++------ .../count_rs_ids_in_release_files.py | 14 +++++----- .../create_release_properties_file.py | 6 ++--- .../merge_dbsnp_eva_release_files.py | 26 +++++++++---------- .../release_common_utils.py | 23 ++++++++-------- .../run_release_for_assembly.nf | 16 ++++++------ .../sort_bgzip_index_release_files.py | 20 +++++++------- .../update_sequence_names_to_ena.py | 12 ++++----- .../validate_release_vcf_files.py | 12 ++++----- .../validate_rs_release_files.py | 16 ++++++------ 10 files changed, 79 insertions(+), 81 deletions(-) diff --git a/eva-accession-release-automation/run_release_in_embassy/analyze_vcf_validation_results.py b/eva-accession-release-automation/run_release_in_embassy/analyze_vcf_validation_results.py index 1ea1dd2e0..0d83083eb 100644 --- a/eva-accession-release-automation/run_release_in_embassy/analyze_vcf_validation_results.py +++ b/eva-accession-release-automation/run_release_in_embassy/analyze_vcf_validation_results.py @@ -70,22 +70,21 @@ def analyze_asm_report_files(asm_report_files): return exit_code -def analyze_vcf_validation_results(species_release_folder, assembly_accession): - vcf_validation_report_files = glob.glob("{0}/{1}/{2}".format(species_release_folder, assembly_accession, - vcf_validation_output_file_pattern)) +def analyze_vcf_validation_results(assembly_release_folder, assembly_accession): + vcf_validation_report_files = glob.glob("{0}/{2}".format(assembly_release_folder, assembly_accession, + vcf_validation_output_file_pattern)) exit_code = analyze_vcf_validation_files(vcf_validation_report_files) - asm_report_files = glob.glob("{0}/{1}/{2}".format(species_release_folder, assembly_accession, - asm_report_output_file_pattern)) + asm_report_files = glob.glob("{0}/{2}".format(assembly_release_folder, 
assembly_accession, asm_report_output_file_pattern)) exit_code = exit_code or analyze_asm_report_files(asm_report_files) sys.exit(exit_code) -@click.option("--species-release-folder", required=True) +@click.option("--assembly-release-folder", required=True) @click.option("--assembly-accession", required=True) @click.command() -def main(species_release_folder, assembly_accession): +def main(assembly_release_folder, assembly_accession): logging_config.add_stdout_handler() - analyze_vcf_validation_results(species_release_folder, assembly_accession) + analyze_vcf_validation_results(assembly_release_folder, assembly_accession) if __name__ == '__main__': diff --git a/eva-accession-release-automation/run_release_in_embassy/count_rs_ids_in_release_files.py b/eva-accession-release-automation/run_release_in_embassy/count_rs_ids_in_release_files.py index d950fa52a..8d75004dc 100644 --- a/eva-accession-release-automation/run_release_in_embassy/count_rs_ids_in_release_files.py +++ b/eva-accession-release-automation/run_release_in_embassy/count_rs_ids_in_release_files.py @@ -22,19 +22,19 @@ from run_release_in_embassy.release_common_utils import get_release_vcf_file_name_genbank, get_release_text_file_name -def count_rs_ids_in_release_files(count_ids_script_path, taxonomy_id, assembly_accession, species_release_folder): - release_count_filename = os.path.join(species_release_folder, assembly_accession, "README_rs_ids_counts.txt") +def count_rs_ids_in_release_files(count_ids_script_path, taxonomy_id, assembly_accession, assembly_release_folder): + release_count_filename = os.path.join(assembly_release_folder, "README_rs_ids_counts.txt") with open(release_count_filename, "w") as release_count_file_handle: release_count_file_handle.write("# Unique RS ID counts\n") for vcf_file_category in release_vcf_file_categories: - release_vcf_file_name =
get_release_vcf_file_name_genbank(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category) num_ids_in_file = run_command_with_output("Counting RS IDs in file: " + release_vcf_file_name, "{0} {1}.gz".format(count_ids_script_path, release_vcf_file_name), return_process_output=True) release_count_file_handle.write(num_ids_in_file) for text_release_file_category in release_text_file_categories: - text_release_file_name = get_release_text_file_name(species_release_folder, taxonomy_id, assembly_accession, + text_release_file_name = get_release_text_file_name(assembly_release_folder, taxonomy_id, assembly_accession, text_release_file_category) num_ids_in_file = run_command_with_output("Counting RS IDs in file: " + text_release_file_name, "zcat {0}.gz | cut -f1 | uniq | wc -l" @@ -46,11 +46,11 @@ def count_rs_ids_in_release_files(count_ids_script_path, taxonomy_id, assembly_a @click.option("--count-ids-script-path", help="ex: /path/to/count/ids/script", required=True) @click.option("--taxonomy-id", help="ex: 9913", required=True) @click.option("--assembly-accession", help="ex: GCA_000003055.6", required=True) -@click.option("--species-release-folder", required=True) +@click.option("--assembly-release-folder", required=True) @click.command() -def main(count_ids_script_path, taxonomy_id, assembly_accession, species_release_folder): +def main(count_ids_script_path, taxonomy_id, assembly_accession, assembly_release_folder): logging_config.add_stdout_handler() - count_rs_ids_in_release_files(count_ids_script_path, taxonomy_id, assembly_accession, species_release_folder) + count_rs_ids_in_release_files(count_ids_script_path, taxonomy_id, assembly_accession, assembly_release_folder) if __name__ == "__main__": diff --git a/eva-accession-release-automation/run_release_in_embassy/create_release_properties_file.py b/eva-accession-release-automation/run_release_in_embassy/create_release_properties_file.py index fcd7acfa6..0a7a298e6 100644 --- 
a/eva-accession-release-automation/run_release_in_embassy/create_release_properties_file.py +++ b/eva-accession-release-automation/run_release_in_embassy/create_release_properties_file.py @@ -64,14 +64,14 @@ def create_release_properties_file_for_assembly(private_config_xml_file, profile @click.option("--release-species-inventory-table", default="eva_progress_tracker.clustering_release_tracker", required=False) @click.option("--release-version", help="ex: 2", type=int, required=True) -@click.option("--species-release-folder", required=True) +@click.option("--assembly-release-folder", required=True) @click.command() def main(private_config_xml_file, profile, taxonomy_id, assembly_accession, release_species_inventory_table, - release_version, species_release_folder): + release_version, assembly_release_folder): logging_config.add_stdout_handler() create_release_properties_file_for_assembly(private_config_xml_file, profile, taxonomy_id, assembly_accession, release_species_inventory_table, release_version, - species_release_folder) + assembly_release_folder) if __name__ == "__main__": diff --git a/eva-accession-release-automation/run_release_in_embassy/merge_dbsnp_eva_release_files.py b/eva-accession-release-automation/run_release_in_embassy/merge_dbsnp_eva_release_files.py index 0c45aa7c7..47c901404 100644 --- a/eva-accession-release-automation/run_release_in_embassy/merge_dbsnp_eva_release_files.py +++ b/eva-accession-release-automation/run_release_in_embassy/merge_dbsnp_eva_release_files.py @@ -26,10 +26,10 @@ get_release_vcf_file_name, get_unsorted_release_vcf_file_name, get_unsorted_release_text_file_name -def move_release_files_to_unsorted_category(taxonomy_id, assembly_accession, species_release_folder, vcf_file_category, +def move_release_files_to_unsorted_category(taxonomy_id, assembly_accession, assembly_release_folder, vcf_file_category, unsorted_release_file_path): unsorted_release_file_name = os.path.basename(unsorted_release_file_path) - release_file_path 
= get_release_vcf_file_name(species_release_folder, taxonomy_id, assembly_accession, + release_file_path = get_release_vcf_file_name(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category) release_file_name = os.path.basename(release_file_path) for variant_source in ["eva", "dbsnp"]: @@ -87,17 +87,17 @@ def merge_dbsnp_eva_vcf_headers(file1, file2, output_file): def merge_dbsnp_eva_vcf_files(bgzip_path, bcftools_path, vcf_sort_script_path, taxonomy_id, assembly_accession, - species_release_folder, vcf_file_category, data_sources): + assembly_release_folder, vcf_file_category, data_sources): vcf_merge_commands = [] # This is the desired post-merge output file name in the format _.vcf # ex: 60711_GCA_000409795.2_merged_ids.vcf - unsorted_release_file_path = get_unsorted_release_vcf_file_name(species_release_folder, taxonomy_id, assembly_accession, + unsorted_release_file_path = get_unsorted_release_vcf_file_name(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category) unsorted_release_file_name = os.path.basename(unsorted_release_file_path) # After release pipeline is run on a species, the default VCF output files are in the formats like below # ex: eva_GCA_000409795.2_merged_ids.vcf and dbsnp_GCA_000409795.2_merged_ids.vcf # Move them to files with _unsorted suffix to avoid confusion - move_release_files_to_unsorted_category(taxonomy_id, assembly_accession, species_release_folder, vcf_file_category, + move_release_files_to_unsorted_category(taxonomy_id, assembly_accession, assembly_release_folder, vcf_file_category, unsorted_release_file_path) dbsnp_vcf_file_pattern = unsorted_release_file_path.replace(unsorted_release_file_name, "dbsnp*_" + unsorted_release_file_name.replace(f'{str(taxonomy_id)}_', '')) @@ -134,10 +134,10 @@ def merge_dbsnp_eva_vcf_files(bgzip_path, bcftools_path, vcf_sort_script_path, t return vcf_merge_commands -def merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, species_release_folder, 
text_release_file_category, +def merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, assembly_release_folder, text_release_file_category, data_sources): text_release_file_merge_commands = [] - unsorted_release_file_path = get_unsorted_release_text_file_name(species_release_folder, taxonomy_id, assembly_accession, + unsorted_release_file_path = get_unsorted_release_text_file_name(assembly_release_folder, taxonomy_id, assembly_accession, text_release_file_category) unsorted_release_file_name = os.path.basename(unsorted_release_file_path) # After release is run on a species, the default text (i.e., non-vcf) output files have ".unsorted.txt" file suffix @@ -174,7 +174,7 @@ def merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, species_release_ def merge_dbsnp_eva_release_files(private_config_xml_file, profile, bgzip_path, bcftools_path, vcf_sort_script_path, taxonomy_id, assembly_accession, release_species_inventory_table, release_version, - species_release_folder): + assembly_release_folder): with get_metadata_connection_handle(profile, private_config_xml_file) as metadata_connection_handle: release_info = get_release_inventory_info_for_assembly(taxonomy_id, assembly_accession, release_species_inventory_table, @@ -182,10 +182,10 @@ def merge_dbsnp_eva_release_files(private_config_xml_file, profile, bgzip_path, merge_commands = [] for vcf_file_category in release_vcf_file_categories: merge_commands.extend(merge_dbsnp_eva_vcf_files(bgzip_path, bcftools_path, vcf_sort_script_path, - taxonomy_id, assembly_accession, species_release_folder, + taxonomy_id, assembly_accession, assembly_release_folder, vcf_file_category, release_info["sources"])) for text_release_file_category in release_text_file_categories: - merge_commands.extend(merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, species_release_folder, + merge_commands.extend(merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, assembly_release_folder, text_release_file_category, 
release_info["sources"])) final_merge_command = " && ".join(merge_commands) run_command_with_output(f"Merging dbSNP and EVA release files for taxonomy {taxonomy_id} and assembly {assembly_accession}", @@ -202,14 +202,14 @@ def merge_dbsnp_eva_release_files(private_config_xml_file, profile, bgzip_path, @click.option("--release-species-inventory-table", default="eva_progress_tracker.clustering_release_tracker", required=False) @click.option("--release-version", help="ex: 2", type=int, required=True) -@click.option("--species-release-folder", required=True) +@click.option("--assembly-release-folder", required=True) @click.command() def main(private_config_xml_file, profile, bgzip_path, bcftools_path, vcf_sort_script_path, taxonomy_id, - assembly_accession, release_species_inventory_table, release_version, species_release_folder): + assembly_accession, release_species_inventory_table, release_version, assembly_release_folder): logging_config.add_stdout_handler() merge_dbsnp_eva_release_files(private_config_xml_file, profile, bgzip_path, bcftools_path, vcf_sort_script_path, taxonomy_id, assembly_accession, release_species_inventory_table, release_version, - species_release_folder) + assembly_release_folder) if __name__ == "__main__": diff --git a/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py b/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py index 8bc966245..d5017053d 100644 --- a/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py +++ b/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py @@ -56,32 +56,31 @@ def get_bgzip_bcftools_index_commands_for_file(bgzip_path, bcftools_path, file): return commands -def get_release_vcf_file_name(species_release_folder, taxonomy_id, assembly_accession, vcf_file_category): - return os.path.join(species_release_folder, assembly_accession, "{0}_{1}_{2}.vcf".format(taxonomy_id, - assembly_accession, - 
vcf_file_category)) +def get_release_vcf_file_name(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category): + return os.path.join(assembly_release_folder, "{0}_{1}_{2}.vcf".format(taxonomy_id, assembly_accession, + vcf_file_category)) -def get_release_vcf_file_name_genbank(species_release_folder, taxonomy_id, assembly_accession, vcf_file_category): +def get_release_vcf_file_name_genbank(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category): return os.path.join( - species_release_folder, assembly_accession, + assembly_release_folder, "{0}_{1}_{2}_with_genbank.vcf".format(taxonomy_id, assembly_accession, vcf_file_category) ) -def get_unsorted_release_vcf_file_name(species_release_folder, taxonomy_id, assembly_accession, vcf_file_category): - vcf_file_path = get_release_vcf_file_name(species_release_folder, taxonomy_id, assembly_accession, vcf_file_category) +def get_unsorted_release_vcf_file_name(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category): + vcf_file_path = get_release_vcf_file_name(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category) filename = os.path.basename(vcf_file_path) return vcf_file_path.replace(filename, filename.replace(".vcf", "_unsorted.vcf")) -def get_release_text_file_name(species_release_folder, taxonomy_id, assembly_accession, release_text_file_category): - return os.path.join(species_release_folder, assembly_accession, +def get_release_text_file_name(assembly_release_folder, taxonomy_id, assembly_accession, release_text_file_category): + return os.path.join(assembly_release_folder, "{0}_{1}_{2}.txt".format(taxonomy_id, assembly_accession, release_text_file_category)) -def get_unsorted_release_text_file_name(species_release_folder, taxonomy_id, assembly_accession, release_text_file_category): - release_text_file_path = get_release_text_file_name(species_release_folder, taxonomy_id, assembly_accession, +def 
get_unsorted_release_text_file_name(assembly_release_folder, taxonomy_id, assembly_accession, release_text_file_category): + release_text_file_path = get_release_text_file_name(assembly_release_folder, taxonomy_id, assembly_accession, release_text_file_category) filename = os.path.basename(release_text_file_path) return release_text_file_path.replace(filename, filename.replace(".txt", ".unsorted.txt")) diff --git a/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf b/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf index 8423ed164..7f71d5d5e 100644 --- a/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf +++ b/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf @@ -56,7 +56,7 @@ process run_release_for_assembly { script: """ export PYTHONPATH=$params.python_path - $params.executable.python_interpreter -m run_release_in_embassy.run_release_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder --release-jar-path $params.jar.release_pipeline 1>> $params.log_file 2>&1 + $params.executable.python_interpreter -m run_release_in_embassy.run_release_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --assembly-release-folder $params.assembly_folder --release-jar-path $params.jar.release_pipeline 1>> $params.log_file 2>&1 """ } @@ -73,7 +73,7 @@ process merge_dbsnp_eva_release_files { script: """ export 
PYTHONPATH=$params.python_path - $params.executable.python_interpreter -m run_release_in_embassy.merge_dbsnp_eva_release_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --bgzip-path $params.executable.bgzip --bcftools-path $params.executable.bcftools --vcf-sort-script-path $params.executable.sort_vcf_sorted_chromosomes --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 + $params.executable.python_interpreter -m run_release_in_embassy.merge_dbsnp_eva_release_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --bgzip-path $params.executable.bgzip --bcftools-path $params.executable.bcftools --vcf-sort-script-path $params.executable.sort_vcf_sorted_chromosomes --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --assembly-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 """ } @@ -90,7 +90,7 @@ process sort_bgzip_index_release_files { script: """ export PYTHONPATH=$params.python_path - $params.executable.python_interpreter -m run_release_in_embassy.sort_bgzip_index_release_files --bgzip-path $params.executable.bgzip --bcftools-path $params.executable.bcftools --vcf-sort-script-path $params.executable.sort_vcf_sorted_chromosomes --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 + $params.executable.python_interpreter -m run_release_in_embassy.sort_bgzip_index_release_files --bgzip-path $params.executable.bgzip --bcftools-path $params.executable.bcftools --vcf-sort-script-path 
$params.executable.sort_vcf_sorted_chromosomes --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --assembly-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 """ } @@ -107,7 +107,7 @@ process validate_release_vcf_files { script: """ export PYTHONPATH=$params.python_path - $params.executable.python_interpreter -m run_release_in_embassy.validate_release_vcf_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder --vcf-validator-path $params.executable.vcf_validator --assembly-checker-path $params.executable.vcf_assembly_checker 1>> $params.log_file 2>&1 + $params.executable.python_interpreter -m run_release_in_embassy.validate_release_vcf_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --assembly-release-folder $params.assembly_folder --vcf-validator-path $params.executable.vcf_validator --assembly-checker-path $params.executable.vcf_assembly_checker 1>> $params.log_file 2>&1 """ } @@ -124,7 +124,7 @@ process analyze_vcf_validation_results { script: """ export PYTHONPATH=$params.python_path - $params.executable.python_interpreter -m run_release_in_embassy.analyze_vcf_validation_results --species-release-folder $params.assembly_folder --assembly-accession $params.assembly 1>> $params.log_file 2>&1 + $params.executable.python_interpreter -m run_release_in_embassy.analyze_vcf_validation_results --assembly-release-folder $params.assembly_folder --assembly-accession $params.assembly 1>> $params.log_file 2>&1 """ } @@ -141,7 
+141,7 @@ process count_rs_ids_in_release_files { script: """ export PYTHONPATH=$params.python_path - $params.executable.python_interpreter -m run_release_in_embassy.count_rs_ids_in_release_files --count-ids-script-path $params.executable.count_ids_in_vcf --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 + $params.executable.python_interpreter -m run_release_in_embassy.count_rs_ids_in_release_files --count-ids-script-path $params.executable.count_ids_in_vcf --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --assembly-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 """ } @@ -158,7 +158,7 @@ process validate_rs_release_files { script: """ export PYTHONPATH=$params.python_path - $params.executable.python_interpreter -m run_release_in_embassy.validate_rs_release_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 + $params.executable.python_interpreter -m run_release_in_embassy.validate_rs_release_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --assembly-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 """ } @@ -175,7 +175,7 @@ process update_sequence_names_to_ena { script: """ export PYTHONPATH=$params.python_path - $params.executable.python_interpreter -m run_release_in_embassy.update_sequence_names_to_ena --taxonomy-id $params.taxonomy --assembly-accession $params.assembly 
--species-release-folder $params.assembly_folder --sequence-name-converter-path $params.executable.convert_vcf_file --bcftools-path $params.executable.bcftools 1>> $params.log_file 2>&1 + $params.executable.python_interpreter -m run_release_in_embassy.update_sequence_names_to_ena --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --assembly-release-folder $params.assembly_folder --sequence-name-converter-path $params.executable.convert_vcf_file --bcftools-path $params.executable.bcftools 1>> $params.log_file 2>&1 """ } diff --git a/eva-accession-release-automation/run_release_in_embassy/sort_bgzip_index_release_files.py b/eva-accession-release-automation/run_release_in_embassy/sort_bgzip_index_release_files.py index 15bbc09ca..f67900f40 100644 --- a/eva-accession-release-automation/run_release_in_embassy/sort_bgzip_index_release_files.py +++ b/eva-accession-release-automation/run_release_in_embassy/sort_bgzip_index_release_files.py @@ -25,20 +25,20 @@ def sort_bgzip_index_release_files(bgzip_path, bcftools_path, vcf_sort_script_path, taxonomy_id, assembly_accession, - species_release_folder): + assembly_release_folder): commands = [] # These files are left behind by the sort_vcf_sorted_chromosomes.sh script # To be idempotent, remove such files - commands.append("rm -f {0}/{1}/*.chromosomes".format(species_release_folder, assembly_accession)) + commands.append("rm -f {0}/*.chromosomes".format(assembly_release_folder)) for vcf_file_category in release_vcf_file_categories: - unsorted_release_file_name = get_unsorted_release_vcf_file_name(species_release_folder, taxonomy_id, + unsorted_release_file_name = get_unsorted_release_vcf_file_name(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category) - sorted_release_file_name = get_release_vcf_file_name_genbank(species_release_folder, taxonomy_id, + sorted_release_file_name = get_release_vcf_file_name_genbank(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category) 
if vcf_file_category == 'current_ids': commands.append( f"rm -f {sorted_release_file_name} && " - f"{bcftools_path} sort -T {species_release_folder} -m 2G -o {sorted_release_file_name} " + f"{bcftools_path} sort -T {assembly_release_folder} -m 2G -o {sorted_release_file_name} " f"{unsorted_release_file_name}" ) else: @@ -47,9 +47,9 @@ def sort_bgzip_index_release_files(bgzip_path, bcftools_path, vcf_sort_script_pa sorted_release_file_name)) commands.extend(get_bgzip_bcftools_index_commands_for_file(bgzip_path, bcftools_path, sorted_release_file_name)) for text_release_file_category in release_text_file_categories: - unsorted_release_file_name = get_unsorted_release_text_file_name(species_release_folder, taxonomy_id, + unsorted_release_file_name = get_unsorted_release_text_file_name(assembly_release_folder, taxonomy_id, assembly_accession, text_release_file_category) - sorted_release_file_name = get_release_text_file_name(species_release_folder, taxonomy_id, assembly_accession, + sorted_release_file_name = get_release_text_file_name(assembly_release_folder, taxonomy_id, assembly_accession, text_release_file_category) commands.append("(sort -V {1} | uniq > {2})".format(vcf_sort_script_path, unsorted_release_file_name, @@ -65,12 +65,12 @@ def sort_bgzip_index_release_files(bgzip_path, bcftools_path, vcf_sort_script_pa @click.option("--vcf-sort-script-path", help="ex: /path/to/vcf/sort/script", required=True) @click.option("--taxonomy-id", help="ex: 9913", required=True) @click.option("--assembly-accession", help="ex: GCA_000003055.6", required=True) -@click.option("--species-release-folder", required=True) +@click.option("--assembly-release-folder", required=True) @click.command() -def main(bgzip_path, bcftools_path, vcf_sort_script_path, taxonomy_id, assembly_accession, species_release_folder): +def main(bgzip_path, bcftools_path, vcf_sort_script_path, taxonomy_id, assembly_accession, assembly_release_folder): logging_config.add_stdout_handler() 
sort_bgzip_index_release_files(bgzip_path, bcftools_path, vcf_sort_script_path, taxonomy_id, assembly_accession, - species_release_folder) + assembly_release_folder) if __name__ == "__main__": diff --git a/eva-accession-release-automation/run_release_in_embassy/update_sequence_names_to_ena.py b/eva-accession-release-automation/run_release_in_embassy/update_sequence_names_to_ena.py index a15f5c975..b066ab3c4 100644 --- a/eva-accession-release-automation/run_release_in_embassy/update_sequence_names_to_ena.py +++ b/eva-accession-release-automation/run_release_in_embassy/update_sequence_names_to_ena.py @@ -21,12 +21,12 @@ from ebi_eva_common_pyutils.command_utils import run_command_with_output -def update_sequence_name(taxonomy_id, assembly_accession, species_release_folder, sequence_name_converter_path, +def update_sequence_name(taxonomy_id, assembly_accession, assembly_release_folder, sequence_name_converter_path, bcftools_path): for vcf_file_category in release_vcf_file_categories: - release_vcf_file_name = get_release_vcf_file_name_genbank(species_release_folder, taxonomy_id, + release_vcf_file_name = get_release_vcf_file_name_genbank(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category) - release_vcf_file_output_name = get_release_vcf_file_name(species_release_folder, taxonomy_id, + release_vcf_file_output_name = get_release_vcf_file_name(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category) # Commands run separately so the index isn't attempted if the conversion fails @@ -40,13 +40,13 @@ def update_sequence_name(taxonomy_id, assembly_accession, species_release_folder @click.option("--taxonomy-id", help="ex: 9913", required=True) @click.option("--assembly-accession", help="ex: GCA_000003055.6", required=True) -@click.option("--species-release-folder", required=True) +@click.option("--assembly-release-folder", required=True) @click.option("--sequence-name-converter-path", help="/path/to/vcf/sequence-name-converter", 
required=True) @click.option("--bcftools-path", help="ex: /path/to/bcftools/binary", required=True) @click.command() -def main(taxonomy_id, assembly_accession, species_release_folder, sequence_name_converter_path, bcftools_path): +def main(taxonomy_id, assembly_accession, assembly_release_folder, sequence_name_converter_path, bcftools_path): logging_config.add_stdout_handler() - update_sequence_name(taxonomy_id, assembly_accession, species_release_folder, sequence_name_converter_path, bcftools_path) + update_sequence_name(taxonomy_id, assembly_accession, assembly_release_folder, sequence_name_converter_path, bcftools_path) if __name__ == "__main__": diff --git a/eva-accession-release-automation/run_release_in_embassy/validate_release_vcf_files.py b/eva-accession-release-automation/run_release_in_embassy/validate_release_vcf_files.py index b3b543e2b..45ef71e1e 100644 --- a/eva-accession-release-automation/run_release_in_embassy/validate_release_vcf_files.py +++ b/eva-accession-release-automation/run_release_in_embassy/validate_release_vcf_files.py @@ -32,10 +32,10 @@ def remove_index_if_outdated(fasta_path): def validate_release_vcf_files(private_config_xml_file, profile, taxonomy_id, assembly_accession, - release_species_inventory_table, release_version, species_release_folder, + release_species_inventory_table, release_version, assembly_release_folder, vcf_validator_path, assembly_checker_path): run_command_with_output("Remove existing VCF validation and assembly report outputs...", - "rm -f {0}/{1}/{2} {0}/{1}/{3}".format(species_release_folder, assembly_accession, + "rm -f {0}/{2} {0}/{3}".format(assembly_release_folder, assembly_accession, vcf_validation_output_file_pattern, asm_report_output_file_pattern)) validate_release_vcf_files_commands = [] @@ -52,7 +52,7 @@ def validate_release_vcf_files(private_config_xml_file, profile, taxonomy_id, as for vcf_file_category in release_vcf_file_categories: - release_vcf_file_name =
get_release_vcf_file_name_genbank(species_release_folder, taxonomy_id, + release_vcf_file_name = get_release_vcf_file_name_genbank(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category) release_vcf_dir = os.path.dirname(release_vcf_file_name) if "multimap" not in vcf_file_category: @@ -75,15 +75,15 @@ def validate_release_vcf_files(private_config_xml_file, profile, taxonomy_id, as @click.option("--release-species-inventory-table", default="eva_progress_tracker.clustering_release_tracker", required=False) @click.option("--release-version", help="ex: 2", type=int, required=True) -@click.option("--species-release-folder", required=True) +@click.option("--assembly-release-folder", required=True) @click.option("--vcf-validator-path", help="/path/to/vcf/validator/binary", required=True) @click.option("--assembly-checker-path", help="/path/to/assembly/checker/binary", required=True) @click.command() def main(private_config_xml_file, profile, taxonomy_id, assembly_accession, release_species_inventory_table, release_version, - species_release_folder, vcf_validator_path, assembly_checker_path): + assembly_release_folder, vcf_validator_path, assembly_checker_path): logging_config.add_stdout_handler() validate_release_vcf_files(private_config_xml_file, profile, taxonomy_id, assembly_accession, - release_species_inventory_table, release_version, species_release_folder, + release_species_inventory_table, release_version, assembly_release_folder, vcf_validator_path, assembly_checker_path) diff --git a/eva-accession-release-automation/run_release_in_embassy/validate_rs_release_files.py b/eva-accession-release-automation/run_release_in_embassy/validate_rs_release_files.py index 23809e23b..e894be046 100644 --- a/eva-accession-release-automation/run_release_in_embassy/validate_rs_release_files.py +++ b/eva-accession-release-automation/run_release_in_embassy/validate_rs_release_files.py @@ -257,8 +257,8 @@ def 
read_next_batch_of_missing_ids(missing_rs_ids_file_handle): yield lines_read -def get_unique_release_rs_ids(species_release_folder, taxonomy_id, assembly_accession): - folder_prefix = os.path.join(species_release_folder, assembly_accession, f'{taxonomy_id}_{assembly_accession}') +def get_unique_release_rs_ids(assembly_release_folder, taxonomy_id, assembly_accession): + folder_prefix = os.path.join(assembly_release_folder, assembly_accession, f'{taxonomy_id}_{assembly_accession}') active_rs_ids_file = folder_prefix + "_current_ids_with_genbank.vcf.gz" merged_rs_ids_file = folder_prefix + "_merged_ids_with_genbank.vcf.gz" multimap_rs_ids_file = folder_prefix + "_multimap_ids_with_genbank.vcf.gz" @@ -460,7 +460,7 @@ def export_unique_rs_ids_from_mongo(mongo_database_handle, taxonomy_id, assembly def validate_rs_release_files(private_config_xml_file, profile, taxonomy_id, assembly_accession, release_species_inventory_table, - release_version, species_release_folder): + release_version, assembly_release_folder): port_forwarding_process_id, mongo_port, exit_code = None, None, -1 try: port_forwarding_process_id, mongo_port = open_mongo_port_to_tempmongo(private_config_xml_file, profile, taxonomy_id, @@ -469,10 +469,10 @@ def validate_rs_release_files(private_config_xml_file, profile, taxonomy_id, ass db_name_in_tempmongo_instance = get_release_db_name_in_tempmongo_instance(taxonomy_id, assembly_accession) with MongoClient(port=mongo_port) as client: db_handle = client[db_name_in_tempmongo_instance] - mongo_unique_rs_ids_file = os.path.join(species_release_folder, assembly_accession, + mongo_unique_rs_ids_file = os.path.join(assembly_release_folder, assembly_accession, "{0}_mongo_unique_rs_ids.txt".format(assembly_accession)) export_unique_rs_ids_from_mongo(db_handle, taxonomy_id, assembly_accession, mongo_unique_rs_ids_file) - unique_release_rs_ids_file = get_unique_release_rs_ids(species_release_folder, taxonomy_id, + unique_release_rs_ids_file = 
get_unique_release_rs_ids(assembly_release_folder, taxonomy_id, assembly_accession) missing_rs_ids_file = os.path.join(os.path.dirname(unique_release_rs_ids_file), assembly_accession + "_missing_ids.txt") @@ -496,13 +496,13 @@ def validate_rs_release_files(private_config_xml_file, profile, taxonomy_id, ass @click.option("--release-species-inventory-table", default="eva_progress_tracker.clustering_release_tracker", required=False) @click.option("--release-version", help="ex: 2", type=int, required=True) -@click.option("--species-release-folder", required=True) +@click.option("--assembly-release-folder", required=True) @click.command() def main(private_config_xml_file, profile, taxonomy_id, assembly_accession, release_species_inventory_table, - release_version, species_release_folder): + release_version, assembly_release_folder): logging_config.add_stdout_handler() validate_rs_release_files(private_config_xml_file, profile, taxonomy_id, assembly_accession, - release_species_inventory_table, release_version, species_release_folder) + release_species_inventory_table, release_version, assembly_release_folder) if __name__ == '__main__':