Skip to content

Commit

Permalink
replace species folder with assembly folder
Browse files Browse the repository at this point in the history
  • Loading branch information
tcezard committed Jun 6, 2024
1 parent e577312 commit 055e82a
Show file tree
Hide file tree
Showing 10 changed files with 79 additions and 81 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -70,22 +70,21 @@ def analyze_asm_report_files(asm_report_files):
return exit_code


def analyze_vcf_validation_results(assembly_release_folder, assembly_accession):
    """Analyse the validation outputs of one assembly and exit with the result.

    Globs the VCF validation reports located directly under
    ``assembly_release_folder`` and analyses them. The assembly report files
    are globbed as well, but they are only analysed when the VCF validation
    analysis returned a falsy exit code.

    :param assembly_release_folder: folder that directly contains the
        validation output files of the assembly
    :param assembly_accession: accession of the assembly; no longer part of
        the searched path, kept so existing callers keep working
    :raises SystemExit: always, carrying the resulting exit code
    """
    vcf_validation_report_files = glob.glob(
        "{0}/{1}".format(assembly_release_folder, vcf_validation_output_file_pattern))
    exit_code = analyze_vcf_validation_files(vcf_validation_report_files)
    # The placeholder index must match the number of arguments: "{2}" with only
    # two arguments raises IndexError before any report is looked up.
    asm_report_files = glob.glob(
        "{0}/{1}".format(assembly_release_folder, asm_report_output_file_pattern))
    # `or` short-circuits: a failing VCF validation is reported as-is and the
    # assembly report analysis is not run in that case.
    exit_code = exit_code or analyze_asm_report_files(asm_report_files)
    sys.exit(exit_code)


# Command-line entry point: --assembly-release-folder is the folder that holds
# the validation outputs, --assembly-accession the assembly being checked.
# NOTE: documented with comments on purpose, click would expose a docstring as
# the command's --help text.
@click.option("--assembly-release-folder", required=True)
@click.option("--assembly-accession", required=True)
@click.command()
def main(assembly_release_folder, assembly_accession):
    logging_config.add_stdout_handler()
    analyze_vcf_validation_results(assembly_release_folder, assembly_accession)


if __name__ == '__main__':
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,19 @@
from run_release_in_embassy.release_common_utils import get_release_vcf_file_name_genbank, get_release_text_file_name


def count_rs_ids_in_release_files(count_ids_script_path, taxonomy_id, assembly_accession, species_release_folder):
release_count_filename = os.path.join(species_release_folder, assembly_accession, "README_rs_ids_counts.txt")
def count_rs_ids_in_release_files(count_ids_script_path, taxonomy_id, assembly_accession, assembly_release_folder):
release_count_filename = os.path.join(assembly_release_folder, assembly_accession, "README_rs_ids_counts.txt")
with open(release_count_filename, "w") as release_count_file_handle:
release_count_file_handle.write("# Unique RS ID counts\n")
for vcf_file_category in release_vcf_file_categories:
release_vcf_file_name = get_release_vcf_file_name_genbank(species_release_folder, taxonomy_id, assembly_accession,
release_vcf_file_name = get_release_vcf_file_name_genbank(assembly_release_folder, taxonomy_id, assembly_accession,
vcf_file_category)
num_ids_in_file = run_command_with_output("Counting RS IDs in file: " + release_vcf_file_name,
"{0} {1}.gz".format(count_ids_script_path, release_vcf_file_name),
return_process_output=True)
release_count_file_handle.write(num_ids_in_file)
for text_release_file_category in release_text_file_categories:
text_release_file_name = get_release_text_file_name(species_release_folder, taxonomy_id, assembly_accession,
text_release_file_name = get_release_text_file_name(assembly_release_folder, taxonomy_id, assembly_accession,
text_release_file_category)
num_ids_in_file = run_command_with_output("Counting RS IDs in file: " + text_release_file_name,
"zcat {0}.gz | cut -f1 | uniq | wc -l"
Expand All @@ -46,11 +46,11 @@ def count_rs_ids_in_release_files(count_ids_script_path, taxonomy_id, assembly_a
# Command-line entry point: forwards the parsed options unchanged to
# count_rs_ids_in_release_files.
# NOTE: documented with comments on purpose, click would expose a docstring as
# the command's --help text.
@click.option("--count-ids-script-path", help="ex: /path/to/count/ids/script", required=True)
@click.option("--taxonomy-id", help="ex: 9913", required=True)
@click.option("--assembly-accession", help="ex: GCA_000003055.6", required=True)
@click.option("--assembly-release-folder", required=True)
@click.command()
def main(count_ids_script_path, taxonomy_id, assembly_accession, assembly_release_folder):
    logging_config.add_stdout_handler()
    count_rs_ids_in_release_files(count_ids_script_path, taxonomy_id, assembly_accession, assembly_release_folder)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,14 @@ def create_release_properties_file_for_assembly(private_config_xml_file, profile
@click.option("--release-species-inventory-table", default="eva_progress_tracker.clustering_release_tracker",
required=False)
@click.option("--release-version", help="ex: 2", type=int, required=True)
@click.option("--species-release-folder", required=True)
@click.option("--assembly-release-folder", required=True)
@click.command()
def main(private_config_xml_file, profile, taxonomy_id, assembly_accession, release_species_inventory_table,
release_version, species_release_folder):
release_version, assembly_release_folder):
logging_config.add_stdout_handler()
create_release_properties_file_for_assembly(private_config_xml_file, profile, taxonomy_id, assembly_accession,
release_species_inventory_table, release_version,
species_release_folder)
assembly_release_folder)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@
get_release_vcf_file_name, get_unsorted_release_vcf_file_name, get_unsorted_release_text_file_name


def move_release_files_to_unsorted_category(taxonomy_id, assembly_accession, species_release_folder, vcf_file_category,
def move_release_files_to_unsorted_category(taxonomy_id, assembly_accession, assembly_release_folder, vcf_file_category,
unsorted_release_file_path):
unsorted_release_file_name = os.path.basename(unsorted_release_file_path)
release_file_path = get_release_vcf_file_name(species_release_folder, taxonomy_id, assembly_accession,
release_file_path = get_release_vcf_file_name(assembly_release_folder, taxonomy_id, assembly_accession,
vcf_file_category)
release_file_name = os.path.basename(release_file_path)
for variant_source in ["eva", "dbsnp"]:
Expand Down Expand Up @@ -87,17 +87,17 @@ def merge_dbsnp_eva_vcf_headers(file1, file2, output_file):


def merge_dbsnp_eva_vcf_files(bgzip_path, bcftools_path, vcf_sort_script_path, taxonomy_id, assembly_accession,
species_release_folder, vcf_file_category, data_sources):
assembly_release_folder, vcf_file_category, data_sources):
vcf_merge_commands = []
# This is the desired post-merge output file name in the format <assembly>_<category>.vcf
# ex: 60711_GCA_000409795.2_merged_ids.vcf
unsorted_release_file_path = get_unsorted_release_vcf_file_name(species_release_folder, taxonomy_id, assembly_accession,
unsorted_release_file_path = get_unsorted_release_vcf_file_name(assembly_release_folder, taxonomy_id, assembly_accession,
vcf_file_category)
unsorted_release_file_name = os.path.basename(unsorted_release_file_path)
# After release pipeline is run on a species, the default VCF output files are in the formats like below
# ex: eva_GCA_000409795.2_merged_ids.vcf and dbsnp_GCA_000409795.2_merged_ids.vcf
# Move them to files with _unsorted suffix to avoid confusion
move_release_files_to_unsorted_category(taxonomy_id, assembly_accession, species_release_folder, vcf_file_category,
move_release_files_to_unsorted_category(taxonomy_id, assembly_accession, assembly_release_folder, vcf_file_category,
unsorted_release_file_path)
dbsnp_vcf_file_pattern = unsorted_release_file_path.replace(unsorted_release_file_name,
"dbsnp*_" + unsorted_release_file_name.replace(f'{str(taxonomy_id)}_', ''))
Expand Down Expand Up @@ -134,10 +134,10 @@ def merge_dbsnp_eva_vcf_files(bgzip_path, bcftools_path, vcf_sort_script_path, t
return vcf_merge_commands


def merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, species_release_folder, text_release_file_category,
def merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, assembly_release_folder, text_release_file_category,
data_sources):
text_release_file_merge_commands = []
unsorted_release_file_path = get_unsorted_release_text_file_name(species_release_folder, taxonomy_id, assembly_accession,
unsorted_release_file_path = get_unsorted_release_text_file_name(assembly_release_folder, taxonomy_id, assembly_accession,
text_release_file_category)
unsorted_release_file_name = os.path.basename(unsorted_release_file_path)
# After release is run on a species, the default text (i.e., non-vcf) output files have ".unsorted.txt" file suffix
Expand Down Expand Up @@ -174,18 +174,18 @@ def merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, species_release_

def merge_dbsnp_eva_release_files(private_config_xml_file, profile, bgzip_path, bcftools_path, vcf_sort_script_path,
taxonomy_id, assembly_accession, release_species_inventory_table, release_version,
species_release_folder):
assembly_release_folder):
with get_metadata_connection_handle(profile, private_config_xml_file) as metadata_connection_handle:
release_info = get_release_inventory_info_for_assembly(taxonomy_id, assembly_accession,
release_species_inventory_table,
release_version, metadata_connection_handle)
merge_commands = []
for vcf_file_category in release_vcf_file_categories:
merge_commands.extend(merge_dbsnp_eva_vcf_files(bgzip_path, bcftools_path, vcf_sort_script_path,
taxonomy_id, assembly_accession, species_release_folder,
taxonomy_id, assembly_accession, assembly_release_folder,
vcf_file_category, release_info["sources"]))
for text_release_file_category in release_text_file_categories:
merge_commands.extend(merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, species_release_folder,
merge_commands.extend(merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, assembly_release_folder,
text_release_file_category, release_info["sources"]))
final_merge_command = " && ".join(merge_commands)
run_command_with_output(f"Merging dbSNP and EVA release files for taxonomy {taxonomy_id} and assembly {assembly_accession}",
Expand All @@ -202,14 +202,14 @@ def merge_dbsnp_eva_release_files(private_config_xml_file, profile, bgzip_path,
@click.option("--release-species-inventory-table", default="eva_progress_tracker.clustering_release_tracker",
required=False)
@click.option("--release-version", help="ex: 2", type=int, required=True)
@click.option("--species-release-folder", required=True)
@click.option("--assembly-release-folder", required=True)
@click.command()
def main(private_config_xml_file, profile, bgzip_path, bcftools_path, vcf_sort_script_path, taxonomy_id,
assembly_accession, release_species_inventory_table, release_version, species_release_folder):
assembly_accession, release_species_inventory_table, release_version, assembly_release_folder):
logging_config.add_stdout_handler()
merge_dbsnp_eva_release_files(private_config_xml_file, profile, bgzip_path, bcftools_path, vcf_sort_script_path,
taxonomy_id, assembly_accession, release_species_inventory_table, release_version,
species_release_folder)
assembly_release_folder)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,32 +56,31 @@ def get_bgzip_bcftools_index_commands_for_file(bgzip_path, bcftools_path, file):
return commands


def get_release_vcf_file_name(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category):
    """Return the path of the release VCF file for one assembly and category.

    The file is located directly in ``assembly_release_folder`` and is named
    ``<taxonomy_id>_<assembly_accession>_<vcf_file_category>.vcf``.

    :param assembly_release_folder: folder holding the release files of the assembly
    :param taxonomy_id: taxonomy the release belongs to (ex: 9913)
    :param assembly_accession: accession of the assembly (ex: GCA_000003055.6)
    :param vcf_file_category: category of the release file (ex: merged_ids)
    :return: path of the VCF file
    """
    vcf_file_name = "{0}_{1}_{2}.vcf".format(taxonomy_id, assembly_accession, vcf_file_category)
    return os.path.join(assembly_release_folder, vcf_file_name)


def get_release_vcf_file_name_genbank(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category):
    """Return the path of the ``_with_genbank`` release VCF file.

    The file is located directly in ``assembly_release_folder`` and is named
    ``<taxonomy_id>_<assembly_accession>_<vcf_file_category>_with_genbank.vcf``.

    :param assembly_release_folder: folder holding the release files of the assembly
    :param taxonomy_id: taxonomy the release belongs to (ex: 9913)
    :param assembly_accession: accession of the assembly (ex: GCA_000003055.6)
    :param vcf_file_category: category of the release file (ex: merged_ids)
    :return: path of the VCF file
    """
    vcf_file_name = "{0}_{1}_{2}_with_genbank.vcf".format(taxonomy_id, assembly_accession, vcf_file_category)
    return os.path.join(assembly_release_folder, vcf_file_name)


def get_unsorted_release_vcf_file_name(assembly_release_folder, taxonomy_id, assembly_accession, vcf_file_category):
    """Return the path of the unsorted counterpart of a release VCF file.

    Same location as the path built by ``get_release_vcf_file_name``, with
    ``_unsorted`` inserted before the file's final extension.

    :param assembly_release_folder: folder holding the release files of the assembly
    :param taxonomy_id: taxonomy the release belongs to (ex: 9913)
    :param assembly_accession: accession of the assembly (ex: GCA_000003055.6)
    :param vcf_file_category: category of the release file (ex: merged_ids)
    :return: path of the unsorted VCF file
    """
    vcf_file_path = get_release_vcf_file_name(assembly_release_folder, taxonomy_id, assembly_accession,
                                              vcf_file_category)
    filename = os.path.basename(vcf_file_path)
    # Rewrite only the trailing file name and only its final extension, so a
    # folder or category that happens to contain the same text is left untouched.
    folder_prefix = vcf_file_path[:len(vcf_file_path) - len(filename)]
    stem, extension = os.path.splitext(filename)
    return folder_prefix + stem + "_unsorted" + extension


def get_release_text_file_name(assembly_release_folder, taxonomy_id, assembly_accession, release_text_file_category):
    """Return the path of the release text file for one assembly and category.

    The file is located directly in ``assembly_release_folder`` and is named
    ``<taxonomy_id>_<assembly_accession>_<release_text_file_category>.txt``.

    :param assembly_release_folder: folder holding the release files of the assembly
    :param taxonomy_id: taxonomy the release belongs to (ex: 9913)
    :param assembly_accession: accession of the assembly (ex: GCA_000003055.6)
    :param release_text_file_category: category of the text release file
    :return: path of the text file
    """
    text_file_name = "{0}_{1}_{2}.txt".format(taxonomy_id, assembly_accession, release_text_file_category)
    return os.path.join(assembly_release_folder, text_file_name)


def get_unsorted_release_text_file_name(assembly_release_folder, taxonomy_id, assembly_accession, release_text_file_category):
    """Return the path of the unsorted counterpart of a release text file.

    Same location as the path built by ``get_release_text_file_name``, with
    ``.unsorted`` inserted before the file's final extension.

    :param assembly_release_folder: folder holding the release files of the assembly
    :param taxonomy_id: taxonomy the release belongs to (ex: 9913)
    :param assembly_accession: accession of the assembly (ex: GCA_000003055.6)
    :param release_text_file_category: category of the text release file
    :return: path of the unsorted text file
    """
    release_text_file_path = get_release_text_file_name(assembly_release_folder, taxonomy_id, assembly_accession,
                                                        release_text_file_category)
    filename = os.path.basename(release_text_file_path)
    # Rewrite only the trailing file name and only its final extension, so a
    # folder or category that happens to contain the same text is left untouched.
    folder_prefix = release_text_file_path[:len(release_text_file_path) - len(filename)]
    stem, extension = os.path.splitext(filename)
    return folder_prefix + stem + ".unsorted" + extension
Expand Down
Loading

0 comments on commit 055e82a

Please sign in to comment.