From 6ba267faf5e7020c12ef2506822e7a171e573c9f Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Wed, 26 Jun 2024 10:28:22 -0700 Subject: [PATCH] Accession based score set tweaks --- pyproject.toml | 2 +- src/dcd_mapping/main.py | 5 ++++- src/dcd_mapping/transcripts.py | 6 +++++- src/dcd_mapping/vrs_map.py | 6 ++++++ 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1e62573..aaf5390 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "biopython", "tqdm", "click", + "cdot", "cool-seq-tool>=0.4.0.dev1", "ga4gh.vrs~=2.0.0-a6", "gene_normalizer[etl,pg]==0.3.0-dev2", @@ -43,7 +44,6 @@ dependencies = [ "python-dotenv", "setuptools>=68.0", # tmp -- ensure 3.12 compatibility "mavehgvs==0.6.1", - "cdot", ] dynamic = ["version"] diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 3ddc08b..db8e75c 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -159,7 +159,10 @@ async def map_scoreset( _emit_info("Selecting reference sequence...", silent) try: - transcript = await select_transcript(metadata, records, alignment_result) + if metadata.target_accession is None: + transcript = await select_transcript(metadata, records, alignment_result) + else: + transcript = None except TxSelectError as e: _emit_info( f"Transcript selection failed for scoreset {metadata.urn}", diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py index 56bea85..5e6579a 100644 --- a/src/dcd_mapping/transcripts.py +++ b/src/dcd_mapping/transcripts.py @@ -311,7 +311,7 @@ async def select_transcript( metadata: ScoresetMetadata, records: list[ScoreRow], align_result: AlignmentResult, -) -> TxSelectResult | None: +) -> TxSelectResult | str | None: """Select appropriate human reference sequence for scoreset. * Unnecessary for regulatory/other noncoding scoresets which report genomic @@ -336,10 +336,14 @@ async def select_transcript( sequence=_get_protein_sequence(metadata.target_sequence), ) + if metadata.target_accession: + return metadata.target_accession + if metadata.target_gene_category != TargetType.PROTEIN_CODING: _logger.debug("%s is regulatory/noncoding -- skipping transcript selection") return None transcript_reference = await _select_protein_reference(metadata, align_result) + if transcript_reference and metadata.target_sequence_type == TargetSequenceType.DNA: offset = _offset_target_sequence(metadata, records) if offset: diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index c840846..185eb29 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -599,6 +599,9 @@ def _map_accession( # see if accession is in seqrepo # if not, get seq from cdot and add to seqrepo sequence_id = metadata.target_accession + if sequence_id is None: + raise ValueError + store_accession(sequence_id) for row in records: @@ -721,6 +724,9 @@ def vrs_map( _logger.warning(msg) return None + if metadata.target_accession: + return _map_accession(metadata, records, align_result) + if metadata.target_gene_category == TargetType.PROTEIN_CODING and transcript: return _map_protein_coding( metadata,