Skip to content

Commit

Permalink
Accession based score set tweaks
Browse files (browse the repository at this point in the history)
  • Loading branch information
bencap committed Jun 26, 2024
1 parent 721c8c2 commit 6ba267f
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number | Diff line number | Diff line change
Expand Up @@ -36,14 +36,14 @@ dependencies = [
"biopython",
"tqdm",
"click",
"cdot",
"cool-seq-tool>=0.4.0.dev1",
"ga4gh.vrs~=2.0.0-a6",
"gene_normalizer[etl,pg]==0.3.0-dev2",
"pydantic>=2",
"python-dotenv",
"setuptools>=68.0", # tmp -- ensure 3.12 compatibility
"mavehgvs==0.6.1",
"cdot",
]
dynamic = ["version"]

Expand Down
5 changes: 4 additions & 1 deletion src/dcd_mapping/main.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -159,7 +159,10 @@ async def map_scoreset(

_emit_info("Selecting reference sequence...", silent)
try:
transcript = await select_transcript(metadata, records, alignment_result)
if metadata.target_accession is None:
transcript = await select_transcript(metadata, records, alignment_result)
else:
transcript = None
except TxSelectError as e:
_emit_info(
f"Transcript selection failed for scoreset {metadata.urn}",
Expand Down
6 changes: 5 additions & 1 deletion src/dcd_mapping/transcripts.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -311,7 +311,7 @@ async def select_transcript(
metadata: ScoresetMetadata,
records: list[ScoreRow],
align_result: AlignmentResult,
) -> TxSelectResult | None:
) -> TxSelectResult | str | None:
"""Select appropriate human reference sequence for scoreset.
* Unnecessary for regulatory/other noncoding scoresets which report genomic
Expand All @@ -336,10 +336,14 @@ async def select_transcript(
sequence=_get_protein_sequence(metadata.target_sequence),
)

if metadata.target_accession:
return metadata.target_accession

if metadata.target_gene_category != TargetType.PROTEIN_CODING:
_logger.debug("%s is regulatory/noncoding -- skipping transcript selection")
return None
transcript_reference = await _select_protein_reference(metadata, align_result)

if transcript_reference and metadata.target_sequence_type == TargetSequenceType.DNA:
offset = _offset_target_sequence(metadata, records)
if offset:
Expand Down
6 changes: 6 additions & 0 deletions src/dcd_mapping/vrs_map.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -599,6 +599,9 @@ def _map_accession(
# see if accession is in seqrepo
# if not, get seq from cdot and add to seqrepo
sequence_id = metadata.target_accession
if sequence_id is None:
raise ValueError

store_accession(sequence_id)

for row in records:
Expand Down Expand Up @@ -721,6 +724,9 @@ def vrs_map(
_logger.warning(msg)
return None

if metadata.target_accession:
return _map_accession(metadata, records, align_result)

if metadata.target_gene_category == TargetType.PROTEIN_CODING and transcript:
return _map_protein_coding(
metadata,
Expand Down

0 comments on commit 6ba267f

Please sign in to comment.