Skip to content

Commit

Permalink
updated parsing of resfinder and amrfinder
Browse files Browse the repository at this point in the history
  • Loading branch information
mhkc committed Dec 5, 2023
1 parent 92f9c5c commit ad7657b
Show file tree
Hide file tree
Showing 9 changed files with 207 additions and 156 deletions.
41 changes: 24 additions & 17 deletions prp/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pydantic import ValidationError

from .models.metadata import SoupVersion, SoupType
from .models.phenotype import ElementType
from .models.phenotype import ElementType, ElementStressSubtype
from .models.qc import QcMethodIndex
from .models.sample import MethodIndex, PipelineResult
from .models.typing import TypingMethod
Expand Down Expand Up @@ -63,12 +63,18 @@ def cli():
"-k", "--kraken", type=click.File(), help="Kraken species annotation results"
)
@click.option(
"-a", "--amrfinder", type=str, help="amrfinderplus anti-microbial resistance results"
"-a",
"--amrfinder",
type=str,
help="amrfinderplus anti-microbial resistance results",
)
@click.option("-m", "--mlst", type=click.File(), help="MLST prediction results")
@click.option("-c", "--cgmlst", type=click.File(), help="cgMLST prediction results")
@click.option(
"-v", "--virulence", type=click.File(), help="Virulence factor prediction results"
"-v",
"--virulencefinder",
type=click.File(),
help="Virulence factor prediction results",
)
@click.option(
"-r",
Expand All @@ -89,7 +95,7 @@ def create_output(
kraken,
mlst,
cgmlst,
virulence,
virulencefinder,
amrfinder,
resfinder,
quality,
Expand All @@ -102,7 +108,7 @@ def create_output(
LOG.info("Start generating pipeline result json")
results = {
"run_metadata": {
"run": parse_run_info(run_metadata),
"run": parse_run_info(run_metadata),
"databases": get_database_info(process_metadata),
},
"qc": [],
Expand Down Expand Up @@ -133,7 +139,10 @@ def create_output(
if resfinder:
LOG.info("Parse resistance results")
pred_res = json.load(resfinder)
methods = [ElementType.AMR, ElementType.BIOCIDE, ElementType.HEAT]
methods = [
ElementType.AMR,
ElementType.STRESS,
]
for method in methods:
res: MethodIndex = parse_resfinder_amr_pred(pred_res, method)
# exclude empty results from output
Expand All @@ -145,9 +154,7 @@ def create_output(
LOG.info("Parse amr results")
methods = [
ElementType.AMR,
ElementType.BIOCIDE,
ElementType.METAL,
ElementType.HEAT,
ElementType.STRESS,
]
for method in methods:
res: MethodIndex = parse_amrfinder_amr_pred(amrfinder, method)
Expand All @@ -156,10 +163,11 @@ def create_output(
results["element_type_result"].append(vir)

# get virulence factors in sample
if virulence:
LOG.info("Parse virulence results")
vir: MethodIndex = parse_virulencefinder_vir_pred(virulence)
results["element_type_result"].append(vir)
if virulencefinder:
LOG.info("Parse virulencefinder results")
vir: MethodIndex | None = parse_virulencefinder_vir_pred(virulencefinder)
if vir is not None:
results["element_type_result"].append(vir)

# species id
if kraken:
Expand All @@ -175,7 +183,7 @@ def create_output(
pred_res = json.load(mykrobe)
results["run_metadata"]["databases"].append(
SoupVersion(
name="mykrobe-predictor",
name="mykrobe-predictor",
version=pred_res[sample_id]["version"]["mykrobe-predictor"],
type=SoupType.DB,
)
Expand Down Expand Up @@ -211,9 +219,8 @@ def create_output(

try:
output_data = PipelineResult(
sample_id=sample_id,
schema_version=OUTPUT_SCHEMA_VERSION,
**results)
sample_id=sample_id, schema_version=OUTPUT_SCHEMA_VERSION, **results
)
except ValidationError as err:
click.secho("Input failed Validation", fg="red")
click.secho(err)
Expand Down
6 changes: 3 additions & 3 deletions prp/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ class RWModel(BaseModel): # pylint: disable=too-few-public-methods
"""Base model for read/ write operations"""

model_config = ConfigDict(
allow_population_by_alias = True,
populate_by_name = True,
use_enum_values = True,
allow_population_by_alias=True,
populate_by_name=True,
use_enum_values=True,
)
10 changes: 8 additions & 2 deletions prp/models/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,14 @@ class RunInformation(RWModel):
pipeline: str
version: str
commit: str
analysis_profile: str = Field(..., alias="analysisProfile")
configuration_files: List[str] = Field(..., alias="configurationFiles")
analysis_profile: str = Field(
...,
alias="analysisProfile",
description="The analysis profile used when starting the pipeline",
)
configuration_files: List[str] = Field(
..., alias="configurationFiles", description="Nextflow configuration used"
)
workflow_name: str
sample_name: str
sequencing_platform: str
Expand Down
156 changes: 102 additions & 54 deletions prp/models/phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,18 @@
from enum import Enum
from typing import Dict, List, Optional, Union

from pydantic import BaseModel
from pydantic import BaseModel, Field

from .base import RWModel


class SequenceStand(Enum):
    """Definition of DNA strand.

    Each member's value is the single-character symbol for that
    orientation: "+" for forward and "-" for reverse.

    NOTE(review): the class name looks like a typo of "SequenceStrand";
    it is used as the type of the ``strand`` fields, so a rename would
    have to update those annotations as well.
    """

    FORWARD = "+"  # forward orientation
    REVERSE = "-"  # reverse orientation


class PredictionSoftware(Enum):
"""Container for prediction software names."""

Expand All @@ -29,89 +36,130 @@ class ElementType(Enum):
"""Categories of resistance and virulence genes."""

AMR = "AMR"
ACID = "STRESS_ACID"
BIOCIDE = "STRESS_BIOCIDE"
METAL = "STRESS_METAL"
HEAT = "STRESS_HEAT"
STRESS = "STRESS"
VIR = "VIRULENCE"


class ElementStressSubtype(Enum):
    """Subtypes of stress-related elements.

    NOTE(review): presumably these refine elements whose element type is
    ``ElementType.STRESS`` -- confirm against the result parsers.
    """

    ACID = "ACID"
    BIOCIDE = "BIOCIDE"
    METAL = "METAL"
    HEAT = "HEAT"


class ElementAmrSubtype(Enum):
    """Subtypes of AMR elements.

    Only a single subtype, ``AMR``, is defined.
    """

    AMR = "AMR"


class ElementVirulenceSubtype(Enum):
    """Subtypes of virulence elements.

    Only a single subtype, ``VIR``, is defined; its value is "VIRULENCE".
    """

    VIR = "VIRULENCE"

class DatabaseReference(RWModel):
"""Reference to a database."""

ref_database: Optional[str]
ref_id: Optional[str]
ref_database: Optional[str] = None
ref_id: Optional[str] = None


class GeneBase(BaseModel):
"""Container for gene information"""

accession: Optional[str]
accession: Optional[str] = None
# prediction info
depth: Optional[float]
identity: Optional[float]
coverage: Optional[float]
ref_start_pos: Optional[int]
ref_end_pos: Optional[int]
ref_gene_length: Optional[int]
depth: Optional[float] = None
identity: Optional[float] = None
coverage: Optional[float] = None
ref_start_pos: Optional[int] = None
ref_end_pos: Optional[int] = None
ref_gene_length: Optional[int] = Field(
default=None,
alias="target_length",
description="The length of the query protein or gene.",
)
alignment_length: Optional[int]
# amrfinder extra info
contig_id: Optional[str]
gene_symbol: Optional[str]
sequence_name: Optional[str]
ass_start_pos: Optional[int]
ass_end_pos: Optional[int]
strand: Optional[str]
element_type: Optional[str]
element_subtype: Optional[str]
target_length: Optional[int]
res_class: Optional[str]
res_subclass: Optional[str]
method: Optional[str]
close_seq_name: Optional[str]
contig_id: Optional[str] = None
gene_symbol: Optional[str] = None
sequence_name: Optional[str] = Field(
default=None, description="Reference sequence name"
)
ass_start_pos: Optional[int] = Field(
default=None, description="Start position on the assembly"
)
ass_end_pos: Optional[int] = Field(
default=None, description="End position on the assembly"
)
strand: Optional[SequenceStand] = None
element_type: ElementType = Field(
description="The predominant function fo the gene."
)
element_subtype: ElementStressSubtype | ElementAmrSubtype | ElementVirulenceSubtype = Field(
description="Further functional categorization of the genes."
)
res_class: Optional[str] = None
res_subclass: Optional[str] = None
method: Optional[str] = Field(
default=None, description="Generic description of the prediction method"
)
close_seq_name: Optional[str] = Field(
default=None,
description="Name of the closest competing hit if there are multiple equaly good hits",
)


class ResistanceGene(GeneBase, DatabaseReference):
"""Container for resistance gene information"""

phenotypes: List[str]
phenotypes: List[str] = []


class VirulenceGene(GeneBase, DatabaseReference):
"""Container for virulence gene information"""

virulence_category: Optional[str]
virulence_category: Optional[str] = None


class VariantBase(DatabaseReference):
"""Container for mutation information"""

variant_type: Optional[VariantType]
genes: Optional[List[str]]
position: Optional[int]
ref_nt: Optional[str]
alt_nt: Optional[str]
variant_type: VariantType
genes: List[str]
position: int
ref_nt: str
alt_nt: str
# prediction info
depth: Optional[float]
contig_id: Optional[str]
gene_symbol: Optional[str]
sequence_name: Optional[str]
ass_start_pos: Optional[int]
ass_end_pos: Optional[int]
strand: Optional[str]
element_type: Optional[str]
element_subtype: Optional[str]
target_length: Optional[int]
res_class: Optional[str]
res_subclass: Optional[str]
method: Optional[str]
close_seq_name: Optional[str]
type: Optional[str]
change: Optional[str]
nucleotide_change: Optional[str]
protein_change: Optional[str]
annotation: Optional[List[Dict]]
drugs: Optional[List[Dict]]
depth: Optional[float] = None
contig_id: Optional[str] = None
gene_symbol: Optional[str] = None
sequence_name: Optional[str] = Field(
default=None, description="Reference sequence name"
)
ass_start_pos: Optional[int] = Field(
default=None, description="Assembly start position"
)
ass_end_pos: Optional[int] = Field(
default=None, description="Assembly end position"
)
strand: Optional[SequenceStand] = None
element_type: Optional[ElementType] = None
element_subtype: Optional[str] = None
target_length: Optional[int] = None
res_class: Optional[str] = None
res_subclass: Optional[str] = None
method: Optional[str] = None
close_seq_name: Optional[str] = None
type: Optional[str] = None
change: Optional[str] = None
nucleotide_change: Optional[str] = None
protein_change: Optional[str] = None
annotation: Optional[List[Dict]] = None
drugs: Optional[List[Dict]] = None


class ResistanceVariant(VariantBase):
Expand Down
6 changes: 3 additions & 3 deletions prp/models/qc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""QC data models."""
from enum import Enum
from typing import Dict, Union
from typing import Dict

from pydantic import BaseModel

Expand Down Expand Up @@ -50,5 +50,5 @@ class QcMethodIndex(RWModel):
"""

software: QcSoftware
version: Union[str, None]
result: Union[QuastQcResult, PostAlignQcResult]
version: str | None = None
result: QuastQcResult | PostAlignQcResult
3 changes: 2 additions & 1 deletion prp/parse/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

LOG = logging.getLogger(__name__)


def get_database_info(process_metadata: List[TextIO]) -> List[SoupVersion]:
"""Get database or software information.
Expand Down Expand Up @@ -35,4 +36,4 @@ def parse_run_info(run_metadata: TextIO) -> RunInformation:
"""
LOG.info("Parse run metadata.")
run_info = RunInformation(**json.load(run_metadata))
return run_info
return run_info
Loading

0 comments on commit ad7657b

Please sign in to comment.