pha4ge · fmaguire · Jan 3, 2025 · Dec 28, 2024 · Dec 28, 2024 · Dec 28, 2024
diff --git a/README.md b/README.md
@@ -7,8 +7,8 @@
 
 # hAMRonization 
 
-This repo contains the hAMRonization module and CLI parser tools combine the outputs of 
-18 disparate antimicrobial resistance gene detection tools into a single unified format.
+This repo contains the hAMRonization module and CLI parser tools that combine the outputs
+of 18 disparate antimicrobial resistance gene detection tools into a single unified format.
 
 This is an implementation of the [hAMRonization AMR detection specification scheme](docs/hAMRonization_specification_details.csv) which supports gene presence/absence resistance and mutational resistance (if supported by the underlying tool).
 
@@ -80,8 +80,7 @@ Tools with hAMRonizable reports:
                         report`)
     kmerresistance      hAMRonize kmerresistance's output report i.e., OUTPUT.res
     resfams             hAMRonize resfams's output report i.e., resfams.tblout
-    resfinder           hAMRonize resfinder's output report i.e.,
-                        ResFinder_results_tab.txt
+    resfinder           hAMRonize resfinder's JSON output report (use -j to produce)
     mykrobe             hAMRonize mykrobe's output report i.e., OUTPUT.json
     pointfinder         hAMRonize pointfinder's output report i.e.,
                         PointFinder_results.txt
@@ -193,7 +192,7 @@ If you want to write multiple reports to one file, this `.write` method can acce
 Currently implemented parsers and the last tool version for which they have been validated:
 
 1. [abricate](hAMRonization/AbricateIO.py): last updated for v1.0.0
-2. [amrfinderplus](hAMRonization/AmrFinderPlusIO.py): last updated for v3.12.18
+2. [amrfinderplus](hAMRonization/AmrFinderPlusIO.py): last updated for v4.0.3
 3. [amrplusplus](hAMRonization/AmrPlusPlusIO.py): last updated for c6b097a
 4. [ariba](hAMRonization/AribaIO.py): last updated for v2.14.6
 5. [csstar](hAMRonization/CSStarIO.py): last updated for v2.1.0

diff --git a/hAMRonization/AmrFinderPlusIO.py b/hAMRonization/AmrFinderPlusIO.py
@@ -18,78 +18,56 @@
 
 
 class AmrFinderPlusIterator(hAMRonizedResultIterator):
+
+    nuc_field_map = {
+        "Protein id": None,
+        "Contig id": "input_sequence_id",
+        "Start": "input_gene_start",
+        "Stop": "input_gene_stop",
+        "Strand": "strand_orientation",
+        "Element symbol": "gene_symbol",
+        "Element name": "gene_name",
+        "Scope": None,
+        "Type": None,
+        "Subtype": None,
+        "Class": "drug_class",
+        "Subclass": "antimicrobial_agent",
+        "Method": None,
+        "Target length": "input_gene_length",
+        "Reference sequence length": "reference_gene_length",
+        "% Coverage of reference": "coverage_percentage",
+        "% Identity to reference": "sequence_identity",
+        "Alignment length": None,
+        "Closest reference accession": "reference_accession",
+        "Closest reference name": None,
+        "HMM accession": None,
+        "HMM description": None,
+        "Hierarchy node": None,
+        # Fields we compute below (not in TSV)
+        "amino_acid_mutation": "amino_acid_mutation",
+        "nucleotide_mutation": "nucleotide_mutation",
+        "genetic_variation_type": "genetic_variation_type",
+    }
+
+    # AMP outputs the same column set for nuc and prot detections,
+    # with Start and Stop always in nt units; however target and
+    # reference length are reported in AA for proteins.
+    prot_field_map = nuc_field_map.copy()
+    prot_field_map.update({
+        "Target length": "input_protein_length",
+        "Reference sequence length": "reference_protein_length"
+    })
+
     def __init__(self, source, metadata):
         metadata["analysis_software_name"] = "amrfinderplus"
         metadata["reference_database_name"] = "NCBI Reference Gene Database"
         self.metadata = metadata
 
-        # check source for whether AMFP has been run in protein or nt mode
-
-        nucleotide_field_mapping = {
-            "Protein identifier": None,
-            "Contig id": "input_sequence_id",
-            "Start": "input_gene_start",
-            "Stop": "input_gene_stop",
-            "Strand": "strand_orientation",
-            "Gene symbol": "gene_symbol",
-            "Sequence name": "gene_name",
-            "Scope": None,
-            "Element type": None,
-            "Element subtype": None,
-            "Class": "drug_class",
-            "Subclass": "antimicrobial_agent",
-            "Method": None,
-            "Target length": "input_protein_length",
-            "Reference sequence length": "reference_protein_length",
-            "% Coverage of reference sequence": "coverage_percentage",
-            "% Identity to reference sequence": "sequence_identity",
-            "Alignment length": None,
-            "Accession of closest sequence": "reference_accession",
-            "Name of closest sequence": None,
-            "HMM id": None,
-            "HMM description": None,
-            "AA Mutation": "amino_acid_mutation",
-            "Nucleotide Mutation": "nucleotide_mutation",
-            "genetic_variation_type": "genetic_variation_type",
-        }
-        protein_field_mapping = {
-            "Protein identifier": "input_sequence_id",
-            "Gene symbol": "gene_symbol",
-            "Sequence name": "gene_name",
-            "Scope": None,
-            "Element": None,
-            "Element subtype": None,
-            "Class": "drug_class",
-            "Subclass": "antimicrobial_agent",
-            "Method": None,
-            "Target length": "input_protein_length",
-            "Reference sequence length": "reference_protein_length",
-            "% Coverage of reference sequence": "coverage_percentage",
-            "% Identity to reference sequence": "sequence_identity",
-            "Alignment length": None,
-            "Accession of closest sequence": "reference_accession",
-            "Name of closest sequence": None,
-            "HMM id": None,
-            "HMM description": None,
-            "AA Mutation": "amino_acid_mutation",
-            "genetic_variation_type": "genetic_variation_type",
-        }
-
-        with open(source) as fh:
-            header = next(fh).strip().split("\t")
-            try:
-                first_result = next(fh)
-                prot_id = header.index("Protein identifier") 
-                if first_result.strip().split("\t")[prot_id] == "NA":
-                    self.field_mapping = nucleotide_field_mapping
-                else:
-                    self.field_mapping = protein_field_mapping
-            except StopIteration:
-                # doesn't really matter which mapping as this error indicates
-                # this is an empty results file
-                self.field_mapping = nucleotide_field_mapping
-
-        super().__init__(source, self.field_mapping, self.metadata)
+        # We pass None for the field_map as it differs depending on
+        # whether we return a nucleotide or protein variant detection.
+        # TODO: refactor field_map out of super's constructor, and make
+        # it a parameter on super's hARMonize().
+        super().__init__(source, None, self.metadata)
 
     def parse(self, handle):
         """
@@ -98,11 +76,16 @@ def parse(self, handle):
         skipped_truncated = 0
         reader = csv.DictReader(handle, delimiter="\t")
         for result in reader:
-            # replace NA value with None for consitency
+
+            # Replace NA value with None for consistency
             for field, value in result.items():
                 if value == "NA":
                     result[field] = None
 
+            # Skip reported virulence genes
+            if result['Type'] == "VIRULENCE":
+                continue
+
             # AFP reports partial hits so to avoid misleadingly listing these
             # as present skip results with INTERNAL_STOP
             # recommended by developers
@@ -113,24 +96,40 @@ def parse(self, handle):
             # "POINT" indicates mutational resistance
             # amrfinderplus has no special fields but the mutation itself is
             # appended to the symbol name so we want to split this
-            result["AA Mutation"] = None
-            result["Nucleotide Mutation"] = None
-            result["genetic_variation_type"] = GENE_PRESENCE
+            result['amino_acid_mutation'] = None
+            result['nucleotide_mutation'] = None
+            result['genetic_variation_type'] = GENE_PRESENCE
 
-            if result["Element subtype"] == "POINT":
-                gene_symbol, mutation = result["Gene symbol"].rsplit("_", 1)
-                result["Gene symbol"] = gene_symbol
+            if result['Subtype'] == "POINT":
+                gene_symbol, mutation = result['Element symbol'].rsplit("_", 1)
+                result['Element symbol'] = gene_symbol
                 _, ref, pos, alt, _ = re.split(r"(\D+)(\d+)(\D+)", mutation)
                 # this means it is a protein mutation
-                if result["Method"] in ["POINTX", "POINTP"]:
-                    result["AA Mutation"] = f"p.{ref}{pos}{alt}"
-                    result["genetic_variation_type"] = AMINO_ACID_VARIANT
-                elif result["Method"] == "POINTN":
+                if result['Method'] in ["POINTX", "POINTP"]:
+                    result['amino_acid_mutation'] = f"p.{ref}{pos}{alt}"
+                    result['genetic_variation_type'] = AMINO_ACID_VARIANT
+                elif result['Method'] == "POINTN":
                     # e.g., 23S_G2032G ampC_C-11C -> c.2032G>G
-                    result["Nucleotide Mutation"] = f"c.{pos}{ref}>{alt}"
-                    result["genetic_variation_type"] = NUCLEOTIDE_VARIANT
+                    result['nucleotide_mutation'] = f"c.{pos}{ref}>{alt}"
+                    result['genetic_variation_type'] = NUCLEOTIDE_VARIANT
+
+            # Determine the field_map to use depending on the method used
+            # The following seems to cover all bases with a minimum of fuss
+            have_prot = result['Protein id'] is not None
+            method = result['Method']
+            if method.endswith('P') or method.endswith('X'):
+                field_map = self.prot_field_map
+            elif method.endswith('N'):
+                field_map = self.nuc_field_map
+            elif method in ['COMPLETE', 'HMM']:
+                field_map = self.prot_field_map if have_prot else self.nuc_field_map
+            else:
+                warnings.warn(f"Assuming unknown method {method} implies a protein detection"
+                              f" in {self.metadata['input_file_name']}")
+                field_map = self.prot_field_map
 
-            yield self.hAMRonize(result, self.metadata)
+            # This uses the "override hack" that should perhaps be cleaned up
+            yield self.hAMRonize(result, self.metadata, field_map)
 
         if skipped_truncated > 0:
             warnings.warn(f"Skipping {skipped_truncated} records with INTERNAL_STOP "

diff --git a/hAMRonization/Interfaces.py b/hAMRonization/Interfaces.py
@@ -50,16 +50,24 @@ def __init__(self, source, field_map, metadata):
         except Exception:
             self.stream.close()
 
-    def hAMRonize(self, report_data, metadata):
+    # TODO: the field_map_override is a half-hack to support the scenario
+    # (as for amrfinderplus) where different records need different mappings,
+    # so setting a field_map in the constructor makes no sense.
+    # It might be cleaner to remove it from the constructor altogether and
+    # make it a parameter of this method (which is the only place where it
+    # is referenced anyway), and subclasses can trivially pass it in.
+    def hAMRonize(self, report_data, metadata, field_map_override=None):
         """
         Convert a line of parsed AMR report in original format to the
         hAMRonization specification
-        - report_result parsed dict of single results from report
-        - metadata dict of additional metadata fields that need added
+        - report_data parsed dict of single result from report
+        - metadata dict of additional metadata fields
+        - field_map_override optional override of field_map passed in c'tor
         """
         hAMRonized_result_data = {**metadata}
 
-        for original_field, hAMRonized_field in self.field_map.items():
+        field_map = field_map_override or self.field_map
+        for original_field, hAMRonized_field in field_map.items():
             if hAMRonized_field:
                 hAMRonized_result_data[hAMRonized_field] = report_data[original_field]
 

diff --git a/hAMRonization/ResFinderIO.py b/hAMRonization/ResFinderIO.py
@@ -63,11 +63,13 @@ def set_shared_fields(r):
             res.gene_symbol = r.get('name', "unspecified")
             res.gene_name = r.get('name', "unspecified")
             res.reference_accession = r.get('ref_acc', r.get('ref_id', r.get('key', "unknown")))
+            res.reference_database_name = _get_db_name(r.get('ref_database'))
+            res.reference_database_version = _get_db_ver(r.get('ref_database'))
 
             # optional
             res.coverage_percentage = _safe_round(r.get('coverage'), 1)
             res.coverage_depth = None  # we may have this for mutations detected from reads
-            res.coverage_ratio = r.get('coverage')/100.0
+            res.coverage_ratio = None
             res.input_sequence_id = r.get('query_id')
             res.input_gene_length = _get_length(r.get('query_start_pos'), r.get('query_end_pos'))
             res.input_gene_start = _get_start_pos(r.get('query_start_pos'), r.get('query_end_pos'))
@@ -118,9 +120,9 @@ def set_variation_fields(r, vs):
                     _codon.append(v.get('codon_change'))
 
                 # Add the content of the list fields to the bags above
-                fold(lambda s, e: s.add(e), _phenos, v.get('phenotypes', []))
-                fold(lambda s, e: s.add(e), _notes, v.get('notes', []))
-                fold(lambda s, e: s.add(e), _pmids, v.get('pmids', []))
+                _phenos.update(v.get('phenotypes', []))
+                _notes.update(v.get('notes', []))
+                _pmids.update(v.get('pmids', []))
 
             # We have collected all variations on region r, now collapse into fields on res
             res.predicted_phenotype = _empty_to_none(", ".join(filter(None, _phenos)))
@@ -145,11 +147,9 @@ def set_variation_fields(r, vs):
         #     - for each r report one AMINO_ACID_VARIANT record, collapsing the seq_variations
         for p in filter(lambda d: d.get('amr_resistant', False), data['phenotypes'].values()):
 
-            # Set the fields available on phenotype object
+            # Set the fields available on the phenotype object
             res.drug_class = ", ".join(p.get('amr_classes', []))
             res.antimicrobial_agent = p.get('amr_resistance', "unspecified")
-            res.reference_database_name = _get_db_name(p.get('ref_database'))
-            res.reference_database_version = _get_db_ver(p.get('ref_database'))
 
             # Iterate r over the regions (AMR genes) referenced by p, and yield each in turn
             for r in map(lambda k: data['seq_regions'][k], p.get('seq_regions', [])):

diff --git a/hAMRonization/hAMRonizedResult.py b/hAMRonization/hAMRonizedResult.py
@@ -77,7 +77,7 @@ def __post_init__(self):
         input_file_name = getattr(self, "input_file_name")
         input_file_name = os.path.basename(input_file_name)
 
-        for suffix in [ ".gz", ".fna", ".fasta", ".fsa", ".faa", ".fa" ]:
+        for suffix in [".gz", ".fna", ".fasta", ".fsa", ".faa", ".fa"]:
             input_file_name = input_file_name.removesuffix(suffix)
 
         setattr(self, "input_file_name", input_file_name)
diff --git a/test/data/dummy/amrfinderplus/report.tsv b/test/data/dummy/amrfinderplus/report.tsv
@@ -1,2 +1,2 @@
-Protein identifier	Contig id	Start	Stop	Strand	Gene symbol	Sequence name	Scope	Element type	Element subtype	Class	Subclass	Method	Target length	Reference sequence length	% Coverage of reference sequence	% Identity to reference sequence	Alignment length	Accession of closest sequence	Name of closest sequence	HMM id	HMM description
-NA	NZ_LR792628.1	1333611	1334783	-	oqxA	multidrug efflux RND transporter periplasmic adaptor subunit OqxA	core	AMR	AMR	PHENICOL/QUINOLONE	PHENICOL/QUINOLONE	BLASTX	391	391	100	99.49	391	WP_002914189.1	multidrug efflux RND transporter periplasmic adaptor subunit OqxA	NF000272.1	multidrug efflux RND transporter periplasmic adaptor subunit OqxA
+Protein id	Contig id	Start	Stop	Strand	Element symbol	Element name	Scope	Type	Subtype	Class	Subclass	Method	Target length	Reference sequence length	% Coverage of reference	% Identity to reference	Alignment length	Closest reference accession	Closest reference name	HMM accession	HMM description
+NA	NZ_LR792628.1	1333611	1334783	-	oqxA	multidrug efflux RND transporter periplasmic adaptor subunit OqxA	core	AMR	AMR	PHENICOL/QUINOLONE	PHENICOL/QUINOLONE	BLASTX	391	391	100.00	99.49	391	WP_002914189.1	multidrug efflux RND transporter periplasmic adaptor subunit OqxA	NA	NA
diff --git a/test/data/raw_outputs/amrfinderplus/afp_non_coding.tsv b/test/data/raw_outputs/amrfinderplus/afp_non_coding.tsv
@@ -1,2 +1,2 @@
-Name	Protein identifier	Contig id	Start	Stop	Strand	Gene symbol	Sequence name	Scope	Element type	Element subtype	Class	Subclass	Method	Target length	Reference sequence length	% Coverage of reference sequence	% Identity to reference sequence	Alignment length	Accession of closest sequence	Name of closest sequence	HMM id	HMM description
-DAWXTK010000082_noncoding_test	NA	DAWXTK010000082.1:68-2970	1	2903	+	23S_A2062G	Neisseria gonorrhoeae azithromycin resistant 23S	core	AMR	POINT	MACROLIDE	AZITHROMYCIN	POINTN	2903	2910	100.00	99.35	2910	NC_002946.2:1119158-1116249	23S ribosomal RNA	NA	NA
+Protein id	Contig id	Start	Stop	Strand	Element symbol	Element name	Scope	Type	Subtype	Class	Subclass	Method	Target length	Reference sequence length	% Coverage of reference	% Identity to reference	Alignment length	Closest reference accession	Closest reference name	HMM accession	HMM description
+NA	DAWXTK010000082.1:68-2970	1	2903	+	23S_A2059G	Neisseria gonorrhoeae azithromycin resistant 23S	core	AMR	POINT	MACROLIDE	AZITHROMYCIN	POINTN	2903	2910	100.00	99.35	2910	NC_002946.2:1119158-1116249	23S ribosomal RNA	NA	NA
diff --git a/test/data/raw_outputs/amrfinderplus/empty_report_with_header.tsv b/test/data/raw_outputs/amrfinderplus/empty_report_with_header.tsv
@@ -1 +1 @@
-Protein identifier	Contig id	Start	Stop	Strand	Gene symbol	Sequence name	Scope	Element type	Element subtype	Class	Subclass	Method	Target length	Reference sequence length	% Coverage of reference sequence	% Identity to reference sequence	Alignment length	Accession of closest sequence	Name of closest sequence	HMM id	HMM description
+Protein id	Contig id	Start	Stop	Strand	Element symbol	Element name	Scope	Type	Subtype	Class	Subclass	Method	Target length	Reference sequence length	% Coverage of reference	% Identity to reference	Alignment length	Closest reference accession	Closest reference name	HMM accession	HMM description	Hierarchy node
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description
		Protein id Contig id Start Stop Strand Element symbol Element name Scope Type Subtype Class Subclass Method Target length Reference sequence length % Coverage of reference % Identity to reference Alignment length Closest reference accession Closest reference name HMM accession HMM description Hierarchy node