diff --git a/CHANGELOG.md b/CHANGELOG.md index adf1d1c..1ef083a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,9 @@ - resfinder_curation: grdA_1_QJX10702 -> 3007380 & EstDL136_1_JN242251 -> 3000557 - megares_curation: MEG_2865|Drugs|Phenicol|Chloramphenicol_hydrolase|ESTD -> 3000557 +### Handle AROs as string rather than int in get_aro_mapping_table() +AROs were previously handled as 'int' in the get_aro_mapping_table() function, and this posed challenges for ARO numbers such as 'ARO:0010004' (leading zeros were cut). To fix this, AROs are now treated as strings so that leading zeros are preserved. + ## 0.4.0 - 10 June - Bundle a specific version of ARO with the package instead of downloading it from the internet (ensures reproducibility) diff --git a/argnorm/lib.py b/argnorm/lib.py index 747ffed..88a8011 100644 --- a/argnorm/lib.py +++ b/argnorm/lib.py @@ -42,18 +42,18 @@ def get_aro_mapping_table(database): aro_mapping_table = pd.read_csv( os.path.join(_ROOT, 'data', f'{database}_ARO_mapping.tsv'), - sep='\t') + sep='\t', dtype={'ARO': str}) aro_mapping_table.drop_duplicates(subset=['Original ID'], inplace=True) aro_mapping_table.set_index('Original ID', inplace=True) manual_curation = pd.read_csv( os.path.join(_ROOT, 'data/manual_curation', f'{database}_curation.tsv'), - sep='\t', index_col=0) + sep='\t', index_col=0, dtype={'ARO': str}) manual_curation['Database'] = aro_mapping_table['Database'].iloc[0] aro_mapping_table.drop(index=set(manual_curation.index) & set(aro_mapping_table.index), inplace=True) aro_mapping_table = pd.concat([aro_mapping_table, manual_curation]) - aro_mapping_table['ARO'] = aro_mapping_table['ARO'].map(lambda a: f'ARO:{int(a)}', na_action='ignore') + aro_mapping_table['ARO'] = aro_mapping_table['ARO'].map(lambda a: f'ARO:{a}', na_action='ignore') return aro_mapping_table def map_to_aro(gene, database): diff --git a/tests/test_lib.py b/tests/test_lib.py index 893b96b..8422523 100644 --- a/tests/test_lib.py +++ 
b/tests/test_lib.py @@ -9,7 +9,8 @@ def test_map_to_aro(): ["1028085756|WP_063844287.1|1|1|cpt|cpt|phosphotransferase|2|CHLORAMPHENICOL|PHENICOL|chloramphenicol_phosphotransferase_CPT", 'ncbi'], ["gb|AAG57600.1|ARO:3000318|mphB", "sarg"], ["(Phe)cpt_strepv:U09991:AAB36569:1412-1948:537", "argannot"], - ["MEG_4060|Metals|Multi-metal_resistance|Multi-metal_resistance_protein|MREA", "megares"] + ["MEG_4060|Metals|Multi-metal_resistance|Multi-metal_resistance_protein|MREA", "megares"], + ["gi:447201629:ref:WP_001278885.1:|FEATURES|cob(I)alamin_adenolsyltransferase|unclassified|cob(I)alamin_adenolsyltransferase", "deeparg"] ] ARO = lib.get_aro_ontology() @@ -19,7 +20,8 @@ def test_map_to_aro(): ARO.get_term('ARO:3000249'), ARO.get_term('ARO:3000318'), ARO.get_term('ARO:3000249'), - None + None, + ARO.get_term('ARO:0010004') ] for t, e in zip(test_cases, expected_output):