diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..c409646 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,32 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.11" + # You can also specify other tool versions: + # nodejs: "19" + # rust: "1.64" + # golang: "1.19" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/conf.py + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: requirements.txt \ No newline at end of file diff --git a/README.md b/README.md index d2ed2a4..08a52ee 100644 --- a/README.md +++ b/README.md @@ -13,13 +13,14 @@ pip install text2term import text2term import pandas -df1 = text2term.map_file("test/unstruct_terms.txt", "http://www.ebi.ac.uk/efo/efo.owl") +df1 = text2term.map_terms("test/unstruct_terms.txt", "http://www.ebi.ac.uk/efo/efo.owl") df2 = text2term.map_terms(["asthma", "acute bronchitis"], "http://www.ebi.ac.uk/efo/efo.owl") +df3 = text2term.map_terms({"asthma":"disease", "acute bronchitis":["disease", "lungs"]}, "http://www.ebi.ac.uk/efo/efo.owl") ``` Below is an example of caching, assuming the same imports as above: ```python text2term.cache_ontology("http://www.ebi.ac.uk/efo/efo.owl", "EFO") -df1 = text2term.map_file("test/unstruct_terms.txt", "EFO", use_cache=True) +df1 = text2term.map_terms("test/unstruct_terms.txt", "EFO", use_cache=True) df2 = text2term.map_terms(["asthma", "acute bronchitis"], "EFO", use_cache=True) text2term.clear_cache("EFO") ``` @@ -48,10 +49,10 
@@ Then, after running this, the following command is equivalent: `python text2term -s test/unstruct_terms.txt -t EFO` ## Programmatic Usage -The tool can be executed in Python with any of the three following functions: +The tool can be executed in Python with the `map_terms` function: ```python -text2term.map_file(input_file='/some/file.txt', +text2term.map_terms(source_terms, target_ontology='http://some.ontology/v1.owl', base_iris=(), csv_columns=(), @@ -64,45 +65,15 @@ text2term.map_file(input_file='/some/file.txt', save_mappings=False, separator=',', use_cache=False, - term_type='classes') -``` -or -```python -text2term.map_terms(source_terms=['term one', 'term two'], - target_ontology='http://some.ontology/v1.owl', - base_iris=(), - excl_deprecated=False, - max_mappings=3, - min_score=0.3, - mapper=Mapper.TFIDF, - output_file='', - save_graphs=False, - save_mappings=False, - source_terms_ids=(), - use_cache=False, - term_type='classes') -``` -or -```python -text2term.map_tagged_terms(tagged_terms_dict={'term one': ["tag 1", "tag 2"]}, - target_ontology='http://some.ontology/v1.owl', - base_iris=(), - excl_deprecated=False, - max_mappings=3, - min_score=0.3, - mapper=Mapper.TFIDF, - output_file='', - save_graphs=False, - save_mappings=False, - source_terms_ids=(), - use_cache=False, - term_type='classes') + term_type='classes', + incl_unmapped=False) + ``` +NOTE: As of 3.0.0, the former three functions (`map_file`, `map_terms`, `map_tagged_terms`) have been condensed into one function. Users can now change the name of any function in old code to `map_terms` and it reads the input context to maintain the functionality of each one. ### Arguments -For `map_file`, the first argument 'input_file' specifies a path to a file containing the terms to be mapped. It also has a `csv_column` argument that allows the user to specify a column to map if a csv is passed in as the input file. 
-For `map_terms`, the first argument 'source_terms' takes in a list of the terms to be mapped. -For `map_tagged_terms`, everything is the same as `map_terms` except the first argument is either a dictionary of terms to a list of tags, or a list of TaggedTerm objects (see below). Currently, the tags do not affect the mapping in any way, but they are added to the output dataframe at the end of the process. +For `map_terms`, the first argument can be any of the following: 1) a string that specifies a path to a file containing the terms to be mapped, 2) a list of the terms to be mapped, or 3) a dictionary of terms to a list of tags, or a list of TaggedTerm objects (see below). +Currently, the tags do not affect the mapping in any way, but they are added to the output dataframe at the end of the process. The exception is the Ignore tag, which causes the term to not be mapped at all; the term is still included in the results if the `incl_unmapped` argument is True (see below). All other arguments are the same, and have the same functionality: @@ -115,6 +86,9 @@ All other arguments are the same, and have the same functionality: Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') +`csv_columns` : tuple + Allows the user to specify a column to map if a csv is passed in as the input file. Ignored if the input is not a file path. + `source_terms_ids` : tuple Collection of identifiers for the given source terms WARNING: While this is still available for the tagged term function, it is worth noting that dictionaries do not necessarily preserve order, so it is not recommended. If using the TaggedTerm object, the source terms can be attached there to guarantee order. 
@@ -141,12 +115,18 @@ All other arguments are the same, and have the same functionality: `save_mappings` : bool Save the generated mappings to a file (specified by `output_file`) +`separator` : str + Character that separates the source term values if a file input is given. Ignored if the input is not a file path. + `use_cache` : bool Use the cache for the ontology. More details are below. `term_type` : str Determines whether the ontology should be parsed for its classes (ThingClass), properties (PropertyClass), or both. Possible values are ['classes', 'properties', 'both']. If it does not match one of these values, the program will throw a ValueError. +`incl_unmapped` : bool + Include all unmapped terms in the output. If something has been tagged Ignore (see below) or falls below the `min_score` threshold, it is included without a mapped term at the end of the output. + All default values, if they exist, can be seen above. ### Return Value @@ -185,9 +165,6 @@ As of version 1.2.0, text2term includes regex-based preprocessing functionality Like the "map" functions above, the two functions differ on whether the input is a file or a list of strings: ```python -preprocess_file(file_path, template_path, output_file='', blocklist_path='', blocklist_char='', rem_duplicates=False) -``` -```python preprocess_terms(terms, template_path, output_file='', blocklist_path='', blocklist_char='', rem_duplicates=False) ``` ```python @@ -202,7 +179,7 @@ NOTE: As of version 2.1.0, the arguments were changed to "blocklist" from "black The Remove Duplicates `rem_duplicates` functionality will remove all duplicate terms after processing, if set to `True`. WARNING: Removing duplicates at any point does not guarantee which original term is kept. This is particularly important if original terms have different tags, so user caution is advised. 
-The functions `preprocess_file()` and `preprocess_terms()` both return a dictionary where the keys are the original terms and the values are the preprocessed terms. +The function `preprocess_terms()` returns a dictionary where the keys are the original terms and the values are the preprocessed terms. The `preprocess_tagged_terms()` function returns a list of TaggedTerm items with the following function contracts: ```python def __init__(self, term=None, tags=[], original_term=None, source_term_id=None) @@ -214,10 +191,19 @@ def get_term(self) def get_tags(self) def get_source_term_id(self) ``` -As mentioned in the mapping section above, this can then be passed directly to map_tagged_terms(), allowing for easy programmatic usage. Note that this allows multiple of the same preprocessed term with different tags. +As mentioned in the mapping section above, this can then be passed directly to `map_terms`, allowing for easy programmatic usage. Note that this allows multiple of the same preprocessed term with different tags. **Note on NA values in input**: As of v2.0.3, when the input to text2term is a table file, any rows that contain `NA` values in the specified term column, or in the term ID column (if provided), will be ignored. +### Tag Usage +As of 3.0.0, some tags have additional functionality that is added when attached to a term: + +IGNORE: + If an ignore tag is added to a term, that term will not be mapped to any terms in the ontology. It will only be included in the output if the `incl_unmapped` argument is True. 
Here are the following values that count as ignore tags: +```python + IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "] +``` + ## Command Line Usage After installation, execute the tool from a command line as follows: diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..ded1330 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,27 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = 'text2term' +copyright = '2023, Harvard Medical School' +author = 'Rafael Goncalves and Jason Payne' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ["myst_parser"] + +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = 'alabaster' +html_static_path = ['_static'] diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..6456e30 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,20 @@ +.. text2term documentation master file, created by + sphinx-quickstart on Tue Jul 11 10:34:29 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to text2term's documentation! +===================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..32bb245 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/test/simple-test.py b/test/simple-test.py index 7143c1f..44577d0 100644 --- a/test/simple-test.py +++ b/test/simple-test.py @@ -5,14 +5,16 @@ def main(): efo = "http://www.ebi.ac.uk/efo/efo.owl#" pizza = "https://protege.stanford.edu/ontologies/pizza/pizza.owl" ncit = "http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl" - # print(bioregistry.get_owl_download("eFo")) if not text2term.cache_exists("EFO"): cached_onto = text2term.cache_ontology("EFO") # df = cached_onto.map_terms(["asthma", "disease location", "obsolete food allergy"], excl_deprecated=True, term_type="classes") print("Cache exists:", cached_onto.cache_exists()) # caches = text2term.cache_ontology_set("text2term/resources/ontologies.csv") - df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", min_score=.8, mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, term_type="classes") + # df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", min_score=.8, mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, term_type="classes") # df = text2term.map_terms(["contains", "asthma"], "EFO", term_type="classes") + df = text2term.map_terms({"asthma":"disease", "allergy":["ignore", "response"], "assdhfbswif":["sent"], "isdjfnsdfwd":""}, "EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) + # taggedterms = 
text2term.preprocess_tagged_terms("test/simple_preprocess.txt") + # df = text2term.map_terms(taggedterms, "EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) print(df.to_string()) if __name__ == '__main__': diff --git a/test/simple_preprocess.txt b/test/simple_preprocess.txt new file mode 100644 index 0000000..fdd7467 --- /dev/null +++ b/test/simple_preprocess.txt @@ -0,0 +1,3 @@ +asthma;:;disease +acute bronchitis;:;important,tags +colon disease diff --git a/text2term/__init__.py b/text2term/__init__.py index 9e3c8a0..33b75b5 100644 --- a/text2term/__init__.py +++ b/text2term/__init__.py @@ -1,12 +1,9 @@ from .t2t import map_terms -from .t2t import map_file -from .t2t import map_tagged_terms from .t2t import cache_ontology from .onto_cache import cache_ontology_set from .onto_cache import cache_exists from .onto_cache import clear_cache from .mapper import Mapper -from .preprocess import preprocess_file from .preprocess import preprocess_terms from .preprocess import preprocess_tagged_terms from .tagged_terms import TaggedTerm \ No newline at end of file diff --git a/text2term/preprocess.py b/text2term/preprocess.py index 17ea7ea..44e4f0f 100644 --- a/text2term/preprocess.py +++ b/text2term/preprocess.py @@ -3,32 +3,11 @@ from enum import Enum from .tagged_terms import TaggedTerm -def preprocess_file(file_path, template_path, output_file="", blocklist_path="", \ - blocklist_char='', blacklist_path="", blacklist_char='', \ - rem_duplicates=False): - # Allows backwards compatibility to blacklist. 
Will eventually be deleted - if blocklist_char == '': - blocklist_char = blacklist_char - if blocklist_path == "": - blocklist_path = blacklist_path - terms = _get_values(file_path) - processed_terms = preprocess_terms(terms, template_path, output_file=output_file, \ - blocklist_path=blocklist_path, blocklist_char=blocklist_char, \ - rem_duplicates=rem_duplicates) - - return processed_terms - ## Tags should be stored with their terms in the same line, delineated by ";:;" ## ex: Age when diagnosed with (.*) ;:; age,diagnosis ## "Age when diagnosed with cancer" becomes: {"cancer", ["age", "diagnosis"]} def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", \ - blocklist_char='', blacklist_path="", blacklist_char='', \ - rem_duplicates=False, separator=";:;"): - # Allows backwards compatibility to blacklist. Will eventually be deleted - if blocklist_char == '': - blocklist_char = blacklist_char - if blocklist_path == "": - blocklist_path = blacklist_path + blocklist_char='', rem_duplicates=False, separator=";:;"): # Seperate tags from the terms, put in TaggedTerm and add to list raw_terms = _get_values(file_path) terms = [] @@ -80,13 +59,9 @@ def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", \ return processed_terms def preprocess_terms(terms, template_path, output_file="", blocklist_path="", \ - blocklist_char='', blacklist_path="", blacklist_char='', \ - rem_duplicates=False): - # Allows backwards compatibility to blacklist. 
Will eventually be deleted - if blocklist_char == '': - blocklist_char = blacklist_char - if blocklist_path == "": - blocklist_path = blacklist_path + blocklist_char='', rem_duplicates=False): + if isinstance(terms, str): + terms = _get_values(file_path) # Form the templates as regular expressions template_strings = [] if template_path != "": diff --git a/text2term/t2t.py b/text2term/t2t.py index 12cc402..cb7b18c 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -16,99 +16,11 @@ from text2term.tfidf_mapper import TFIDFMapper from text2term.zooma_mapper import ZoomaMapper from text2term.config import VERSION +from text2term.tagged_terms import TaggedTerm +from text2term.term_mapping import TermMapping -""" -Maps the terms in the given input file to the specified target ontology. - -Parameters ----------- -input_file : str - Path to input file containing 'source' terms to map to ontology terms (list of terms or CSV file) -target_ontology : str - Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, - provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies -base_iris : tuple - Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: - ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') -csv_columns : tuple - Name of the column that contains the terms to map, optionally followed by the name of the column that - contains identifiers for the terms (eg 'my_terms,my_term_ids') -separator : str - Specifies the cell separator to be used when reading a non-comma-separated tabular file -excl_deprecated : bool - Exclude ontology terms stated as deprecated via `owl:deprecated true` -mapper : mapper.Mapper - Method used to compare source terms with ontology terms. 
One of: levenshtein, jaro, jarowinkler, jaccard, - fuzzy, tfidf, zooma, bioportal -max_mappings : int - Maximum number of top-ranked mappings returned per source term -min_score : float - Minimum similarity score [0,1] for the mappings (1=exact match) -output_file : str - Path to desired output file for the mappings -save_graphs : bool - Save vis.js graphs representing the neighborhood of each ontology term -save_mappings : bool - Save the generated mappings to a file (specified by `output_file`) - -Returns ----------- -df - Data frame containing the generated ontology mappings -""" -def map_file(input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, - mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False, - separator=',', use_cache=False, term_type='classes'): - source_terms, source_terms_ids = _load_data(input_file, csv_columns, separator) - return map_terms(source_terms, target_ontology, source_terms_ids=source_terms_ids, base_iris=base_iris, - excl_deprecated=excl_deprecated, max_mappings=max_mappings, mapper=mapper, min_score=min_score, - output_file=output_file, save_graphs=save_graphs, save_mappings=save_mappings, - use_cache=use_cache, term_type=term_type) - -""" -All parameters are the same as below, but tagged_terms_dict is a dictionary where the - key is the source term and the value is a list of all tags (or a single string for - one tag). It can also be a list of TaggedTerm objects. - The dataframe returned is the same but contains a tags column -""" -def map_tagged_terms(tagged_terms_dict, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, - mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), use_cache=False, - term_type='classes'): - # If the input is a dict, use keys. 
If it is a list, it is a list of TaggedTerms - if isinstance(tagged_terms_dict, dict): - terms = list(tagged_terms_dict.keys()) - else: - terms = [] - source_terms_id_list = [] - for tagged_term in tagged_terms_dict: - terms.append(tagged_term.get_term()) - if tagged_term.get_source_term_id() != None: - source_terms_id_list.append(tagged_term.get_source_term_id()) - if len(source_terms_id_list) > 0: - source_terms_ids = tuple(source_terms_id_list) - - # Run the mapper - df = map_terms(terms, target_ontology, base_iris=base_iris, excl_deprecated=excl_deprecated, \ - max_mappings=max_mappings, min_score=min_score, mapper=mapper, output_file=output_file, \ - save_graphs=save_graphs, source_terms_ids=source_terms_ids, use_cache=use_cache, \ - term_type=term_type) - - # For each term in dict, add tags to corresponding mappings row in "Tags" Column - if isinstance(tagged_terms_dict, dict): - for key, value in tagged_terms_dict.items(): - if isinstance(value, list): - to_store = ','.join(value) - else: - to_store = str(value) - df.loc[df['Source Term'] == key, "Tags"] = to_store - else: - for term in tagged_terms_dict: - to_store = ','.join(term.get_tags()) - df.loc[df['Source Term'] == term.get_term(), "Tags"] = to_store - - if save_mappings: - _save_mappings(df, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings, term_type) - return df +IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "] +UNMAPPED_TAG = "unmapped" """ Maps the terms in the given list to the specified target ontology. 
def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3,
              min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False,
              source_terms_ids=(), separator=',', use_cache=False, term_type='classes', incl_unmapped=False):
    """
    Maps the given source terms to the specified target ontology.

    Parameters
    ----------
    source_terms : str, list or dict
        Path to a file containing the terms to map, a list of terms, a dictionary of terms to tag(s),
        or a list of TaggedTerm objects
    target_ontology : str
        Path or URL of the 'target' ontology. When the chosen mapper is BioPortal or Zooma, 'all' is
        translated to an empty ontology restriction
    base_iris : tuple
        Map only to ontology terms whose IRIs start with one of the strings given in this tuple
    csv_columns : tuple
        Column(s) to read when `source_terms` is a path to a table file
    excl_deprecated : bool
        Exclude ontology terms stated as deprecated
    max_mappings : int
        Maximum number of top-ranked mappings returned per source term
    min_score : float
        Minimum similarity score for the mappings
    mapper : mapper.Mapper
        Method used to compare source terms with ontology terms
    output_file : str
        Path to the output file; a timestamped name is generated when empty
    save_graphs : bool
        Save graphs of the target terms via `_save_graphs`
    save_mappings : bool
        Save the generated mappings to `output_file`
    source_terms_ids : tuple
        Identifiers for the source terms; regenerated when their number does not match the terms
    separator : str
        Cell separator used when `source_terms` is a path to a table file
    use_cache : bool
        Use the cached version of the ontology
    term_type : str
        Which kind of ontology terms to load (passed through to `_load_ontology`)
    incl_unmapped : bool
        Include the source terms that ended up without a mapping in the output

    Returns
    ----------
    df
        Data frame containing the generated ontology mappings
    """
    # Parse the possible source terms options and tags. The identifiers are rebound to
    # `source_terms_ids` so that IDs read from a file or from TaggedTerm objects are
    # actually used by the code below.
    source_terms, source_terms_ids, tags = _parse_source_terms(source_terms, source_terms_ids, csv_columns, separator)
    # Create Source Term Ids if they are not provided (or do not line up with the terms)
    if len(source_terms_ids) != len(source_terms):
        if len(source_terms_ids) > 0:
            sys.stderr.write("Warning: Source Term Ids are non-zero, but will not be used.")
        source_terms_ids = onto_utils.generate_iris(len(source_terms))
    # Create the output file name
    if output_file == '':
        timestamp = datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S")
        output_file = "t2t-mappings-" + timestamp + ".csv"
    # Load the ontology for either Zooma, Bioportal, or directly
    if mapper in {Mapper.ZOOMA, Mapper.BIOPORTAL}:
        target_terms = '' if target_ontology.lower() == 'all' else target_ontology
    else:
        target_terms = _load_ontology(target_ontology, base_iris, excl_deprecated, use_cache, term_type)
    # Run the mapper
    mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score,
                              tags, incl_unmapped)
    mappings_df["Mapping Score"] = mappings_df["Mapping Score"].astype(float).round(decimals=3)
    if save_mappings:
        _save_mappings(mappings_df, output_file, min_score, mapper, target_ontology, base_iris,
                       excl_deprecated, max_mappings, term_type, source_terms, incl_unmapped)
    if save_graphs:
        _save_graphs(target_terms, output_file)
    return mappings_df
def _parse_source_terms(source_terms, source_terms_ids=(), csv_columns=(), separator=','):
    """
    Normalises the accepted `source_terms` inputs into terms, identifiers and tags.

    Parameters
    ----------
    source_terms : str, dict or list
        File path (str), dictionary of term -> tag(s), list of TaggedTerm objects, or list of strings
    source_terms_ids : tuple
        Identifiers supplied by the caller; replaced by the ones read from a file, or by the ones
        attached to TaggedTerm objects when at least one of them carries an identifier
    csv_columns : tuple
        Passed to `_load_data` when `source_terms` is a file path
    separator : str
        Passed to `_load_data` when `source_terms` is a file path

    Returns
    ----------
    tuple
        (terms to be mapped, source term identifiers, tags), where tags is either a dictionary of
        term -> tag(s) or the given list of TaggedTerm objects
    """
    # If source_terms is a string, we assume it is a file location
    if isinstance(source_terms, str):
        terms, source_terms_ids = _load_data(source_terms, csv_columns, separator)
        tags = dict.fromkeys(terms)
    # If source_terms is a dictionary, the keys are terms and the values are tags
    elif isinstance(source_terms, dict):
        terms = list(source_terms.keys())
        # Shallow copy: tags are added to this dictionary later on (e.g. for unmapped terms),
        # which should not leak into the dictionary owned by the caller
        tags = dict(source_terms)
    # Otherwise, it is a list of either TaggedTerms or strings. The emptiness check comes first
    # so that an empty list does not raise an IndexError
    elif len(source_terms) > 0 and isinstance(source_terms[0], TaggedTerm):
        terms = [tagged_term.get_term() for tagged_term in source_terms]
        tagged_ids = [tagged_term.get_source_term_id() for tagged_term in source_terms
                      if tagged_term.get_source_term_id() is not None]
        # Only override the caller-provided identifiers when the TaggedTerms actually carry some
        if tagged_ids:
            source_terms_ids = tagged_ids
        tags = source_terms
    else:
        terms = source_terms
        tags = dict.fromkeys(terms)
    return terms, source_terms_ids, tags
def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score, tags, incl_unmapped):
    """
    Runs the chosen mapper over the source terms and post-processes the resulting data frame.

    Terms carrying an ignore tag are not sent to the mapper. The mappings are then filtered by
    `min_score`, unmapped terms are appended when `incl_unmapped` is True, and the tags are added.

    Raises
    ----------
    ValueError
        If `mapper` is not one of the supported mappers
    """
    to_map, tags = _process_tags(source_terms, tags)
    # Keep the identifiers aligned with the terms that are actually mapped.
    # NOTE(review): assumes the mappers pair terms and identifiers by position -- confirm.
    if len(to_map) != len(source_terms):
        kept_terms = set(to_map)
        to_map_ids = [term_id for term, term_id in zip(source_terms, source_term_ids) if term in kept_terms]
    else:
        to_map_ids = source_term_ids

    if mapper == Mapper.TFIDF:
        term_mapper = TFIDFMapper(ontology_terms)
        mappings_df = term_mapper.map(to_map, to_map_ids, max_mappings=max_mappings, min_score=min_score)
    elif mapper == Mapper.ZOOMA:
        term_mapper = ZoomaMapper()
        mappings_df = term_mapper.map(to_map, to_map_ids, ontologies=ontology_terms, max_mappings=max_mappings)
    elif mapper == Mapper.BIOPORTAL:
        # NOTE(review): hard-coded API key; consider loading it from configuration instead
        term_mapper = BioPortalAnnotatorMapper("8f0cbe43-2906-431a-9572-8600d3f4266e")
        mappings_df = term_mapper.map(to_map, to_map_ids, ontologies=ontology_terms, max_mappings=max_mappings)
    elif mapper in {Mapper.LEVENSHTEIN, Mapper.JARO, Mapper.JARO_WINKLER, Mapper.INDEL, Mapper.FUZZY, Mapper.JACCARD}:
        term_mapper = SyntacticMapper(ontology_terms)
        mappings_df = term_mapper.map(to_map, to_map_ids, mapper, max_mappings=max_mappings)
    else:
        # str() because the mapper may not be a string, in which case "+" would raise a TypeError
        raise ValueError("Unsupported mapper: " + str(mapper))

    # Filter, add unmapped terms and tags. Every step works on the result of the previous one,
    # so that the min_score filter is not discarded.
    df = _filter_mappings(mappings_df, min_score)
    if incl_unmapped:
        df = _add_unmapped_terms(df, tags, source_terms, source_term_ids)
    df = _add_tags_to_df(df, tags)
    return df


# Takes in the tags and source terms and processes them accordingly
def _process_tags(source_terms, tags):
    """
    Returns the source terms that should be mapped (those without an ignore tag) and the tags.

    `tags` is either a dictionary of term -> tag(s) or a list of TaggedTerm objects.
    """
    to_map = []
    # IGNORE TAGS SECTION
    for term in source_terms:
        if isinstance(tags, dict):
            term_tags = tags.get(term)
        else:
            # First TaggedTerm with this term; None when there is no match, instead of an
            # unbound (or stale) value from a previous iteration
            term_tags = next((tagged.get_tags() for tagged in tags if tagged.get_term() == term), None)
        if isinstance(term_tags, list):
            ignored = any(tag in IGNORE_TAGS for tag in term_tags)
        else:
            ignored = term_tags in IGNORE_TAGS
        if not ignored:
            to_map.append(term)
    return to_map, tags
def _add_tags_to_df(df, tags):
    """
    Writes the tags of each term to the "Tags" column of the rows whose 'Source Term' matches.

    `tags` is either a dictionary of term -> tag(s) or a list of TaggedTerm objects. Lists of tags
    are stored comma-separated; a term without tags (None) gets an empty string.
    """
    if isinstance(tags, dict):
        for term, value in tags.items():
            if value is None:
                # Terms given without tags are stored with None; do not write the text "None"
                to_store = ""
            elif isinstance(value, list):
                to_store = ','.join(str(tag) for tag in value)
            else:
                to_store = str(value)
            df.loc[df['Source Term'] == term, "Tags"] = to_store
    else:
        for tagged_term in tags:
            to_store = ','.join(tagged_term.get_tags())
            df.loc[df['Source Term'] == tagged_term.get_term(), "Tags"] = to_store
    return df


def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids):
    """
    Appends a row without a mapped term for every source term that is absent from `mappings_df`.

    Each such term is also given the unmapped tag. The data frame is modified in place and returned.
    """
    mapped = set(mappings_df["Source Term"])
    for term, term_id in zip(source_terms, source_terms_ids):
        if term not in mapped:
            non_mapping = TermMapping(term, term_id, "", "", 0)
            _add_tag(tags, term, UNMAPPED_TAG, ignore=True)
            mappings_df.loc[len(mappings_df.index)] = non_mapping.to_dict()
    return mappings_df


def _add_tag(tags, term, to_add, ignore=False):
    """
    Adds the tag `to_add` to the given term, unless the term carries an ignore tag.

    `tags` is either a dictionary of term -> tag(s), which is updated with a new list of tags,
    or a list of TaggedTerm objects.
    """
    if isinstance(tags, dict):
        current = tags.get(term)
        # Normalise the stored value (None, "", a single string or a list) into a new list, so
        # that None does not crash, a single string is not scanned character by character, and
        # a list owned by the caller is not modified in place
        if current is None or current == "":
            current_tags = []
        elif isinstance(current, list):
            current_tags = list(current)
        else:
            current_tags = [current]
        if not any(tag in IGNORE_TAGS for tag in current_tags):
            tags[term] = current_tags + [to_add]
    else:
        for tagged_term in tags:
            # NOTE(review): with ignore=True this condition is always False, so TaggedTerm inputs
            # never receive the tag, unlike dictionary inputs -- confirm the intended meaning of `ignore`
            check_ignore = not ignore and not any(tagged_term.has_tag(tag) for tag in IGNORE_TAGS)
            if tagged_term.get_term() == term and check_ignore:
                tagged_term.add_tags([to_add])
def has_tag(self, tag):
    """Returns True if the given tag is among the tags of this term."""
    return tag in self.tags

def to_dict(self):
    """Returns this tagged term as a single-entry dictionary of {term: tags}."""
    # The bare names `term` and `tags` are not defined in this scope; read them from the instance
    return {self.get_term(): self.get_tags()}