Merge pull request #36 from ccb-hms/development
Remove extra functions and add unmapped
paynejason authored Aug 3, 2023
2 parents d3fe0a3 + d2634fb commit 71349e4
Showing 13 changed files with 304 additions and 182 deletions.
32 changes: 32 additions & 0 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Set the OS, Python version and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.11"
# You can also specify other tool versions:
# nodejs: "19"
# rust: "1.64"
# golang: "1.19"

# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: docs/conf.py

# Optionally build your docs in additional formats such as PDF and ePub
# formats:
# - pdf
# - epub

# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- requirements: requirements.txt
76 changes: 31 additions & 45 deletions README.md
@@ -13,13 +13,14 @@ pip install text2term
import text2term
import pandas

df1 = text2term.map_file("test/unstruct_terms.txt", "http://www.ebi.ac.uk/efo/efo.owl")
df1 = text2term.map_terms("test/unstruct_terms.txt", "http://www.ebi.ac.uk/efo/efo.owl")
df2 = text2term.map_terms(["asthma", "acute bronchitis"], "http://www.ebi.ac.uk/efo/efo.owl")
df3 = text2term.map_terms({"asthma":"disease", "acute bronchitis":["disease", "lungs"]}, "http://www.ebi.ac.uk/efo/efo.owl")
```
Below is an example of caching, assuming the same imports as above:
```python
text2term.cache_ontology("http://www.ebi.ac.uk/efo/efo.owl", "EFO")
df1 = text2term.map_file("test/unstruct_terms.txt", "EFO", use_cache=True)
df1 = text2term.map_terms("test/unstruct_terms.txt", "EFO", use_cache=True)
df2 = text2term.map_terms(["asthma", "acute bronchitis"], "EFO", use_cache=True)
text2term.clear_cache("EFO")
```
@@ -48,10 +49,10 @@ Then, after running this, the following command is equivalent:
`python text2term -s test/unstruct_terms.txt -t EFO`

## Programmatic Usage
The tool can be executed in Python with any of the three following functions:
The tool can be executed in Python with the `map_terms` function:

```python
text2term.map_file(input_file='/some/file.txt',
text2term.map_terms(source_terms,
target_ontology='http://some.ontology/v1.owl',
base_iris=(),
csv_columns=(),
@@ -64,45 +65,15 @@ text2term.map_file(input_file='/some/file.txt',
save_mappings=False,
separator=',',
use_cache=False,
term_type='classes')
```
or
```python
text2term.map_terms(source_terms=['term one', 'term two'],
target_ontology='http://some.ontology/v1.owl',
base_iris=(),
excl_deprecated=False,
max_mappings=3,
min_score=0.3,
mapper=Mapper.TFIDF,
output_file='',
save_graphs=False,
save_mappings=False,
source_terms_ids=(),
use_cache=False,
term_type='classes')
```
or
```python
text2term.map_tagged_terms(tagged_terms_dict={'term one': ["tag 1", "tag 2"]},
target_ontology='http://some.ontology/v1.owl',
base_iris=(),
excl_deprecated=False,
max_mappings=3,
min_score=0.3,
mapper=Mapper.TFIDF,
output_file='',
save_graphs=False,
save_mappings=False,
source_terms_ids=(),
use_cache=False,
term_type='classes')
term_type='classes',
incl_unmapped=False)

```
NOTE: As of 3.0.0, the former three functions (`map_file`, `map_terms`, `map_tagged_terms`) have been condensed into the single function `map_terms`. Renaming any of the three in old code to `map_terms` preserves its behavior, since the function inspects the type of its input to determine how to process it.
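Assuming dispatch happens by input type, the note above can be pictured as a simple type check (an illustrative sketch with a hypothetical helper name, not the library's actual code):

```python
def resolve_input_kind(source_terms):
    """Illustrative: classify map_terms input the way the unified function presumably does."""
    if isinstance(source_terms, str):
        return "file"    # path to a file of terms (former map_file behavior)
    if isinstance(source_terms, dict):
        return "tagged"  # term -> tag(s) mapping (former map_tagged_terms behavior)
    return "list"        # plain list of terms, or a list of TaggedTerm objects
```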

### Arguments
For `map_file`, the first argument 'input_file' specifies a path to a file containing the terms to be mapped. It also has a `csv_column` argument that allows the user to specify a column to map if a csv is passed in as the input file.
For `map_terms`, the first argument 'source_terms' takes in a list of the terms to be mapped.
For `map_tagged_terms`, everything is the same as `map_terms` except the first argument is either a dictionary of terms to a list of tags, or a list of TaggedTerm objects (see below). Currently, the tags do not affect the mapping in any way, but they are added to the output dataframe at the end of the process.
For `map_terms`, the first argument can be any of the following: 1) a string specifying a path to a file containing the terms to be mapped, 2) a list of the terms to be mapped, 3) a dictionary mapping each term to a list of tags, or 4) a list of TaggedTerm objects (see below).
Currently, tags do not affect the mapping in any way, but they are added to the output dataframe at the end of the process. The exception is the Ignore tag, which prevents a term from being mapped at all; the term still appears in the results if the `incl_unmapped` argument is True (see below).

All other arguments are the same, and have the same functionality:

@@ -115,6 +86,9 @@ All other arguments are the same, and have the same functionality:
Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example:
('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP')

`csv_columns` : tuple
Specifies the column to map when a CSV file is passed as the input. Ignored if the input is not a file path.

`source_terms_ids` : tuple
Collection of identifiers for the given source terms
WARNING: While this argument is still available for tagged-term input, dictionaries do not necessarily preserve order, so its use there is not recommended. If using TaggedTerm objects, source term IDs can be attached to them to guarantee order.
@@ -141,12 +115,18 @@ All other arguments are the same, and have the same functionality:
`save_mappings` : bool
Save the generated mappings to a file (specified by `output_file`)

`separator` : str
Character that separates the source term values if a file input is given. Ignored if the input is not a file path.

`use_cache` : bool
Use the cache for the ontology. More details are below.

`term_type` : str
Determines whether the ontology should be parsed for its classes (ThingClass), properties (PropertyClass), or both. Possible values are 'classes', 'properties', or 'both'; any other value raises a ValueError.

`incl_unmapped` : bool
Include all unmapped terms in the output. Terms that have been tagged Ignore (see below) or whose matches fall below the `min_score` threshold are appended to the end of the output without a mapped term.

All default values, if they exist, can be seen above.
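Putting `min_score`, tags, and `incl_unmapped` together, the output assembly can be sketched as follows (a hypothetical helper for illustration only, not the library's code):

```python
def collect_results(source_terms, mappings, incl_unmapped=False):
    """Illustrative: mappings holds term -> (ontology term IRI, score) for mapped terms."""
    rows = [(term,) + mappings[term] for term in source_terms if term in mappings]
    if incl_unmapped:
        # Ignored terms and terms scoring below min_score are appended without a mapped term
        rows += [(term, None, None) for term in source_terms if term not in mappings]
    return rows
```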

### Return Value
@@ -185,9 +165,6 @@ As of version 1.2.0, text2term includes regex-based preprocessing functionality

Like `map_terms` above, the preprocessing functions accept their input either as a file or as a list of strings:
```python
preprocess_file(file_path, template_path, output_file='', blocklist_path='', blocklist_char='', rem_duplicates=False)
```
```python
preprocess_terms(terms, template_path, output_file='', blocklist_path='', blocklist_char='', rem_duplicates=False)
```
```python
@@ -202,7 +179,7 @@ NOTE: As of version 2.1.0, the arguments were changed to "blocklist" from "black
The `rem_duplicates` option, if set to `True`, removes all duplicate terms after processing.
WARNING: Removing duplicates at any point does not guarantee which original term is kept. This is particularly important if original terms have different tags, so user caution is advised.
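The non-determinism warned about above follows from keyed deduplication: whichever original term is encountered first for a given preprocessed value wins. A sketch with a hypothetical helper name:

```python
def remove_duplicates_sketch(processed):
    """Illustrative: processed maps original term -> preprocessed term."""
    first_seen = {}
    for original, preprocessed in processed.items():
        # Only the first original term producing each preprocessed value is kept
        first_seen.setdefault(preprocessed, original)
    return {original: preprocessed for preprocessed, original in first_seen.items()}
```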

The functions `preprocess_file()` and `preprocess_terms()` both return a dictionary where the keys are the original terms and the values are the preprocessed terms.
The function `preprocess_terms()` returns a dictionary where the keys are the original terms and the values are the preprocessed terms.
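The template mechanism can be sketched like this (an illustrative reimplementation, not the library's code; the template follows the "Age when diagnosed with (.*)" example documented in `preprocess.py`):

```python
import re

def preprocess_terms_sketch(terms, templates):
    """Illustrative: return {original term: preprocessed term} using regex templates."""
    processed = {}
    for term in terms:
        result = term
        for template in templates:
            match = re.fullmatch(template, term)
            if match and match.groups():
                result = match.group(1)  # keep only the captured portion
                break
        processed[term] = result
    return processed
```

With the template `r"Age when diagnosed with (.*)"`, the term "Age when diagnosed with cancer" is reduced to "cancer", while non-matching terms pass through unchanged.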
The `preprocess_tagged_terms()` function returns a list of TaggedTerm items with the following function contracts:
```python
def __init__(self, term=None, tags=[], original_term=None, source_term_id=None)
@@ -214,10 +191,19 @@ def get_term(self)
def get_tags(self)
def get_source_term_id(self)
```
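Read as a class, the contract above amounts to the following minimal stand-in (a sketch for illustration only; the real class lives in `text2term.tagged_terms`):

```python
class TaggedTermSketch:
    """Illustrative stand-in mirroring the documented TaggedTerm contract."""
    def __init__(self, term=None, tags=None, original_term=None, source_term_id=None):
        self.term = term
        self.tags = list(tags) if tags is not None else []  # avoid a shared mutable default
        self.original_term = original_term if original_term is not None else term
        self.source_term_id = source_term_id

    def get_term(self):
        return self.term

    def get_tags(self):
        return self.tags

    def get_source_term_id(self):
        return self.source_term_id
```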
As mentioned in the mapping section above, this can then be passed directly to map_tagged_terms(), allowing for easy programmatic usage. Note that this allows multiple of the same preprocessed term with different tags.
As mentioned in the mapping section above, this list can be passed directly to `map_terms`, allowing for easy programmatic usage. Note that this permits multiple instances of the same preprocessed term with different tags.

**Note on NA values in input**: As of v2.0.3, when the input to text2term is a table file, any rows that contain `NA` values in the specified term column, or in the term ID column (if provided), will be ignored.

### Tag Usage
As of 3.0.0, some tags carry additional functionality when attached to a term:

IGNORE:
If an ignore tag is attached to a term, that term will not be mapped to any term in the ontology. It will only be included in the output if the `incl_unmapped` argument is True. The following values count as ignore tags:
```python
IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "]
```
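Given the list above, the ignore check presumably reduces to a membership test (illustrative sketch; `is_ignored` is a hypothetical helper name):

```python
IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "]

def is_ignored(tags):
    """True if any of a term's tags marks the term as ignored."""
    return any(tag in IGNORE_TAGS for tag in tags)
```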

## Command Line Usage

After installation, execute the tool from a command line as follows:
20 changes: 20 additions & 0 deletions docs/Makefile
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build

# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
27 changes: 27 additions & 0 deletions docs/conf.py
@@ -0,0 +1,27 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = 'text2term'
copyright = '2023, Harvard Medical School'
author = 'Rafael Goncalves and Jason Payne'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = ["myst_parser"]

templates_path = ['_templates']
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']



# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'alabaster'
html_static_path = ['_static']
20 changes: 20 additions & 0 deletions docs/index.rst
@@ -0,0 +1,20 @@
.. text2term documentation master file, created by
sphinx-quickstart on Tue Jul 11 10:34:29 2023.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to text2term's documentation!
=====================================

.. toctree::
:maxdepth: 2
:caption: Contents:



Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
35 changes: 35 additions & 0 deletions docs/make.bat
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)

if "%1" == "" goto help

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
6 changes: 4 additions & 2 deletions test/simple-test.py
@@ -5,14 +5,16 @@ def main():
efo = "http://www.ebi.ac.uk/efo/efo.owl#"
pizza = "https://protege.stanford.edu/ontologies/pizza/pizza.owl"
ncit = "http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl"
# print(bioregistry.get_owl_download("eFo"))
if not text2term.cache_exists("EFO"):
cached_onto = text2term.cache_ontology("EFO")
# df = cached_onto.map_terms(["asthma", "disease location", "obsolete food allergy"], excl_deprecated=True, term_type="classes")
print("Cache exists:", cached_onto.cache_exists())
# caches = text2term.cache_ontology_set("text2term/resources/ontologies.csv")
df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", min_score=.8, mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, term_type="classes")
# df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", min_score=.8, mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, term_type="classes")
# df = text2term.map_terms(["contains", "asthma"], "EFO", term_type="classes")
df = text2term.map_terms({"asthma":"disease", "allergy":["ignore", "response"], "assdhfbswif":["sent"], "isdjfnsdfwd":""}, "EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True)
# taggedterms = text2term.preprocess_tagged_terms("test/simple_preprocess.txt")
# df = text2term.map_terms(taggedterms, "EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True)
print(df.to_string())

if __name__ == '__main__':
3 changes: 3 additions & 0 deletions test/simple_preprocess.txt
@@ -0,0 +1,3 @@
asthma;:;disease
acute bronchitis;:;important,tags
colon disease
3 changes: 0 additions & 3 deletions text2term/__init__.py
@@ -1,12 +1,9 @@
from .t2t import map_terms
from .t2t import map_file
from .t2t import map_tagged_terms
from .t2t import cache_ontology
from .onto_cache import cache_ontology_set
from .onto_cache import cache_exists
from .onto_cache import clear_cache
from .mapper import Mapper
from .preprocess import preprocess_file
from .preprocess import preprocess_terms
from .preprocess import preprocess_tagged_terms
from .tagged_terms import TaggedTerm
33 changes: 4 additions & 29 deletions text2term/preprocess.py
@@ -3,32 +3,11 @@
from enum import Enum
from .tagged_terms import TaggedTerm

def preprocess_file(file_path, template_path, output_file="", blocklist_path="", \
blocklist_char='', blacklist_path="", blacklist_char='', \
rem_duplicates=False):
# Allows backwards compatibility to blacklist. Will eventually be deleted
if blocklist_char == '':
blocklist_char = blacklist_char
if blocklist_path == "":
blocklist_path = blacklist_path
terms = _get_values(file_path)
processed_terms = preprocess_terms(terms, template_path, output_file=output_file, \
blocklist_path=blocklist_path, blocklist_char=blocklist_char, \
rem_duplicates=rem_duplicates)

return processed_terms

## Tags should be stored with their terms in the same line, delineated by ";:;"
## ex: Age when diagnosed with (.*) ;:; age,diagnosis
## "Age when diagnosed with cancer" becomes: {"cancer", ["age", "diagnosis"]}
def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", \
blocklist_char='', blacklist_path="", blacklist_char='', \
rem_duplicates=False, separator=";:;"):
# Allows backwards compatibility to blacklist. Will eventually be deleted
if blocklist_char == '':
blocklist_char = blacklist_char
if blocklist_path == "":
blocklist_path = blacklist_path
blocklist_char='', rem_duplicates=False, separator=";:;"):
# Separate tags from the terms, put in TaggedTerm and add to list
raw_terms = _get_values(file_path)
terms = []
@@ -80,13 +59,9 @@ def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", \
return processed_terms

def preprocess_terms(terms, template_path, output_file="", blocklist_path="", \
blocklist_char='', blacklist_path="", blacklist_char='', \
rem_duplicates=False):
# Allows backwards compatibility to blacklist. Will eventually be deleted
if blocklist_char == '':
blocklist_char = blacklist_char
if blocklist_path == "":
blocklist_path = blacklist_path
blocklist_char='', rem_duplicates=False):
if isinstance(terms, str):
terms = _get_values(terms)
# Form the templates as regular expressions
template_strings = []
if template_path != "":
