Merge pull request #334 from helxplatform/feat/165-sapbert-annotator
Feat/165 sapbert annotator
YaphetKG authored Jan 3, 2024
2 parents 2b09941 + ceece16 commit 1c42441
Showing 34 changed files with 2,459 additions and 1,190 deletions.
11 changes: 5 additions & 6 deletions .github/workflows/code-checks.yml
@@ -42,7 +42,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
-python-version: '3.10'
+python-version: '3.12'

# Currently actions/setup-python supports caching
# but the cache is not as robust as cache action.
@@ -113,7 +113,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
-python-version: '3.10'
+python-version: '3.12'

- name: Install Requirements
run: |
@@ -123,8 +123,7 @@
- name: Test with pytest
run: |
-pytest --doctest-modules src
-coverage run -m pytest tests/unit
+make test
############################ Bandit ################################
bandit:
@@ -134,7 +133,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
-python-version: '3.10'
+python-version: '3.12'

- name: Install Requirements
run: |
@@ -145,4 +144,4 @@ jobs:
# Only report high security issues
- name: Test with Bandit
run: |
-bandit -r src -n3 -lll
+bandit -r src -n3 -lll
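The test job now delegates to the repository's Makefile instead of calling pytest directly. A minimal local sketch of the same checks, assuming the dev dependencies from requirements.txt are installed:

```shell
# Same entry point the workflow now uses (see the Makefile change below)
make test
# High-severity-only Bandit scan, mirroring the workflow step above
bandit -r src -n3 -lll
```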
1 change: 1 addition & 0 deletions .github/workflows/trivy-pr-scan.yml
@@ -55,6 +55,7 @@ jobs:
image-ref: '${{ github.repository }}:vuln-test'
format: 'sarif'
severity: 'CRITICAL,HIGH'
+ignore-unfixed: true
output: 'trivy-results.sarif'
exit-code: '1'
# Scan results should be viewable in GitHub Security Dashboard
10 changes: 5 additions & 5 deletions Dockerfile
@@ -3,19 +3,19 @@
# A container for the core semantic-search capability.
#
######################################################
-FROM python:3.10.10-slim
+FROM python:3.12.0-alpine3.18

# Install required packages
-RUN apt-get update && \
-apt-get install -y curl make vim && \
-rm -rf /var/cache/apt/*
+RUN apk update && \
+apk add g++ make

+RUN pip install --upgrade pip
# Create a non-root user.
ENV USER dug
ENV HOME /home/$USER
ENV UID 1000

-RUN adduser --disabled-login --home $HOME --shell /bin/bash --uid $UID $USER
+RUN adduser -D --home $HOME --uid $UID $USER

USER $USER
WORKDIR $HOME
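The base image switches from Debian slim to Alpine, so apt-get becomes apk and the adduser flags change to their BusyBox equivalents. A sketch of building the updated image locally; the tag `dug:local` is only illustrative:

```shell
# Build the Alpine-based image from the repository root (tag name is hypothetical)
docker build -t dug:local .
```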
2 changes: 0 additions & 2 deletions Makefile
@@ -40,8 +40,6 @@ install.dug:

#test: Run all tests
test:
-# ${PYTHON} -m flake8 src
-${PYTHON} -m pytest --doctest-modules src
coverage run -m pytest tests

coverage:
6 changes: 3 additions & 3 deletions README.md
@@ -57,13 +57,13 @@ dug crawl tests/integration/data/test_variables_v1.0.csv -p "TOPMedTag"

After crawling, you can search:
```shell
dug search -q "heart attack" -t "concepts"
dug search -q "heart attack" -t "variables" -k "concept=MONDO:0005068"
dug search -q "vein" -t "concepts"
dug search -q "vein" -t "variables" -k "concept=UBERON:0001638"
```

You can also query Dug's REST API:
```shell
query="`echo '{"index" : "concepts_index", "query" : "heart attack"}'`"
query="`echo '{"index" : "concepts_index", "query" : "vein"}'`"
curl --data "$query" \
--header "Content-Type: application/json" \
29 changes: 14 additions & 15 deletions requirements.txt
@@ -1,30 +1,29 @@
aiohttp
asyncio
-fastapi==0.95.0
-uvicorn==0.23.2
+fastapi
+uvicorn
elasticsearch[async]==8.5.2
gunicorn
itsdangerous
Jinja2
jsonschema
MarkupSafe
-ormar==0.12.1
-mistune==2.0.3
-pluggy==1.0.0
-pyrsistent==0.17.3
+ormar
+mistune
+pluggy
+pyrsistent
pytest
-pytz==2021.1
-PyYAML==6.0
-requests==2.31.0
-# old redis==4.4.2
-redis==4.5.1
-requests-cache==0.9.8
-six==1.16.0
+pytz
+PyYAML
+requests
+redis
+requests-cache
+six

# Click for command line arguments
# We use Click 7.0 because that's what one of the pinned packages above use.
click
-httpx>=0.24.1
+httpx
linkml-runtime==1.6.0
bmt==1.1.0
-urllib3>=1.26.17
+urllib3
10 changes: 5 additions & 5 deletions setup.cfg
@@ -17,23 +17,23 @@ classifiers =
package_dir =
= src
packages = find:
-python_requires = >=3.10
+python_requires = >=3.12
include_package_data = true
install_requires =
elasticsearch==8.5.2
pluggy
requests
-requests_cache==0.9.8
-redis==4.5.1
+requests_cache
+redis

[options.entry_points]
console_scripts =
dug = dug.cli:main

[options.extras_require]
rest =
-fastapi==0.95.0
-uvicorn==0.23.2
+fastapi
+uvicorn
gunicorn
jsonschema

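With the version pins dropped, fastapi and uvicorn now resolve to the newest compatible releases at install time. A sketch of an editable install that pulls in the `rest` extra defined above:

```shell
# Editable install including the REST extra from setup.cfg
pip install -e ".[rest]"
```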
9 changes: 8 additions & 1 deletion src/dug/cli.py
@@ -51,6 +51,13 @@ def get_argparser():
required=True
)

+crawl_parser.add_argument(
+'-a', '--annotator',
+help='Annotator used to annotate identifiers in crawl file',
+dest="annotator_type",
+default="annotator-monarch"
+)

crawl_parser.add_argument(
'-e', '--element-type',
help='[Optional] Coerce all elements to a certain data type (e.g. DbGaP Variable).\n'
@@ -108,7 +115,7 @@ def crawl(args):
config.node_to_element_queries = {}
factory = DugFactory(config)
dug = Dug(factory)
-dug.crawl(args.target, args.parser_type, args.element_type)
+dug.crawl(args.target, args.parser_type, args.annotator_type, args.element_type)


def search(args):
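The new `-a/--annotator` option is passed through to `Dug.crawl` as `annotator_type` and defaults to `annotator-monarch`. A usage sketch based on the crawl example in the README; the identifier for the new SapBERT annotator is not shown in this excerpt, so only the default value appears here:

```shell
# Crawl with an explicit annotator (equivalent to omitting -a, which uses the default)
dug crawl tests/integration/data/test_variables_v1.0.csv -p "TOPMedTag" -a "annotator-monarch"
```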
18 changes: 11 additions & 7 deletions src/dug/core/__init__.py
@@ -12,8 +12,10 @@

from dug import hookspecs
from dug.core import parsers
+from dug.core import annotators
from dug.core.factory import DugFactory
from dug.core.parsers import DugConcept, Parser, get_parser
+from dug.core.annotators import DugIdentifier, Annotator, get_annotator

logger = logging.getLogger('dug')
stdout_log_handler = logging.StreamHandler(sys.stdout)
@@ -29,6 +31,7 @@ def get_plugin_manager() -> pluggy.PluginManager:
pm.add_hookspecs(hookspecs)
pm.load_setuptools_entrypoints("dug")
pm.register(parsers)
+pm.register(annotators)
return pm


@@ -56,19 +59,20 @@ def __init__(self, factory: DugFactory):
]
)

-def crawl(self, target_name: str, parser_type: str, element_type: str = None):
+def crawl(self, target_name: str, parser_type: str, annotator_type: str, element_type: str = None):

pm = get_plugin_manager()
parser = get_parser(pm.hook, parser_type)
+annotator = get_annotator(pm.hook, annotator_type)
targets = get_targets(target_name)

for target in targets:
-self._crawl(target, parser, element_type)
+self._crawl(target, parser, annotator, element_type)

-def _crawl(self, target: Path, parser: Parser, element_type):
+def _crawl(self, target: Path, parser: Parser, annotator: Annotator, element_type):

# Initialize crawler
-crawler = self._factory.build_crawler(target, parser, element_type)
+crawler = self._factory.build_crawler(target, parser, annotator, element_type)
# Read elements, annotate, and expand using tranql queries
crawler.crawl()

@@ -93,11 +97,11 @@ def search(self, target, query, **kwargs):
event_loop = asyncio.get_event_loop()
targets = {
'concepts': partial(
-self._search.search_concepts, index=kwargs.get('index', self.concepts_index)),
+self._search.search_concepts),
'variables': partial(
-self._search.search_variables, index=kwargs.get('index', self.variables_index), concept=kwargs.pop('concept', None)),
+self._search.search_variables, concept=kwargs.pop('concept', None)),
'kg': partial(
-self._search.search_kg, index=kwargs.get('index', self.kg_index), unique_id=kwargs.pop('unique_id', None))
+self._search.search_kg, unique_id=kwargs.pop('unique_id', None))
}
kwargs.pop('index', None)
func = targets.get(target)