Merge pull request #334 from helxplatform/feat/165-sapbert-annotator
Feat/165 sapbert annotator
YaphetKG authored Jan 3, 2024
2 parents 2b09941 + ceece16 commit 1c42441
Showing 34 changed files with 2,459 additions and 1,190 deletions.
11 changes: 5 additions & 6 deletions .github/workflows/code-checks.yml
@@ -42,7 +42,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
-python-version: '3.10'
+python-version: '3.12'

# Currently actions/setup-python supports caching
# but the cache is not as robust as cache action.
@@ -113,7 +113,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
-python-version: '3.10'
+python-version: '3.12'

- name: Install Requirements
run: |
@@ -123,8 +123,7 @@
- name: Test with pytest
run: |
-pytest --doctest-modules src
-coverage run -m pytest tests/unit
+make test
############################ Bandit ################################
bandit:
@@ -134,7 +133,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
-python-version: '3.10'
+python-version: '3.12'

- name: Install Requirements
run: |
@@ -145,4 +144,4 @@ jobs:
# Only report high security issues
- name: Test with Bandit
run: |
-bandit -r src -n3 -lll
+bandit -r src -n3 -lll
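The test job now delegates to the repository's Makefile instead of calling pytest directly. A minimal local sketch of the same checks, assuming the dev dependencies from requirements.txt are installed:

```shell
# Same entry point the workflow now uses (see the Makefile change below)
make test
# High-severity-only Bandit scan, mirroring the workflow step above
bandit -r src -n3 -lll
```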
1 change: 1 addition & 0 deletions .github/workflows/trivy-pr-scan.yml
@@ -55,6 +55,7 @@ jobs:
image-ref: '${{ github.repository }}:vuln-test'
format: 'sarif'
severity: 'CRITICAL,HIGH'
+ignore-unfixed: true
output: 'trivy-results.sarif'
exit-code: '1'
# Scan results should be viewable in GitHub Security Dashboard
10 changes: 5 additions & 5 deletions Dockerfile
@@ -3,19 +3,19 @@
# A container for the core semantic-search capability.
#
######################################################
-FROM python:3.10.10-slim
+FROM python:3.12.0-alpine3.18

# Install required packages
-RUN apt-get update && \
-apt-get install -y curl make vim && \
-rm -rf /var/cache/apt/*
+RUN apk update && \
+apk add g++ make

+RUN pip install --upgrade pip
# Create a non-root user.
ENV USER dug
ENV HOME /home/$USER
ENV UID 1000

-RUN adduser --disabled-login --home $HOME --shell /bin/bash --uid $UID $USER
+RUN adduser -D --home $HOME --uid $UID $USER

USER $USER
WORKDIR $HOME
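The base image switches from Debian slim to Alpine, so apt-get becomes apk and the adduser flags change to their BusyBox equivalents. A sketch of building the updated image locally; the tag `dug:local` is only illustrative:

```shell
# Build the Alpine-based image from the repository root (tag name is hypothetical)
docker build -t dug:local .
```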
2 changes: 0 additions & 2 deletions Makefile
@@ -40,8 +40,6 @@ install.dug:

#test: Run all tests
test:
-# ${PYTHON} -m flake8 src
-${PYTHON} -m pytest --doctest-modules src
coverage run -m pytest tests

coverage:
6 changes: 3 additions & 3 deletions README.md
@@ -57,13 +57,13 @@ dug crawl tests/integration/data/test_variables_v1.0.csv -p "TOPMedTag"

After crawling, you can search:
```shell
dug search -q "heart attack" -t "concepts"
dug search -q "heart attack" -t "variables" -k "concept=MONDO:0005068"
dug search -q "vein" -t "concepts"
dug search -q "vein" -t "variables" -k "concept=UBERON:0001638"
```

You can also query Dug's REST API:
```shell
query="`echo '{"index" : "concepts_index", "query" : "heart attack"}'`"
query="`echo '{"index" : "concepts_index", "query" : "vein"}'`"
curl --data "$query" \
--header "Content-Type: application/json" \
29 changes: 14 additions & 15 deletions requirements.txt
@@ -1,30 +1,29 @@
aiohttp
asyncio
-fastapi==0.95.0
-uvicorn==0.23.2
+fastapi
+uvicorn
elasticsearch[async]==8.5.2
gunicorn
itsdangerous
Jinja2
jsonschema
MarkupSafe
-ormar==0.12.1
-mistune==2.0.3
-pluggy==1.0.0
-pyrsistent==0.17.3
+ormar
+mistune
+pluggy
+pyrsistent
pytest
-pytz==2021.1
-PyYAML==6.0
-requests==2.31.0
-# old redis==4.4.2
-redis==4.5.1
-requests-cache==0.9.8
-six==1.16.0
+pytz
+PyYAML
+requests
+redis
+requests-cache
+six

# Click for command line arguments
# We use Click 7.0 because that's what one of the pinned packages above use.
click
-httpx>=0.24.1
+httpx
linkml-runtime==1.6.0
bmt==1.1.0
-urllib3>=1.26.17
+urllib3
10 changes: 5 additions & 5 deletions setup.cfg
@@ -17,23 +17,23 @@ classifiers =
package_dir =
= src
packages = find:
-python_requires = >=3.10
+python_requires = >=3.12
include_package_data = true
install_requires =
elasticsearch==8.5.2
pluggy
requests
-requests_cache==0.9.8
-redis==4.5.1
+requests_cache
+redis

[options.entry_points]
console_scripts =
dug = dug.cli:main

[options.extras_require]
rest =
-fastapi==0.95.0
-uvicorn==0.23.2
+fastapi
+uvicorn
gunicorn
jsonschema

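With the version pins dropped, fastapi and uvicorn now resolve to the newest compatible releases at install time. A sketch of an editable install that pulls in the `rest` extra defined above:

```shell
# Editable install including the REST extra from setup.cfg
pip install -e ".[rest]"
```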
9 changes: 8 additions & 1 deletion src/dug/cli.py
@@ -51,6 +51,13 @@ def get_argparser():
required=True
)

+crawl_parser.add_argument(
+'-a', '--annotator',
+help='Annotator used to annotate identifiers in crawl file',
+dest="annotator_type",
+default="annotator-monarch"
+)

crawl_parser.add_argument(
'-e', '--element-type',
help='[Optional] Coerce all elements to a certain data type (e.g. DbGaP Variable).\n'
@@ -108,7 +115,7 @@ def crawl(args):
config.node_to_element_queries = {}
factory = DugFactory(config)
dug = Dug(factory)
-dug.crawl(args.target, args.parser_type, args.element_type)
+dug.crawl(args.target, args.parser_type, args.annotator_type, args.element_type)


def search(args):
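The new `-a/--annotator` option is passed through to `Dug.crawl` as `annotator_type` and defaults to `annotator-monarch`. A usage sketch based on the crawl example in the README; the identifier for the new SapBERT annotator is not shown in this excerpt, so only the default value appears here:

```shell
# Crawl with an explicit annotator (equivalent to omitting -a, which uses the default)
dug crawl tests/integration/data/test_variables_v1.0.csv -p "TOPMedTag" -a "annotator-monarch"
```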
18 changes: 11 additions & 7 deletions src/dug/core/__init__.py
@@ -12,8 +12,10 @@

from dug import hookspecs
from dug.core import parsers
+from dug.core import annotators
from dug.core.factory import DugFactory
from dug.core.parsers import DugConcept, Parser, get_parser
+from dug.core.annotators import DugIdentifier, Annotator, get_annotator

logger = logging.getLogger('dug')
stdout_log_handler = logging.StreamHandler(sys.stdout)
@@ -29,6 +31,7 @@ def get_plugin_manager() -> pluggy.PluginManager:
pm.add_hookspecs(hookspecs)
pm.load_setuptools_entrypoints("dug")
pm.register(parsers)
+pm.register(annotators)
return pm


@@ -56,19 +59,20 @@ def __init__(self, factory: DugFactory):
]
)

-def crawl(self, target_name: str, parser_type: str, element_type: str = None):
+def crawl(self, target_name: str, parser_type: str, annotator_type: str, element_type: str = None):

pm = get_plugin_manager()
parser = get_parser(pm.hook, parser_type)
+annotator = get_annotator(pm.hook, annotator_type)
targets = get_targets(target_name)

for target in targets:
-self._crawl(target, parser, element_type)
+self._crawl(target, parser, annotator, element_type)

-def _crawl(self, target: Path, parser: Parser, element_type):
+def _crawl(self, target: Path, parser: Parser, annotator: Annotator, element_type):

# Initialize crawler
-crawler = self._factory.build_crawler(target, parser, element_type)
+crawler = self._factory.build_crawler(target, parser, annotator, element_type)
# Read elements, annotate, and expand using tranql queries
crawler.crawl()

@@ -93,11 +97,11 @@ def search(self, target, query, **kwargs):
event_loop = asyncio.get_event_loop()
targets = {
'concepts': partial(
-self._search.search_concepts, index=kwargs.get('index', self.concepts_index)),
+self._search.search_concepts),
'variables': partial(
-self._search.search_variables, index=kwargs.get('index', self.variables_index), concept=kwargs.pop('concept', None)),
+self._search.search_variables, concept=kwargs.pop('concept', None)),
'kg': partial(
-self._search.search_kg, index=kwargs.get('index', self.kg_index), unique_id=kwargs.pop('unique_id', None))
+self._search.search_kg, unique_id=kwargs.pop('unique_id', None))
}
kwargs.pop('index', None)
func = targets.get(target)