Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nick/89 efficient search #343

Draft
wants to merge 11 commits into
base: main
Choose a base branch
from
Prev Previous commit
Next Next commit
Merge branch 'main' into nick/89-efficient-search
  • Loading branch information
nickzoic committed Oct 17, 2024
commit 701cf199f22045a6cf199a45e40166decfb2f7e1
5 changes: 1 addition & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -84,16 +84,13 @@ requests-mock = "~1.11.0"
ruff = "^0.6.8"
SQLAlchemy = { extras = ["mypy"], version = "~2.0.0" }


[tool.poetry.extras]
server = ["alembic", "alembic-utils", "arq", "authlib", "boto3", "cryptography", "fastapi", "email-validator", "orcid", "psycopg2", "python-jose", "python-multipart", "requests", "slack-sdk", "uvicorn", "watchtower"]

server = ["alembic", "arq", "authlib", "biocommons", "boto3", "cdot", "cryptography", "fastapi", "hgvs", "orcid", "psycopg2", "python-jose", "python-multipart", "requests", "starlette", "starlette-context", "slack-sdk", "uvicorn", "watchtower"]

[tool.black]
extend-exclude = "alembic/versions"
line-length = 120


[tool.mypy]
plugins = [
"sqlalchemy.ext.mypy.plugin",
3 changes: 0 additions & 3 deletions src/mavedb/lib/score_sets.py
Original file line number Diff line number Diff line change
@@ -37,9 +37,6 @@
from mavedb.models.score_set_publication_identifier import (
ScoreSetPublicationIdentifierAssociation,
)
from mavedb.models.refseq_offset import RefseqOffset
from mavedb.models.refseq_identifier import RefseqIdentifier
from mavedb.models.score_set import ScoreSet
from mavedb.models.score_set_fulltext import scoreset_fulltext_filter
from mavedb.models.target_accession import TargetAccession
from mavedb.models.target_gene import TargetGene
6 changes: 3 additions & 3 deletions src/mavedb/models/score_set.py
Original file line number Diff line number Diff line change
@@ -3,7 +3,9 @@

from sqlalchemy import Boolean, Column, Date, Enum, ForeignKey, Integer, String
from sqlalchemy.dialects.postgresql import JSONB
import logging
from sqlalchemy.ext.associationproxy import AssociationProxy, association_proxy
from sqlalchemy.orm import Mapped, relationship
from sqlalchemy.schema import Table

import mavedb.models.score_set_publication_identifier
from mavedb.db.base import Base
@@ -26,8 +28,6 @@

# TODO Reformat code without removing dependencies whose use is not detected.

logger = logging.getLogger(__name__)

score_sets_contributors_association_table = Table(
"scoreset_contributors",
Base.metadata,
78 changes: 78 additions & 0 deletions src/mavedb/models/score_set_fulltext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import logging

from sqlalchemy import text
from mavedb.models.score_set import ScoreSet
from alembic_utils.pg_materialized_view import PGMaterializedView

logger = logging.getLogger(__name__)

# TODO(#94): add LICENSE, plus TAX_ID if numeric
# TODO(#89): The query below should be generated from SQLAlchemy
# models rather than hand-carved SQL

# Materialized view backing full-text search over score sets.
#
# The definition is a SQL `union` of many small selects. Every select yields
# the same two columns: `scoreset_id` and `text`, where `text` is the
# `to_tsvector(...)` of a single source column. A score set therefore gets one
# row per searchable field, and a search only has to match against `text` and
# collect the distinct `scoreset_id` values.
_scoreset_fulltext_view = PGMaterializedView(
schema="public",
signature="scoreset_fulltext",
definition=' union ' .join(
[
# Columns that live directly on the scoresets table (its `id` is exposed
# as `scoreset_id`).
f"select id as scoreset_id, to_tsvector({c}) as text from scoresets"
for c in ('urn', 'title', 'short_description', 'abstract_text')
] + [
# Columns on target_genes, which carries `scoreset_id` itself.
f"select scoreset_id, to_tsvector({c}) as text from target_genes"
for c in ('name', 'category')
] + [
# Taxonomy names, reached via target_genes -> target_sequences -> taxonomies.
f"select scoreset_id, to_tsvector(TX.{c}) as text from target_genes TG join target_sequences TS on \
(TG.target_sequence_id = TS.id) join taxonomies TX on (TS.taxonomy_id = TX.id)"
for c in ('organism_name', 'common_name')
] + [
# Assembly of the target accession, reached via target_genes -> target_accessions.
"select scoreset_id, to_tsvector(TA.assembly) as text from target_genes TG join target_accessions TA on \
(TG.accession_id = TA.id)"
] + [
# Publication metadata, reached through the scoreset/publication association table.
f"select scoreset_id, to_tsvector(PI.{c}) as text from scoreset_publication_identifiers SPI JOIN \
publication_identifiers PI ON (SPI.publication_identifier_id = PI.id)"
for c in ('identifier', 'doi', 'abstract', 'title', 'publication_journal')
] + [
# Publication author names: `authors` is expanded with jsonb_array_elements
# and the 'name' key of each element is indexed.
"select scoreset_id, to_tsvector(jsonb_array_elements(authors)->'name') as text from \
scoreset_publication_identifiers SPI join publication_identifiers PI on \
SPI.publication_identifier_id = PI.id",
] + [
# DOI identifiers, reached through the scoreset/DOI association table.
"select scoreset_id, to_tsvector(DI.identifier) as text from scoreset_doi_identifiers SD join \
doi_identifiers DI on (SD.doi_identifier_id = DI.id)",
] + [
# External identifiers: the same join shape is repeated for each of the
# uniprot, refseq and ensembl `<x>_offsets` / `<x>_identifiers` table pairs.
f"select scoreset_id, to_tsvector(XI.identifier) as text from target_genes TG join {x}_offsets XO on \
(XO.target_gene_id = TG.id) join {x}_identifiers XI on (XI.id = XO.identifier_id)"
for x in ('uniprot', 'refseq', 'ensembl')
]
),
# NOTE(review): presumably makes the view populated at creation time rather
# than left empty until the first refresh; confirm against alembic_utils docs.
with_data=True
)


def scoreset_fulltext_create(session):
    """Create the score set full-text materialized view and commit.

    Executes the statement returned by the view's
    ``to_sql_statement_create()`` on ``session``, then commits. A message is
    logged at WARNING level before the statement is built and again after the
    commit.

    Args:
        session: Database session used to execute the statement and commit.
    """
    view_name = _scoreset_fulltext_view.signature
    logger.warning("Creating %s", view_name)

    create_statement = _scoreset_fulltext_view.to_sql_statement_create()
    session.execute(create_statement)
    session.commit()

    logger.warning("Created %s", view_name)


def scoreset_fulltext_destroy(session):
    """Drop the score set full-text materialized view and commit.

    Executes the statement returned by the view's ``to_sql_statement_drop()``
    on ``session``, then commits. A message is logged at WARNING level before
    the statement is built and again after the commit.

    Args:
        session: Database session used to execute the statement and commit.
    """
    view_name = _scoreset_fulltext_view.signature
    logger.warning("Destroying %s", view_name)

    drop_statement = _scoreset_fulltext_view.to_sql_statement_drop()
    session.execute(drop_statement)
    session.commit()

    logger.warning("Destroyed %s", view_name)


def scoreset_fulltext_refresh(session):
    """Refresh the score set full-text materialized view and commit.

    Issues ``refresh materialized view <signature>`` on ``session`` so the
    view is recomputed, then commits.

    Args:
        session: Database session used to execute the statement and commit.
    """
    refresh_sql = f'refresh materialized view {_scoreset_fulltext_view.signature}'
    refresh_statement = text(refresh_sql)
    session.execute(refresh_statement)
    session.commit()


def scoreset_fulltext_filter(query, string):
    """Restrict ``query`` to score sets whose indexed text matches ``string``.

    Builds a textual subquery that selects the distinct ``scoreset_id`` values
    from the full-text view whose ``text`` column matches
    ``websearch_to_tsquery(:text)``, and filters ``query`` to rows whose
    ``ScoreSet.id`` is in that subquery.

    Args:
        query: Query to narrow; must support ``.filter(...)``.
        string: Search string. It is supplied as the ``:text`` bound
            parameter, not interpolated into the SQL.

    Returns:
        The result of ``query.filter(...)`` with the membership condition
        applied.
    """
    matching_ids = text(
        f"select distinct scoreset_id from {_scoreset_fulltext_view.signature} "
        "where text @@ websearch_to_tsquery(:text)"
    ).params(text=string)
    id_is_match = ScoreSet.id.in_(matching_ids)
    return query.filter(id_is_match)
1 change: 0 additions & 1 deletion src/mavedb/routers/score_sets.py
Original file line number Diff line number Diff line change
@@ -53,7 +53,6 @@
from mavedb.models.mapped_variant import MappedVariant
from mavedb.models.score_set import ScoreSet
from mavedb.models.score_set_fulltext import scoreset_fulltext_refresh
from mavedb.models.target_gene import TargetGene
from mavedb.models.target_accession import TargetAccession
from mavedb.models.target_gene import TargetGene
from mavedb.models.target_sequence import TargetSequence
8 changes: 4 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -55,13 +55,13 @@ def session(postgresql):
session = sessionmaker(autocommit=False, autoflush=False, bind=engine)()

Base.metadata.create_all(bind=engine)
sesh = session()
scoreset_fulltext_create(session)

try:
scoreset_fulltext_create(sesh)
yield sesh
yield session
finally:
scoreset_fulltext_destroy(sesh)
scoreset_fulltext_destroy(session)
session.close()
Base.metadata.drop_all(bind=engine)


You are viewing a condensed version of this merge commit. You can view the full changes here.