Skip to content

Commit

Permalink
Merge pull request #256 from VariantEffect/feature/bencap/251/variant-mapping-script
Browse files Browse the repository at this point in the history

Variant Mapping Script (and script environment improvements)
  • Loading branch information
bencap authored Nov 21, 2024
2 parents 0dc3ca4 + c6c9d4c commit a183e27
Show file tree
Hide file tree
Showing 5 changed files with 403 additions and 91 deletions.
25 changes: 0 additions & 25 deletions src/mavedb/lib/script_environment.py

This file was deleted.

4 changes: 2 additions & 2 deletions src/mavedb/models/mapped_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ class MappedVariant(Base):

id = Column(Integer, primary_key=True)

pre_mapped = Column(JSONB, nullable=True)
post_mapped = Column(JSONB, nullable=True)
pre_mapped = Column(JSONB(none_as_null=True), nullable=True)
post_mapped = Column(JSONB(none_as_null=True), nullable=True)
vrs_version = Column(String, nullable=True)
error_message = Column(String, nullable=True)
modification_date = Column(Date, nullable=False, default=date.today, onupdate=date.today)
Expand Down
160 changes: 160 additions & 0 deletions src/mavedb/scripts/environment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""
Environment setup for scripts.
"""

import enum
import logging
import click
from functools import wraps


from sqlalchemy.orm import configure_mappers

from mavedb import deps
from mavedb.models import * # noqa: F403


logger = logging.getLogger(__name__)


@enum.unique
class DatabaseSessionAction(enum.Enum):
    """
    Transaction disposition selected via the CLI options that
    :py:func:`.with_database_session` adds to a command (``--dry-run``,
    ``--prompt``, ``--commit``).

    Only relevant to callers that pass ``pass_action=True`` to
    :py:func:`.with_database_session`; otherwise this enum is internal.
    """

    # Go through the motions, then roll everything back (the default option).
    DRY_RUN = "rollback"
    # Ask interactively at the end of the run whether to commit.
    PROMPT = "prompt"
    # Persist all changes unconditionally.
    COMMIT = "commit"


@click.group()
def script_environment():
    """
    Set up the environment for a script that may be run from the command line and does not necessarily depend on the
    FastAPI framework.

    Features:
    - Configures logging for the script.
    - Loads the SQLAlchemy data model.
    """

    logging.basicConfig()

    # Show this script's own INFO-level messages by default.
    logging.getLogger("__main__").setLevel(logging.INFO)
    # Un-comment the next line to log all database queries:
    # logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)

    # Scan all our model classes and create backref attributes. Otherwise, these attributes only get added to classes
    # once an instance of the related class has been created.
    configure_mappers()


def with_database_session(command=None, *, pass_action: bool = False):
    """
    Decorator to provide a database session and error handling for a *command*.

    The *command* callable must be a :py:class:`click.Command` instance.

    The decorated *command* is called with a ``db`` keyword argument providing
    a SQLAlchemy session obtained from :py:func:`mavedb.deps.get_db`. The call
    happens within an exception handler that commits or rolls back the
    database transaction, possibly interactively. Three new options are added
    to the *command* (``--dry-run``, ``--prompt``, and ``--commit``) to
    control this behaviour; ``--dry-run`` is the default.

    >>> @click.command
    ... @with_database_session
    ... def cmd(db):
    ...     pass

    If the optional, keyword-only argument *pass_action* is ``True``, then the
    :py:class:`.DatabaseSessionAction` selected by the CLI options above is
    passed as an additional ``action`` argument to the decorated *command*.

    >>> @click.command
    ... @with_database_session(pass_action = True)
    ... def cmd(db, action: DatabaseSessionAction):
    ...     pass

    One example where this is useful is when the *command* accesses
    non-database resources and wants to extend dry run mode to them as well.
    """

    def decorator(command):
        # All three flags store into the single ``action`` parameter; with no
        # flag given, ``default=True`` on --dry-run makes rollback the default.
        @click.option(
            "--dry-run",
            "action",
            help="Only go through the motions of changing the database (default)",
            flag_value=DatabaseSessionAction("rollback"),
            type=DatabaseSessionAction,
            default=True,
        )
        @click.option(
            "--prompt",
            "action",
            help="Ask if changes to the database should be saved",
            flag_value=DatabaseSessionAction("prompt"),
            type=DatabaseSessionAction,
        )
        @click.option(
            "--commit",
            "action",
            help="Save changes to the database",
            flag_value=DatabaseSessionAction("commit"),
            type=DatabaseSessionAction,
        )
        @wraps(command)
        def decorated(*args, action, **kwargs):
            # NOTE(review): deps.get_db() appears to be a generator dependency;
            # taking only next() leaves its cleanup to garbage collection —
            # confirm the session does not need an explicit close() here.
            db = next(deps.get_db())

            kwargs["db"] = db

            if pass_action:
                kwargs["action"] = action

            # Tracks whether the wrapped command finished without raising; read
            # in the ``finally`` block to choose prompt wording and log message.
            processed_without_error = None

            try:
                command(*args, **kwargs)

            except Exception as error:
                processed_without_error = False

                logger.error(f"Aborting with error: {error}")
                # ``from None`` suppresses the implicit exception-chaining
                # context on the re-raise.
                raise error from None

            else:
                processed_without_error = True

            finally:
                if action is DatabaseSessionAction.PROMPT:
                    ask_to_commit = (
                        "Commit all changes?"
                        if processed_without_error
                        else "Commit successfully processed records up to this point?"
                    )

                    commit = click.confirm(ask_to_commit)
                else:
                    commit = action is DatabaseSessionAction.COMMIT

                if commit:
                    logger.info(
                        "Committing all changes"
                        if processed_without_error
                        else "Committing successfully processed records up to this point"
                    )
                    db.commit()

                else:
                    logger.info("Rolling back all changes; the database will not be modified")
                    db.rollback()

        return decorated

    # Support both bare ``@with_database_session`` usage (command is the
    # decorated callable) and parameterized ``@with_database_session(...)``.
    return decorator(command) if command else decorator
132 changes: 68 additions & 64 deletions src/mavedb/scripts/export_public_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,16 @@

from fastapi.encoders import jsonable_encoder
from sqlalchemy import select
from sqlalchemy.orm import lazyload
from sqlalchemy.orm import lazyload, Session

from mavedb.lib.score_sets import get_score_set_counts_as_csv, get_score_set_scores_as_csv
from mavedb.lib.script_environment import init_script_environment
from mavedb.models.experiment import Experiment
from mavedb.models.experiment_set import ExperimentSet
from mavedb.models.license import License
from mavedb.models.score_set import ScoreSet
from mavedb.view_models.experiment_set import ExperimentSetPublicDump

db = init_script_environment()
from mavedb.scripts.environment import script_environment, with_database_session

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -89,68 +88,73 @@ def flatmap(f: Callable[[S], Iterable[T]], items: Iterable[S]) -> Iterable[T]:
return chain.from_iterable(map(f, items))


logger.info("Fetching data sets")

experiment_sets_query = db.scalars(
select(ExperimentSet)
.where(ExperimentSet.published_date.is_not(None))
.options(
lazyload(ExperimentSet.experiments.and_(Experiment.published_date.is_not(None))).options(
lazyload(
Experiment.score_sets.and_(
ScoreSet.published_date.is_not(None), ScoreSet.license.has(License.short_name == "CC0")
@script_environment.command()
@with_database_session
def export_public_data(db: Session):
experiment_sets_query = db.scalars(
select(ExperimentSet)
.where(ExperimentSet.published_date.is_not(None))
.options(
lazyload(ExperimentSet.experiments.and_(Experiment.published_date.is_not(None))).options(
lazyload(
Experiment.score_sets.and_(
ScoreSet.published_date.is_not(None), ScoreSet.license.has(License.short_name == "CC0")
)
)
)
)
.execution_options(populate_existing=True)
.order_by(ExperimentSet.urn)
)

# Filter the stream of experiment sets to exclude experiments and experiment sets with no public, CC0-licensed score
# sets.
experiment_sets = list(filter_experiment_sets(experiment_sets_query.all()))

# TODO To support very large data sets, we may want to use custom code for JSON-encoding an iterator.
# Issue: https://github.com/VariantEffect/mavedb-api/issues/192
# See, for instance, https://stackoverflow.com/questions/12670395/json-encoding-very-long-iterators.

experiment_set_views = list(map(lambda es: ExperimentSetPublicDump.from_orm(es), experiment_sets))

# Get a list of IDS of all the score sets included.
score_set_ids = list(
flatmap(lambda es: flatmap(lambda e: map(lambda ss: ss.id, e.score_sets), es.experiments), experiment_sets)
)
.execution_options(populate_existing=True)
.order_by(ExperimentSet.urn)
)

# Filter the stream of experiment sets to exclude experiments and experiment sets with no public, CC0-licensed score
# sets.
experiment_sets = list(filter_experiment_sets(experiment_sets_query.all()))

# TODO To support very large data sets, we may want to use custom code for JSON-encoding an iterator.
# Issue: https://github.com/VariantEffect/mavedb-api/issues/192
# See, for instance, https://stackoverflow.com/questions/12670395/json-encoding-very-long-iterators.

experiment_set_views = list(map(lambda es: ExperimentSetPublicDump.from_orm(es), experiment_sets))

# Get a list of IDS of all the score sets included.
score_set_ids = list(
flatmap(lambda es: flatmap(lambda e: map(lambda ss: ss.id, e.score_sets), es.experiments), experiment_sets)
)

timestamp_format = "%Y%m%d%H%M%S"
zip_file_name = f"mavedb-dump.{datetime.now().strftime(timestamp_format)}.zip"

logger.info(f"Exporting public data set metadata to {zip_file_name}/main.json")
json_data = {
"title": "MaveDB public data",
"asOf": datetime.now(timezone.utc).isoformat(),
"experimentSets": experiment_set_views,
}

with ZipFile(zip_file_name, "w") as zipfile:
# Write metadata for all data sets to a single JSON file.
zipfile.writestr("main.json", json.dumps(jsonable_encoder(json_data)))

# Copy the CC0 license.
zipfile.write(os.path.join(os.path.dirname(__file__), "resources/CC0_license.txt"), "LICENSE.txt")

# Write score and count files for each score set.
num_score_sets = len(score_set_ids)
for i, score_set_id in enumerate(score_set_ids):
score_set = db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one_or_none()
if score_set is not None and score_set.urn is not None:
logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}")
csv_filename_base = score_set.urn.replace(":", "-")

csv_str = get_score_set_scores_as_csv(db, score_set)
zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str)

count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None
if count_columns and len(count_columns) > 0:
csv_str = get_score_set_counts_as_csv(db, score_set)
zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str)

timestamp_format = "%Y%m%d%H%M%S"
zip_file_name = f"mavedb-dump.{datetime.now().strftime(timestamp_format)}.zip"

logger.info(f"Exporting public data set metadata to {zip_file_name}/main.json")
json_data = {
"title": "MaveDB public data",
"asOf": datetime.now(timezone.utc).isoformat(),
"experimentSets": experiment_set_views,
}

with ZipFile(zip_file_name, "w") as zipfile:
# Write metadata for all data sets to a single JSON file.
zipfile.writestr("main.json", json.dumps(jsonable_encoder(json_data)))

# Copy the CC0 license.
zipfile.write(os.path.join(os.path.dirname(__file__), "resources/CC0_license.txt"), "LICENSE.txt")

# Write score and count files for each score set.
num_score_sets = len(score_set_ids)
for i, score_set_id in enumerate(score_set_ids):
score_set = db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one_or_none()
if score_set is not None and score_set.urn is not None:
logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}")
csv_filename_base = score_set.urn.replace(":", "-")

csv_str = get_score_set_scores_as_csv(db, score_set)
zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str)

count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None
if count_columns and len(count_columns) > 0:
csv_str = get_score_set_counts_as_csv(db, score_set)
zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str)


if __name__ == "__main__":
export_public_data()
Loading

0 comments on commit a183e27

Please sign in to comment.