Skip to content

Commit

Permalink
Merge pull request #256 from VariantEffect/feature/bencap/251/variant-mapping-script
Browse files Browse the repository at this point in the history

Variant Mapping Script (and script environment improvements)
  • Loading branch information
bencap authored Nov 21, 2024
2 parents 0dc3ca4 + c6c9d4c commit a183e27
Show file tree
Hide file tree
Showing 5 changed files with 403 additions and 91 deletions.
25 changes: 0 additions & 25 deletions src/mavedb/lib/script_environment.py

This file was deleted.

4 changes: 2 additions & 2 deletions src/mavedb/models/mapped_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ class MappedVariant(Base):

id = Column(Integer, primary_key=True)

pre_mapped = Column(JSONB, nullable=True)
post_mapped = Column(JSONB, nullable=True)
pre_mapped = Column(JSONB(none_as_null=True), nullable=True)
post_mapped = Column(JSONB(none_as_null=True), nullable=True)
vrs_version = Column(String, nullable=True)
error_message = Column(String, nullable=True)
modification_date = Column(Date, nullable=False, default=date.today, onupdate=date.today)
Expand Down
160 changes: 160 additions & 0 deletions src/mavedb/scripts/environment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""
Environment setup for scripts.
"""

import enum
import logging
import click
from functools import wraps


from sqlalchemy.orm import configure_mappers

from mavedb import deps
from mavedb.models import * # noqa: F403


logger = logging.getLogger(__name__)


@enum.unique
class DatabaseSessionAction(enum.Enum):
    """
    Transaction disposition selected via the CLI options that
    :py:func:`.with_database_session` adds to a command (``--dry-run``,
    ``--prompt``, ``--commit``).

    Only relevant to callers that pass ``pass_action=True`` to
    :py:func:`.with_database_session`; otherwise this enum is internal.
    """

    # Go through the motions, then roll everything back (the default option).
    DRY_RUN = "rollback"
    # Ask interactively at the end of the run whether to commit.
    PROMPT = "prompt"
    # Persist all changes unconditionally.
    COMMIT = "commit"


@click.group()
def script_environment():
    """
    Set up the environment for a script that may be run from the command line and does not necessarily depend on the
    FastAPI framework.

    Features:
    - Configures logging for the script.
    - Loads the SQLAlchemy data model.
    """

    logging.basicConfig()

    # Show this script's own INFO-level messages by default.
    logging.getLogger("__main__").setLevel(logging.INFO)
    # Un-comment the next line to log all database queries:
    # logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)

    # Scan all our model classes and create backref attributes. Otherwise, these attributes only get added to classes
    # once an instance of the related class has been created.
    configure_mappers()


def with_database_session(command=None, *, pass_action: bool = False):
    """
    Decorator to provide a database session and error handling for a *command*.

    The *command* callable must be a :py:class:`click.Command` instance.

    The decorated *command* is called with a ``db`` keyword argument providing
    a SQLAlchemy session obtained from :py:func:`mavedb.deps.get_db`. The call
    happens within an exception handler that commits or rolls back the
    database transaction, possibly interactively. Three new options are added
    to the *command* (``--dry-run``, ``--prompt``, and ``--commit``) to
    control this behaviour; ``--dry-run`` is the default.

    >>> @click.command
    ... @with_database_session
    ... def cmd(db):
    ...     pass

    If the optional, keyword-only argument *pass_action* is ``True``, then the
    :py:class:`.DatabaseSessionAction` selected by the CLI options above is
    passed as an additional ``action`` argument to the decorated *command*.

    >>> @click.command
    ... @with_database_session(pass_action = True)
    ... def cmd(db, action: DatabaseSessionAction):
    ...     pass

    One example where this is useful is when the *command* accesses
    non-database resources and wants to extend dry run mode to them as well.
    """

    def decorator(command):
        # All three flags store into the single ``action`` parameter; with no
        # flag given, ``default=True`` on --dry-run makes rollback the default.
        @click.option(
            "--dry-run",
            "action",
            help="Only go through the motions of changing the database (default)",
            flag_value=DatabaseSessionAction("rollback"),
            type=DatabaseSessionAction,
            default=True,
        )
        @click.option(
            "--prompt",
            "action",
            help="Ask if changes to the database should be saved",
            flag_value=DatabaseSessionAction("prompt"),
            type=DatabaseSessionAction,
        )
        @click.option(
            "--commit",
            "action",
            help="Save changes to the database",
            flag_value=DatabaseSessionAction("commit"),
            type=DatabaseSessionAction,
        )
        @wraps(command)
        def decorated(*args, action, **kwargs):
            # NOTE(review): deps.get_db() appears to be a generator dependency;
            # taking only next() leaves its cleanup to garbage collection —
            # confirm the session does not need an explicit close() here.
            db = next(deps.get_db())

            kwargs["db"] = db

            if pass_action:
                kwargs["action"] = action

            # Tracks whether the wrapped command finished without raising; read
            # in the ``finally`` block to choose prompt wording and log message.
            processed_without_error = None

            try:
                command(*args, **kwargs)

            except Exception as error:
                processed_without_error = False

                logger.error(f"Aborting with error: {error}")
                # ``from None`` suppresses the implicit exception-chaining
                # context on the re-raise.
                raise error from None

            else:
                processed_without_error = True

            finally:
                if action is DatabaseSessionAction.PROMPT:
                    ask_to_commit = (
                        "Commit all changes?"
                        if processed_without_error
                        else "Commit successfully processed records up to this point?"
                    )

                    commit = click.confirm(ask_to_commit)
                else:
                    commit = action is DatabaseSessionAction.COMMIT

                if commit:
                    logger.info(
                        "Committing all changes"
                        if processed_without_error
                        else "Committing successfully processed records up to this point"
                    )
                    db.commit()

                else:
                    logger.info("Rolling back all changes; the database will not be modified")
                    db.rollback()

        return decorated

    # Support both bare ``@with_database_session`` usage (command is the
    # decorated callable) and parameterized ``@with_database_session(...)``.
    return decorator(command) if command else decorator
132 changes: 68 additions & 64 deletions src/mavedb/scripts/export_public_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,16 @@

from fastapi.encoders import jsonable_encoder
from sqlalchemy import select
from sqlalchemy.orm import lazyload
from sqlalchemy.orm import lazyload, Session

from mavedb.lib.score_sets import get_score_set_counts_as_csv, get_score_set_scores_as_csv
from mavedb.lib.script_environment import init_script_environment
from mavedb.models.experiment import Experiment
from mavedb.models.experiment_set import ExperimentSet
from mavedb.models.license import License
from mavedb.models.score_set import ScoreSet
from mavedb.view_models.experiment_set import ExperimentSetPublicDump

db = init_script_environment()
from mavedb.scripts.environment import script_environment, with_database_session

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -89,68 +88,73 @@ def flatmap(f: Callable[[S], Iterable[T]], items: Iterable[S]) -> Iterable[T]:
return chain.from_iterable(map(f, items))


logger.info("Fetching data sets")

experiment_sets_query = db.scalars(
select(ExperimentSet)
.where(ExperimentSet.published_date.is_not(None))
.options(
lazyload(ExperimentSet.experiments.and_(Experiment.published_date.is_not(None))).options(
lazyload(
Experiment.score_sets.and_(
ScoreSet.published_date.is_not(None), ScoreSet.license.has(License.short_name == "CC0")
@script_environment.command()
@with_database_session
def export_public_data(db: Session):
experiment_sets_query = db.scalars(
select(ExperimentSet)
.where(ExperimentSet.published_date.is_not(None))
.options(
lazyload(ExperimentSet.experiments.and_(Experiment.published_date.is_not(None))).options(
lazyload(
Experiment.score_sets.and_(
ScoreSet.published_date.is_not(None), ScoreSet.license.has(License.short_name == "CC0")
)
)
)
)
.execution_options(populate_existing=True)
.order_by(ExperimentSet.urn)
)

# Filter the stream of experiment sets to exclude experiments and experiment sets with no public, CC0-licensed score
# sets.
experiment_sets = list(filter_experiment_sets(experiment_sets_query.all()))

# TODO To support very large data sets, we may want to use custom code for JSON-encoding an iterator.
# Issue: https://github.com/VariantEffect/mavedb-api/issues/192
# See, for instance, https://stackoverflow.com/questions/12670395/json-encoding-very-long-iterators.

experiment_set_views = list(map(lambda es: ExperimentSetPublicDump.from_orm(es), experiment_sets))

# Get a list of IDS of all the score sets included.
score_set_ids = list(
flatmap(lambda es: flatmap(lambda e: map(lambda ss: ss.id, e.score_sets), es.experiments), experiment_sets)
)
.execution_options(populate_existing=True)
.order_by(ExperimentSet.urn)
)

# Filter the stream of experiment sets to exclude experiments and experiment sets with no public, CC0-licensed score
# sets.
experiment_sets = list(filter_experiment_sets(experiment_sets_query.all()))

# TODO To support very large data sets, we may want to use custom code for JSON-encoding an iterator.
# Issue: https://github.com/VariantEffect/mavedb-api/issues/192
# See, for instance, https://stackoverflow.com/questions/12670395/json-encoding-very-long-iterators.

experiment_set_views = list(map(lambda es: ExperimentSetPublicDump.from_orm(es), experiment_sets))

# Get a list of IDS of all the score sets included.
score_set_ids = list(
flatmap(lambda es: flatmap(lambda e: map(lambda ss: ss.id, e.score_sets), es.experiments), experiment_sets)
)

timestamp_format = "%Y%m%d%H%M%S"
zip_file_name = f"mavedb-dump.{datetime.now().strftime(timestamp_format)}.zip"

logger.info(f"Exporting public data set metadata to {zip_file_name}/main.json")
json_data = {
"title": "MaveDB public data",
"asOf": datetime.now(timezone.utc).isoformat(),
"experimentSets": experiment_set_views,
}

with ZipFile(zip_file_name, "w") as zipfile:
# Write metadata for all data sets to a single JSON file.
zipfile.writestr("main.json", json.dumps(jsonable_encoder(json_data)))

# Copy the CC0 license.
zipfile.write(os.path.join(os.path.dirname(__file__), "resources/CC0_license.txt"), "LICENSE.txt")

# Write score and count files for each score set.
num_score_sets = len(score_set_ids)
for i, score_set_id in enumerate(score_set_ids):
score_set = db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one_or_none()
if score_set is not None and score_set.urn is not None:
logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}")
csv_filename_base = score_set.urn.replace(":", "-")

csv_str = get_score_set_scores_as_csv(db, score_set)
zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str)

count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None
if count_columns and len(count_columns) > 0:
csv_str = get_score_set_counts_as_csv(db, score_set)
zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str)

timestamp_format = "%Y%m%d%H%M%S"
zip_file_name = f"mavedb-dump.{datetime.now().strftime(timestamp_format)}.zip"

logger.info(f"Exporting public data set metadata to {zip_file_name}/main.json")
json_data = {
"title": "MaveDB public data",
"asOf": datetime.now(timezone.utc).isoformat(),
"experimentSets": experiment_set_views,
}

with ZipFile(zip_file_name, "w") as zipfile:
# Write metadata for all data sets to a single JSON file.
zipfile.writestr("main.json", json.dumps(jsonable_encoder(json_data)))

# Copy the CC0 license.
zipfile.write(os.path.join(os.path.dirname(__file__), "resources/CC0_license.txt"), "LICENSE.txt")

# Write score and count files for each score set.
num_score_sets = len(score_set_ids)
for i, score_set_id in enumerate(score_set_ids):
score_set = db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one_or_none()
if score_set is not None and score_set.urn is not None:
logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}")
csv_filename_base = score_set.urn.replace(":", "-")

csv_str = get_score_set_scores_as_csv(db, score_set)
zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str)

count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None
if count_columns and len(count_columns) > 0:
csv_str = get_score_set_counts_as_csv(db, score_set)
zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str)


if __name__ == "__main__":
export_public_data()
Loading

0 comments on commit a183e27

Please sign in to comment.