From 73b421337d346f5ab1ecf2fa3a816111ee45477b Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 19 Nov 2024 13:52:39 +0100 Subject: [PATCH] Add tracking of runs (#83) * Add track() method to Instance * Add finish() method to Instance() Move checks to Instance$get_py_lamin() * Fix lint * Update achitecture vignette Fix initailize() position and adjust indentation * Update development vignette * Update features in README * Update CHANGELOG --- CHANGELOG.md | 1 + R/Instance.R | 65 ++++++++++++++++++++++++++++++- R/Registry.R | 18 ++------- README.md | 1 + man/Instance.Rd | 48 ++++++++++++++++++++++- vignettes/architecture.qmd | 80 +++++++++++++++----------------------- vignettes/development.qmd | 8 ++-- 7 files changed, 152 insertions(+), 69 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 277fac8..ad58bba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - Add a `from_df()` method to the `Registry` class to create new artifacts from data frames (PR #78) - Create `TemporaryRecord` classes for new artifacts before they have been saved to the database (PR #78) - Add a `delete()` method to the `Record` class (PR #78) +- Add `track()` and `finish()` methods to the `Instance` class (PR #83) ## MAJOR CHANGES diff --git a/R/Instance.R b/R/Instance.R index 6f4b111..4bce2e0 100644 --- a/R/Instance.R +++ b/R/Instance.R @@ -191,10 +191,73 @@ Instance <- R6::R6Class( # nolint object_name_linter }, #' @description Get the Python lamindb module #' + #' @param check Logical, whether to perform checks + #' @param what What the python module is being requested for, used in check + #' messages + #' #' @return Python lamindb module. 
- get_py_lamin = function() { + get_py_lamin = function(check = FALSE, what = "This functionality") { + if (check && isFALSE(self$is_default)) { + cli::cli_abort(c( + "{what} can only be performed by the default instance", + "i" = "Use {.code connect(slug = NULL)} to connect to the default instance" + )) + } + + if (check && is.null(self$get_py_lamin())) { + cli::cli_abort(c( + "{what} requires the Python lamindb package", + "i" = "Check the output of {.code connect()} for warnings" + )) + } + private$.py_lamin }, + #' @description Start a run with tracked data lineage + #' + #' @details + #' Calling `track()` with `transform = NULL` will return a UID, providing + #' that UID with the same path will start a run + #' + #' @param path Path to the R script or document to track + #' @param transform UID specifying the data transformation + track = function(path, transform = NULL) { + py_lamin <- self$get_py_lamin(check = TRUE, what = "Tracking") + + if (is.null(transform)) { + transform <- tryCatch( + py_lamin$track(path = path), + error = function(err) { + py_err <- reticulate::py_last_error() + if (py_err$type != "MissingContextUID") { + cli::cli_abort(c( + "Python error {.val {py_err$type}}", + "i" = "Run {.run reticulate::py_last_error()} for details" + )) + } + + uid <- gsub(".*\\(\"(.*?)\"\\).*", "\\1", py_err$value) + cli::cli_inform(paste( + "Got UID {.val {uid}} for path {.file {path}}.", + "Run this function with {.code transform = \"{uid}\"} to track this path." 
+ )) + } + ) + } else { + if (is.character(transform) && nchar(transform) != 16) { + cli::cli_abort( + "The transform UID must be exactly 16 characters, got {nchar(transform)}" + ) + } + + py_lamin$track(transform = transform, path = path) + } + }, + #' @description Finish a tracked run + finish = function() { + py_lamin <- self$get_py_lamin(check = TRUE, what = "Tracking") + py_lamin$finish() + }, #' @description #' Print an `Instance` #' diff --git a/R/Registry.R b/R/Registry.R index 92f7fa1..b409eae 100644 --- a/R/Registry.R +++ b/R/Registry.R @@ -154,27 +154,15 @@ Registry <- R6::R6Class( # nolint object_name_linter #' @return A `TemporaryRecord` object containing the new record. This is not #' saved to the database until `temp_record$save()` is called. from_df = function(dataframe, key = NULL, description = NULL, run = NULL) { - if (isFALSE(private$.instance$is_default)) { - cli::cli_abort(c( - "Only the default instance can create records", - "i" = "Use {.code connect(slug = NULL)} to connect to the default instance" - )) - } - - if (is.null(private$.instance$get_py_lamin())) { - cli::cli_abort(c( - "Creating records requires the Python lamindb package", - "i" = "Check the output of {.code connect()} for warnings" - )) - } - if (private$.registry_name != "artifact") { cli::cli_abort( "Creating records from data frames is only supported for the Artifact registry" ) } - py_lamin <- private$.instance$get_py_lamin() + py_lamin <- private$.instance$get_py_lamin( + check = TRUE, what = "Creating records" + ) py_record <- py_lamin$Artifact$from_df( dataframe, diff --git a/README.md b/README.md index c41ba00..27ef831 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ LaminDB is accompanied by LaminHub which is a data collaboration hub built on La - Planned: `.fcs`, `.h5mu`, `.zarr`. - Create records from data frames. - Delete records. +- Track code in R scripts and notebooks. 
See the development roadmap for more details (`vignette("development", package = "laminr")`). diff --git a/man/Instance.Rd b/man/Instance.Rd index 7d09979..9617e57 100644 --- a/man/Instance.Rd +++ b/man/Instance.Rd @@ -51,6 +51,8 @@ Whether this is the default instance.} \item \href{#method-Instance-get_settings}{\code{Instance$get_settings()}} \item \href{#method-Instance-get_api}{\code{Instance$get_api()}} \item \href{#method-Instance-get_py_lamin}{\code{Instance$get_py_lamin()}} +\item \href{#method-Instance-track}{\code{Instance$track()}} +\item \href{#method-Instance-finish}{\code{Instance$finish()}} \item \href{#method-Instance-print}{\code{Instance$print()}} \item \href{#method-Instance-to_string}{\code{Instance$to_string()}} } @@ -163,12 +165,56 @@ The API for the instance. \subsection{Method \code{get_py_lamin()}}{ Get the Python lamindb module \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Instance$get_py_lamin()}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Instance$get_py_lamin(check = FALSE, what = "This functionality")}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{check}}{Logical, whether to perform checks} + +\item{\code{what}}{What the python module is being requested for, used in check +messages} +} +\if{html}{\out{
}} +} \subsection{Returns}{ Python lamindb module. } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Instance-track}{}}} +\subsection{Method \code{track()}}{ +Start a run with tracked data lineage +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Instance$track(path, transform = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{path}}{Path to the R script or document to track} + +\item{\code{transform}}{UID specifying the data transformation} +} +\if{html}{\out{
}} +} +\subsection{Details}{ +Calling \code{track()} with \code{transform = NULL} will return a UID, providing +that UID with the same path will start a run +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Instance-finish}{}}} +\subsection{Method \code{finish()}}{ +Finish a tracked run +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Instance$finish()}\if{html}{\out{
}} +} + } \if{html}{\out{
}} \if{html}{\out{}} diff --git a/vignettes/architecture.qmd b/vignettes/architecture.qmd index 41e5fff..1c95d22 100644 --- a/vignettes/architecture.qmd +++ b/vignettes/architecture.qmd @@ -127,7 +127,9 @@ classDiagram Instance --> RelatedRecords InstanceAPI --> RelatedRecords - %% Use #emsp; to create indents in the rendered diagram when necessary + %% Methods must be on one line to be shown in the right diagram section + %% Use \n for newlines and #emsp; to create indents in the rendered + %% diagram when necessary class laminr{ +connect(String slug): RichInstance @@ -150,13 +152,16 @@ classDiagram +api_url: String } class Instance{ - +initialize( - #emsp;InstanceSettings Instance_settings, API api, - #emsp;Map schema - ): Instance + +initialize(\n#emsp;InstanceSettings Instance_settings, API api, \n#emsp;Map schema\n): Instance +get_modules(): Module[] +get_module(String module_name): Module +get_module_names(): String[] + +get_api(): InstanceAPI + +get_settings(): InstanceSettings + +get_py_lamin(Boolean check, String what): PythonModule + +track(String path, String transform): NULL + +finish(): NULL + +is_default: Boolean } class InstanceAPI{ +initialize(InstanceSettings Instance_settings) @@ -166,38 +171,28 @@ classDiagram +delete_record(...): NULL } class Module{ - +initialize( - #emsp;Instance Instance, API api, String module_name, - #emsp;Map module_schema - ): Module + +initialize(\n#emsp;Instance Instance, API api, String module_name,\n#emsp;Map module_schema\n): Module +name: String +get_registries(): Registry[] +get_registry(String registry_name): Registry +get_registry_names(): String[] } class Registry{ - +initialize( - #emsp;Instance Instance, Module module, API api, - #emsp;String registry_name, Map registry_schema - ): Registry + +initialize(\n#emsp;Instance Instance, Module module, API api,\n#emsp;String registry_name, Map registry_schema\n): Registry +name: String +class_name: String +is_link_table: Bool +get_fields(): Field[] +get_field(String 
field_name): Field +get_field_names(): String[] - +get(String id_or_uid, Bool include_foreign_keys, List~String~ select, Bool verbose): RichRecord + +get(\n#emsp;String id_or_uid, Bool include_foreign_keys,\n#emsp;List~String~ select, Bool verbose\n): RichRecord +get_record_class(): RichRecordClass +get_temporary_record_class(): TemporaryRecordClass +df(Integer limit, Bool verbose): DataFrame - +from_df(DataFrame dataframe, String key, String description, String run)): TemporaryRecord + +from_df(\n#emsp;DataFrame dataframe, String key,\n#emsp;String description, String run\n): TemporaryRecord } class Field{ - +initialize( - #emsp;String type, String through, String field_name, String registry_name, - #emsp;String column_name, String module_name, Bool is_link_table, String relation_type, - #emsp;String related_field_name, String related_registry_name, String related_module_name - ): Field + +initialize(\n#emsp;String type, String through, String field_name,\n#emsp;String registry_name, String column_name, String module_name,\n#emsp;Bool is_link_table, String relation_type, String related_field_name,\n#emsp;String related_registry_name, String related_module_name\n): Field +type: String +through: Map +field_name: String @@ -211,15 +206,12 @@ classDiagram +related_module_name: String } class Record{ - +initialize(Instance Instance, Registry registry, API api, Map data): Record + +initialize(\n#emsp;Instance Instance, Registry registry,\n#emsp;API api, Map data\n): Record +get_value(String field_name): Any +delete(): NULL } class RelatedRecords{ - +initialize( - #emsp;Instance instance, Registry registry, Field field, - #emsp;String related_to, API api - ): RelatedRecords + +initialize(\n#emsp;Instance instance, Registry registry, Field field,\n#emsp;String related_to, API api\n): RelatedRecords +df(): DataFrame +field: Field } @@ -317,13 +309,16 @@ classDiagram +api_url: String } class Instance{ - +initialize( - #emsp;InstanceSettings Instance_settings, API api, - 
#emsp;Map schema - ): Instance + +initialize(\n#emsp;InstanceSettings Instance_settings, API api, \n#emsp;Map schema\n): Instance +get_modules(): Module[] +get_module(String module_name): Module +get_module_names(): String[] + +get_api(): InstanceAPI + +get_settings(): InstanceSettings + +get_py_lamin(Boolean check, String what): PythonModule + +track(String path, String transform): NULL + +finish(): NULL + +is_default: Boolean } class InstanceAPI{ +initialize(InstanceSettings Instance_settings) @@ -333,38 +328,28 @@ classDiagram +delete_record(...): NULL } class Module{ - +initialize( - #emsp;Instance Instance, API api, String module_name, - #emsp;Map module_schema - ): Module + +initialize(\n#emsp;Instance Instance, API api, String module_name,\n#emsp;Map module_schema\n): Module +name: String +get_registries(): Registry[] +get_registry(String registry_name): Registry +get_registry_names(): String[] } class Registry{ - +initialize( - #emsp;Instance Instance, Module module, API api, - #emsp;String registry_name, Map registry_schema - ): Registry + +initialize(\n#emsp;Instance Instance, Module module, API api,\n#emsp;String registry_name, Map registry_schema\n): Registry +name: String +class_name: String +is_link_table: Bool +get_fields(): Field[] +get_field(String field_name): Field +get_field_names(): String[] - +get(String id_or_uid, Bool include_foreign_keys, List~String~ select, Bool verbose): RichRecord + +get(\n#emsp;String id_or_uid, Bool include_foreign_keys,\n#emsp;List~String~ select, Bool verbose\n): RichRecord +get_record_class(): RichRecordClass +get_temporary_record_class(): TemporaryRecordClass +df(Integer limit, Bool verbose): DataFrame - +from_df(DataFrame dataframe, String key, String description, String run)): TemporaryRecord + +from_df(\n#emsp;DataFrame dataframe, String key,\n#emsp;String description, String run\n): TemporaryRecord } class Field{ - +initialize( - #emsp;String type, String through, String field_name, String registry_name, - 
#emsp;String column_name, String module_name, Bool is_link_table, String relation_type, - #emsp;String related_field_name, String related_registry_name, String related_module_name - ): Field + +initialize(\n#emsp;String type, String through, String field_name,\n#emsp;String registry_name, String column_name, String module_name,\n#emsp;Bool is_link_table, String relation_type, String related_field_name,\n#emsp;String related_registry_name, String related_module_name\n): Field +type: String +through: Map +field_name: String @@ -378,15 +363,12 @@ classDiagram +related_module_name: String } class Record{ - +initialize(Instance Instance, Registry registry, API api, Map data): Record + +initialize(\n#emsp;Instance Instance, Registry registry,\n#emsp;API api, Map data\n): Record +get_value(String field_name): Any +delete(): NULL } class RelatedRecords{ - +initialize( - #emsp;Instance instance, Registry registry, Field field, - #emsp;String related_to, API api - ): RelatedRecords + +initialize(\n#emsp;Instance instance, Registry registry, Field field,\n#emsp;String related_to, API api\n): RelatedRecords +df(): DataFrame +field: Field } diff --git a/vignettes/development.qmd b/vignettes/development.qmd index a836d8c..7a15394 100644 --- a/vignettes/development.qmd +++ b/vignettes/development.qmd @@ -72,10 +72,11 @@ This document outlines the features of the **{laminr}** package and the roadmap ### Track notebooks & scripts -* [ ] **Track code execution**: Automatically track the execution of R scripts and notebooks. +* [x] **Track code execution**: Automatically track the execution of R scripts and notebooks. * [ ] **Capture run context**: Record information about the execution environment (e.g., package versions, parameters). -* [ ] **Link code to artifacts**: Associate code execution with generated artifacts. +* [x] **Link code to artifacts**: Associate code execution with generated artifacts. 
* [ ] **Visualize data lineage**: Create visualizations of data lineage and dependencies. +* [x] **Finalize tracking**: End and save a run. ### Curate datasets @@ -126,10 +127,11 @@ A first version of the package that allows users to: * Expand query functionality with comparators, relationships, and pagination. * Implement basic data and metadata management features (create, save, load and delete artifacts). * Expand support for different data formats and storage backends. +* Implement code tracking. ### Version 0.3.0 -* Implement code tracking and data lineage visualization. +* Implement data lineage visualization. * Introduce data curation features (validation, standardization, annotation). * Enhance support for bionty registries and ontology interactions.