Patch/docs (#278)
* Update template docs

* Refactor docs code and update templates

* Rename a module file

* Add Dockerfile for doc-builder image

* Clean up some dependencies and imports

* Add workflow for docs

* Add workflow comment

* Build jsonld

---------

Co-authored-by: nf-osi[bot] <[email protected]>
anngvu and nfosi-service authored Apr 12, 2023
1 parent f3ced33 commit f0fad92
Showing 28 changed files with 4,278 additions and 1,360 deletions.
45 changes: 45 additions & 0 deletions .github/workflows/publish-docs.yml
@@ -0,0 +1,45 @@
name: Build and publish docs to GH Pages

on:
  push:
    branches:
      - main

  # TODO setup conditional to build but not push
  #pull_request:
  #  branches:
  #    - main

jobs:

  build-and-publish:
    runs-on: ubuntu-latest

    permissions:
      contents: read
      pages: write
      id-token: write

    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}

    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Setup GH Pages
        uses: actions/configure-pages@v3

      - name: Build using docker
        run: |
          docker run -v $(pwd):/app ghcr.io/nf-osi/data-model-docs

      - name: Upload artifact
        uses: actions/upload-pages-artifact@v1
        with:
          path: docs

      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v2
64 changes: 32 additions & 32 deletions NF.csv

Large diffs are not rendered by default.

39 changes: 22 additions & 17 deletions NF.jsonld
@@ -7558,6 +7558,28 @@
       ],
       "sms:validationRules": []
     },
+    {
+      "@id": "bts:Particlecharacterization",
+      "@type": "rdfs:Class",
+      "rdfs:comment": "A series of analytical methods that provide information about entities such as composition, structure and defects.",
+      "rdfs:label": "Particlecharacterization",
+      "rdfs:subClassOf": [
+        {
+          "@id": "bts:DataType"
+        }
+      ],
+      "schema:isPartOf": {
+        "@id": "http://schema.biothings.io"
+      },
+      "sms:displayName": "particle characterization",
+      "sms:required": "sms:false",
+      "sms:requiresDependency": [
+        {
+          "@id": "bts:Assay"
+        }
+      ],
+      "sms:validationRules": []
+    },
     {
       "@id": "bts:Kinomics",
       "@type": "rdfs:Class",
@@ -21519,23 +21541,6 @@
       "sms:required": "sms:false",
       "sms:validationRules": []
     },
-    {
-      "@id": "bts:Particlecharacterization",
-      "@type": "rdfs:Class",
-      "rdfs:comment": "TBD",
-      "rdfs:label": "Particlecharacterization",
-      "rdfs:subClassOf": [
-        {
-          "@id": "bts:DataType"
-        }
-      ],
-      "schema:isPartOf": {
-        "@id": "http://schema.biothings.io"
-      },
-      "sms:displayName": "particle characterization",
-      "sms:required": "sms:false",
-      "sms:validationRules": []
-    },
     {
       "@id": "bts:TPM",
       "@type": "rdfs:Class",
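Side note for readers: the refactored docs/docTemplate.R below consumes this jsonld pre-parsed as a list object rather than as a file path. A minimal sketch of that loading step, assuming jsonlite (the loader actually used by the docs build may differ):

library(jsonlite)

# Parse the schematic jsonld; simplifyVector = FALSE keeps each node as a
# named list so fields like `@id` stay addressable
schema <- fromJSON("NF.jsonld", simplifyVector = FALSE)[["@graph"]]

# e.g., locate the class added in this commit
Filter(function(x) identical(x$`@id`, "bts:Particlecharacterization"), schema)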
9 changes: 9 additions & 0 deletions docs/Dockerfile
@@ -0,0 +1,9 @@
FROM ghcr.io/nf-osi/nfportalutils:develop

WORKDIR /app

RUN apt-get update && apt-get -yq install pandoc

RUN R -e "install.packages(c('rmarkdown', 'reactable', 'visNetwork'), repos='http://cran.rstudio.com/')"

ENTRYPOINT ["R", "-e", "rmarkdown::render('docs/index.Rmd')"]
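The entrypoint boils down to a single render call, which can also be run locally when iterating on the docs (assuming rmarkdown is installed and the repo root is the working directory):

# Equivalent local invocation of the container entrypoint
rmarkdown::render("docs/index.Rmd")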
74 changes: 49 additions & 25 deletions docs/docTemplate.R
@@ -15,48 +15,72 @@
 #'
 #' Currently, schematic templates allow modeling more on the simplistic side and
 #' don't formally express all these, so only a few are checked.
-#' Moreover, the jsonld version encodes much less information than the csv version
-#' (jsonld conversion loses custom metadata in the csv), which is why this currently depends on both formats.
+#' Currently, the jsonld version loses some information when translated from the csv source
+#' (mainly the summary Range definition corresponding to https://www.w3.org/TR/rdf-schema/#ch_range and EditorNote).
 #'
 #' @param templates Named vector of templates to process,
 #' where names correspond to the id without prefix (currently whatever follows "bts:"),
 #' and values are the real internal IDs (in .ID).
-#' @param schema_csv Schema representation read from `.csv`.
-#' @param schema_jsonld Schema path to jsonld file.
+#' @param schema Schema list object parsed from a schematic jsonld.
+#' @param prefix Namespace prefix.
 #' @param savedir Directory where template representations will be outputted.
+#' @param verbose Whether to be verbose about what's going on.
 docTemplate <- function(templates,
-                        schema_csv,
-                        schema_jsonld = "../NF.jsonld",
-                        savedir = "templates/") {
+                        schema,
+                        prefix = "bts:",
+                        savedir = "templates/",
+                        verbose = TRUE) {


   for(x in names(templates)) { # e.g. x <- "GenomicsAssayTemplate"
     # For template, parse DependsOn to get all props present in manifest
-    props <- nfportalutils::get_dependency_from_json_schema(paste0("bts:", x),
-                                                            schema = schema_jsonld)
+    prop_ids <- nfportalutils::get_dependency_from_json_schema(paste0(prefix, x),
+                                                               schema = schema,
+                                                               return_labels = FALSE)

-    # Create the ControlledVocab aka Range col for each prop
-    # ControlledVocab col is handled specially and uses a custom Range col defined in csv
-    # For CV col we create a link to a class if the term editor has referenced a class in Range,
-    # else we simply fall back to enumerating the valid values
-    index <- match(props, schema_csv$Attribute)
-    range <- dplyr::if_else(schema_csv[index, "Range"] != "",
-                            paste0("#", schema_csv[index, "Range"]),
-                            schema_csv[index, "Valid.Values"])
+    # The range of prop `assay` is anything of class `Assay` --
+    # however, the json-ld does not make this as conceptually concise for props, instead listing all possible values.
+    # In the docs, we don't want to enumerate all values and instead want to create a _link_ to a class that defines the range.
+    # To do this, we can infer the class by looking up the class of the first listed enum for that prop.
+    # The range could also be inferred to be a boolean or string/integer rather than a class.
+    summarize_range <- function(prop_id, schema, return_labels = FALSE) {
+
+      enums <- nfportalutils::get_by_prop_from_json_schema(id = prop_id,
+                                                           prop = "schema:rangeIncludes",
+                                                           schema = schema,
+                                                           return_labels = FALSE)
+      if(is.null(enums)) return("")
+      if(length(enums) < 5) return(paste(gsub("bts:", "", enums), collapse = ","))
+      if("bts:Yes" %in% enums) return("Y/N")
+      enum1 <- enums[1]
+      # additional lookup of parent class
+      class <- nfportalutils::get_by_prop_from_json_schema(enum1,
+                                                           prop = "rdfs:subClassOf",
+                                                           schema = schema,
+                                                           return_labels = FALSE)[[1]]
+      if(length(class) > 1) warning(enum1, " has multiple parent classes")
+      class <- sub("bts:", "", class[1]) # use first but warn
+      class <- paste0("#", class)
+      class
+    }

-    template_tab <- data.table(Field = props,
-                               Description = schema_csv[index, "Description"],
-                               Required = ifelse(schema_csv[index, "Required"], "required", "optional"),
-                               ControlledVocab = range,
-                               # Cardinality = schema_csv[index, "Cardinality"],
-                               Note = schema_csv[index, "EditorNote"])
+    # Because of the way schematic imports biothings without us having much control over it, some ids can be duplicated (!)
+    schema <- schema[!duplicated(sapply(schema, function(x) x$`@id`))]
+    sms <- Filter(function(x) x$`@id` %in% prop_ids, schema)
+    sms <- lapply(sms, function(x) {
+      list(Field = x$`sms:displayName`,
+           Description = if(!is.null(x$`rdfs:comment`)) x$`rdfs:comment` else " ",
+           Required = if(!is.null(x$`sms:required`)) sub("sms:", "", x$`sms:required`) else "?",
+           ValidRange = summarize_range(x$`@id`, schema))
+    })
+    tt <- rbindlist(sms)

     # Sort to show by required, then alphabetically
-    template_tab <- template_tab[order(-Required, Field), ]
+    tt <- tt[order(-Required, Field), ]

     template_id <- templates[x]
     filepath <- paste0(savedir, template_id, ".csv")
-    write.csv(template_tab, file = filepath, row.names = F)
+    write.csv(tt, file = filepath, row.names = F)
   }
 }

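For orientation, a sketch of how the refactored docTemplate() might be invoked. The template mapping here is hypothetical (real values come from the .ID column in NF.csv), and schema is the parsed jsonld list from the earlier sketch:

library(data.table)  # docTemplate() relies on data.table::rbindlist()

templates <- c(GenomicsAssayTemplate = "genomics_assay")  # hypothetical .ID value

docTemplate(templates,
            schema = schema,     # list parsed from NF.jsonld
            prefix = "bts:",
            savedir = "templates/")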
7 changes: 3 additions & 4 deletions docs/graph.R
@@ -1,5 +1,4 @@
 library(visNetwork)
-library(tidyverse)

 #-------------------------------------------------------------------------------#

@@ -10,7 +9,7 @@
 # schema <- readExtSchema("NF.csv")
 readExtSchema <- function(schema_csv, ext_classes_csv = "ext_classes.csv") {
   schema <- read.csv(schema_csv) %>%
-    select(label = Attribute, id = .ID, Root = .Root, SubOf = .SubOf)
+    dplyr::select(label = Attribute, id = .ID, Root = .Root, SubOf = .SubOf)

   # Extended class definitions
   ext_classes <- read.csv(ext_classes_csv) %>%
@@ -32,7 +31,7 @@ getNodesEdges <- function(schema, cluster_root,
                           font.color = list(A = "white", C = "white"))
                           ) {
   cluster <- schema %>%
-    filter(Root == cluster_root)
+    dplyr::filter(Root == cluster_root)

   # Namespaces for cluster ancestor vs Children
   A <- paste(prefix, "A", sep = "_")
@@ -66,7 +65,7 @@ c2Cluster <- function(cluster_1, cluster_2, connect_by,
   # Configure between-cluster relations
   relations <- read.csv(ext_relations_csv, header = T)
   edges <- relations %>%
-    filter(property == connect_by)
+    dplyr::filter(property == connect_by)
   relations$color <- viz$color
   relations$width <- viz$width

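A usage sketch for the graph helpers above; the cluster root value and the assumed list(nodes, edges) return shape are inferences from the surrounding code, not confirmed by this diff:

library(visNetwork)

sg <- readExtSchema("NF.csv")  # reads ext_classes.csv alongside, per the default arg
g <- getNodesEdges(sg, cluster_root = "Assay", prefix = "assay")  # "Assay" is a hypothetical root
visNetwork(g$nodes, g$edges)  # assumes a list(nodes = ..., edges = ...) return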