Merge pull request #63 from bmeg/more-docs
[WIP] Updates to the Docs
kellrott authored Jan 10, 2024
2 parents e0f25ed + e972690 commit fb2ec52
Showing 10 changed files with 223 additions and 416 deletions.
9 changes: 5 additions & 4 deletions docs/categories/index.xml
@@ -1,10 +1,11 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>Categories on Sifter</title>
<link>https://bmeg.github.io/sifter/categories/</link>
<description>Recent content in Categories on Sifter</description>
<title>Categories on </title>
<link>/categories/</link>
<description>Recent content in Categories on </description>
<generator>Hugo -- gohugo.io</generator>
<language>en-us</language><atom:link href="https://bmeg.github.io/sifter/categories/index.xml" rel="self" type="application/rss+xml" />
<language>en</language>
<atom:link href="/categories/index.xml" rel="self" type="application/rss+xml" />
</channel>
</rss>
337 changes: 5 additions & 332 deletions docs/index.xml

Large diffs are not rendered by default.

75 changes: 3 additions & 72 deletions docs/sitemap.xml
@@ -2,79 +2,10 @@
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url>
<loc>https://bmeg.github.io/sifter/</loc>
<priority>0</priority>
<loc>/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/accumulate/</loc>
<loc>/categories/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/inputs/avroload/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/categories/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/clean/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/debug/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/distinct/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/inputs/embedded/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/emit/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/example/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/fieldparse/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/fieldprocess/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/fieldtype/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/filter/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/from/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/inputs/glob/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/graphbuild/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/inputs/gripperload/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/hash/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/inputs/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/inputs/jsonload/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/lookup/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/map/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/objectvalidate/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/project/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/reduce/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/regexreplace/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/playbook/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/transforms/split/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/inputs/sqldump/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/inputs/sqliteload/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/inputs/tableload/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/tags/</loc>
</url><url>
<loc>https://bmeg.github.io/sifter/docs/inputs/xmlload/</loc>
<loc>/tags/</loc>
</url>
</urlset>
9 changes: 5 additions & 4 deletions docs/tags/index.xml
@@ -1,10 +1,11 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>Tags on Sifter</title>
<link>https://bmeg.github.io/sifter/tags/</link>
<description>Recent content in Tags on Sifter</description>
<title>Tags on </title>
<link>/tags/</link>
<description>Recent content in Tags on </description>
<generator>Hugo -- gohugo.io</generator>
<language>en-us</language><atom:link href="https://bmeg.github.io/sifter/tags/index.xml" rel="self" type="application/rss+xml" />
<language>en</language>
<atom:link href="/tags/index.xml" rel="self" type="application/rss+xml" />
</channel>
</rss>
110 changes: 108 additions & 2 deletions website/content/docs.md
@@ -11,12 +11,12 @@ menu:

Sifter pipelines process streams of nested JSON messages. Sifter comes with a number of
file extractors that operate as inputs to these pipelines. The pipeline engine
connects togeather arrays of transform steps into direct acylic graph that is processed
connects together arrays of transform steps into a directed acyclic graph that is processed
in parallel.

Example Message:

```
```json
{
  "firstName" : "bob",
  "age" : "25"
@@ -37,3 +37,109 @@ be done in a transform pipeline these include:
- Table based field translation
- Outputting the message as a JSON Schema checked object


# Script structure

## Header
Each sifter file starts with a set of fields that let the software know this is a sifter script, and not some random YAML file. There is also a `name` field for the script; this name is used for output file creation and logging. Finally, there is an `outdir` field that defines the directory where all output files will be placed. All paths are relative to the script file, so an `outdir` set to `my-results` will create the directory `my-results` in the same directory as the script file, regardless of where the sifter command is invoked.
```yaml
class : sifter
name: <name of script>
outdir: <where files should be stored>
```
# Config and templating
The `config` section is a set of defined keys that are used throughout the rest of the script.

Example config:
```yaml
config:
  sqlite: ../../source/chembl/chembl_33/chembl_33_sqlite/chembl_33.db
  uniprot2ensembl: ../../tables/uniprot2ensembl.tsv
  schema: ../../schema/
```

Various fields in the script file will be parsed using a [Mustache](https://mustache.github.io/) template engine. For example, to access a value within the config block, use the template `{{config.sqlite}}`.
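To make the substitution concrete, here is a minimal sketch of how a dotted-path placeholder like `{{config.sqlite}}` resolves against nested configuration. This is illustrative only: Sifter uses a full Mustache engine, and the `render` helper below is hypothetical, not part of Sifter.

```python
import re

def render(template, context):
    # Replace each {{dotted.path}} placeholder by walking the nested dict.
    # A sketch of Mustache-style dotted-name interpolation, not Sifter's engine.
    def lookup(match):
        value = context
        for part in match.group(1).split("."):
            value = value[part]  # descend one key per dotted segment
        return str(value)
    return re.sub(r"\{\{\s*([\w.]+)\s*\}\}", lookup, template)

config = {"sqlite": "../../source/chembl/chembl_33/chembl_33_sqlite/chembl_33.db"}
print(render("sqlite path: {{config.sqlite}}", {"config": config}))
```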


# Inputs
The input block defines the various data extractors that will be used to open resources and create streams of JSON messages for processing. The possible input engines include:
- AVRO
- JSON
- XML
- SQL-dump
- SQLite
- TSV/CSV
- GLOB

For any other file types, there is also a plugin option to allow the user to call their own code for opening files.
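
For example, the `plugin` input wires a user script into the input block; the pipeline then refers to the stream by its name. The paths here are illustrative:

```yaml
inputs:
  oboData:                  # stream name, referenced by a pipeline's `from` step
    plugin:
      commandLine: ../../util/obo_reader.py {{config.oboFile}}
```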

# Pipeline
The `pipelines` section defines a set of named processing pipelines that can be used to transform data. Each pipeline starts with a `from` statement that defines where its data comes from. It then defines a linear set of transforms that are chained together to do the processing. Pipelines may use `emit` steps to output messages to disk. The possible data transform steps include:
- Accumulate
- Clean
- Distinct
- DropNull
- Field Parse
- Field Process
- Field Type
- Filter
- FlatMap
- GraphBuild
- Hash
- JSON Parse
- Lookup
- Value Mapping
- Object Validation
- Project
- Reduce
- Regex
- Split
- UUID Generation

Additionally, users are able to define their own transform step types using the `plugin` step.

# Example script
```yaml
class: sifter
name: go
outdir: ../../output/go/
config:
  oboFile: ../../source/go/go.obo
  schema: ../../schema
inputs:
  oboData:
    plugin:
      commandLine: ../../util/obo_reader.py {{config.oboFile}}
pipelines:
  transform:
    - from: oboData
    - project:
        mapping:
          submitter_id: "{{row.id[0]}}"
          case_id: "{{row.id[0]}}"
          id: "{{row.id[0]}}"
          go_id: "{{row.id[0]}}"
          project_id: "gene_onotology"
          namespace: "{{row.namespace[0]}}"
          name: "{{row.name[0]}}"
    - map:
        method: fix
        gpython: |
          def fix(row):
            row['definition'] = row['def'][0].strip('"')
            if 'xref' not in row:
              row['xref'] = []
            if 'synonym' not in row:
              row['synonym'] = []
            return row
    - objectValidate:
        title: GeneOntologyTerm
        schema: "{{config.schema}}"
    - emit:
        name: term
```
74 changes: 74 additions & 0 deletions website/content/docs/inputs/plugin.md
@@ -0,0 +1,74 @@
---
title: plugin
menu:
main:
parent: inputs
weight: 100
---

# plugin
Runs a user program for customized data extraction.

## Example

```yaml
inputs:
oboData:
plugin:
commandLine: ../../util/obo_reader.py {{config.oboFile}}
```
The plugin program is expected to write JSON messages to STDOUT, one per line; these are then
passed to the transform pipelines.
## Example Plugin
The `obo_reader.py` plugin reads an OBO file, such as those that describe the Gene Ontology, and emits the
records as single-line JSON messages.
```python
#!/usr/bin/env python
import re
import sys
import json

# [Section] headers and "key: value" fields of the OBO format
re_section = re.compile(r'^\[(.*)\]')
re_field = re.compile(r'^(\w+): (.*)$')

def obo_parse(handle):
    """Yield each [Term] stanza as a dict of field name -> list of values."""
    rec = None
    for line in handle:
        res = re_section.search(line)
        if res:
            if rec is not None:
                yield rec
            rec = None
            if res.group(1) == "Term":
                rec = {"type": res.group(1)}
        else:
            if rec is not None:
                res = re_field.search(line)
                if res:
                    key = res.group(1)
                    val = res.group(2)
                    # split on " ! " comment markers and parentheses,
                    # re-joining the first pieces with ":"
                    val = re.split(r" ! | \(|\)", val)
                    val = ":".join(val[0:3])
                    if key in rec:
                        rec[key].append(val)
                    else:
                        rec[key] = [val]
    if rec is not None:
        yield rec

def unquote(s):
    res = re.search(r'"(.*)"', s)
    if res:
        return res.group(1)
    return s

with open(sys.argv[1]) as handle:
    for rec in obo_parse(handle):
        print(json.dumps(rec))
```
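The parser can be sanity-checked without downloading a full ontology. This self-contained sketch repeats the `obo_parse` logic from `obo_reader.py` and runs it over a small invented two-term snippet:

```python
import io
import json
import re

re_section = re.compile(r'^\[(.*)\]')
re_field = re.compile(r'^(\w+): (.*)$')

def obo_parse(handle):
    # Same logic as obo_reader.py above, repeated so this sketch is self-contained.
    rec = None
    for line in handle:
        res = re_section.search(line)
        if res:
            if rec is not None:
                yield rec
            rec = None
            if res.group(1) == "Term":
                rec = {"type": res.group(1)}
        elif rec is not None:
            res = re_field.search(line)
            if res:
                key, val = res.group(1), res.group(2)
                val = ":".join(re.split(r" ! | \(|\)", val)[0:3])
                rec.setdefault(key, []).append(val)
    if rec is not None:
        yield rec

# Invented snippet for illustration; real files come from geneontology.org.
snippet = io.StringIO("""[Term]
id: GO:0000001
name: mitochondrion inheritance

[Term]
id: GO:0000002
name: mitochondrial genome maintenance
""")

for rec in obo_parse(snippet):
    print(json.dumps(rec))  # one JSON message per line, as Sifter expects
```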
@@ -1,7 +1,7 @@
---
title: gripperLoad
title: flatMap
menu:
main:
parent: inputs
parent: transforms
weight: 100
---
7 changes: 7 additions & 0 deletions website/content/docs/transforms/plugin.md
@@ -0,0 +1,7 @@
---
title: plugin
menu:
main:
parent: transforms
weight: 100
---
7 changes: 7 additions & 0 deletions website/content/docs/transforms/tableWrite.md
@@ -0,0 +1,7 @@
---
title: tableWrite
menu:
main:
parent: transforms
weight: 100
---
7 changes: 7 additions & 0 deletions website/content/docs/transforms/uuid.md
@@ -0,0 +1,7 @@
---
title: uuid
menu:
main:
parent: transforms
weight: 100
---
