Skip to content

Commit

Permalink
docs: some polishing / landing page work (#618)
Browse files Browse the repository at this point in the history
  • Loading branch information
bjchambers authored Aug 8, 2023
1 parent 50a27eb commit 6bd484c
Show file tree
Hide file tree
Showing 7 changed files with 302 additions and 32 deletions.
144 changes: 144 additions & 0 deletions sparrow-py/docs/source/_extensions/gallery_directive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"""A directive to generate a gallery of images from structured data.
Generating a gallery of images that are all the same size is a common
pattern in documentation, and this can be cumbersome if the gallery is
generated programmatically. This directive wraps this particular use-case
in a helper-directive to generate it with a single YAML configuration file.
It currently exists for maintainers of the pydata-sphinx-theme,
but might be abstracted into a standalone package if it proves useful.
"""
from pathlib import Path
from typing import Any, Dict, List

from docutils import nodes
from docutils.parsers.rst import directives
from sphinx.application import Sphinx
from sphinx.util import logging
from sphinx.util.docutils import SphinxDirective
from yaml import safe_load

logger = logging.getLogger(__name__)


# Outer MyST directive template, filled in with ``str.format``: ``columns``
# becomes the directive argument, ``options`` the ``:key: value`` lines and
# ``content`` the already-rendered cards. The doubled braces escape to a
# literal ``{grid}``. The fence uses five backticks so that it can wrap the
# four-backtick card fences below.
TEMPLATE_GRID = """
`````{{grid}} {columns}
{options}
{content}
`````
"""

# Template for one card inside the grid, filled in with ``str.format``:
# ``title`` becomes the directive argument, ``options`` the ``:key: value``
# lines and ``content`` the card body. The doubled braces escape to a literal
# ``{grid-item-card}``.
GRID_CARD = """
````{{grid-item-card}} {title}
{options}
{content}
````
"""


class GalleryGridDirective(SphinxDirective):
    """A directive to show a gallery of images and links in a Bootstrap grid.

    The grid can be generated from a YAML file that contains a list of items, or
    from the content of the directive (also formatted in YAML). Use the parameter
    "class-card" to add an additional CSS class to all cards. When specifying the
    grid items, you can use all parameters from "grid-item-card" directive to
    customize individual cards + ["image", "header", "content", "title"].

    Danger:
        This directive can only be used in the context of a Myst documentation
        page as the templates use Markdown flavored formatting.
    """

    name = "gallery-grid"
    has_content = True
    required_arguments = 0
    optional_arguments = 1
    final_argument_whitespace = True
    option_spec = {
        # Column counts passed as the argument of the generated grid directive
        "grid-columns": directives.unchanged,
        # A class to be added to the resulting container
        "class-container": directives.unchanged,
        # A class to be added to every generated card
        "class-card": directives.unchanged,
    }

    def run(self) -> List[nodes.Node]:
        """Create the gallery grid.

        Returns:
            A list holding the first node produced by parsing the generated
            grid markup; a list holding a single explanatory paragraph when
            the referenced YAML file does not exist; or an empty list when
            there are no items to show or parsing produced no nodes.
        """
        if self.arguments:
            # If an argument is given, assume it's a path to a YAML file,
            # relative to the directory of the document using the directive.
            path_data_rel = Path(self.arguments[0])
            path_doc, _ = self.get_source_info()
            path_doc = Path(path_doc).parent
            path_data = (path_doc / path_data_rel).resolve()
            if not path_data.exists():
                logger.warning(f"Could not find grid data at {path_data}.")
                # A directive must always return a list of nodes; surface the
                # problem in the rendered page instead of returning None.
                return [
                    nodes.paragraph(text=f"No grid data found at {path_data}.")
                ]
            yaml_string = path_data.read_text(encoding="utf-8")
        else:
            yaml_string = "\n".join(self.content)

        # ``safe_load`` returns None for empty input; treat that as "no items".
        items = safe_load(yaml_string) or []
        if not items:
            return []

        # Generate one card for every item of the YAML list
        grid_items = []
        for item in items:
            # remove parameters that are not needed for the card options
            title = item.pop("title", "")

            # build the content of the card using some extra parameters
            header = f"{item.pop('header')} \n^^^ \n" if "header" in item else ""
            image = f"![image]({item.pop('image')}) \n" if "image" in item else ""
            content = f"{item.pop('content')} \n" if "content" in item else ""

            # optional parameter that influence all cards
            if "class-card" in self.options:
                item["class-card"] = self.options["class-card"]

            # every remaining key is forwarded as an option of the card
            loc_options_str = "\n".join(f":{k}: {v}" for k, v in item.items()) + " \n"

            card = GRID_CARD.format(
                options=loc_options_str, content=header + image + content, title=title
            )
            grid_items.append(card)

        # Parse the template with Sphinx Design to create an output container
        # Prep the options for the template grid
        extra_class = self.options.get("class-container", "")
        # strip() avoids a trailing space when no extra class was given
        class_ = f"gallery-directive {extra_class}".strip()
        options = {"gutter": 2, "class-container": class_}
        options_str = "\n".join(f":{k}: {v}" for k, v in options.items())

        # Create the directive string for the grid
        grid_directive = TEMPLATE_GRID.format(
            columns=self.options.get("grid-columns", "1 2 3 4"),
            options=options_str,
            content="\n".join(grid_items),
        )

        # Parse content as a directive so Sphinx Design processes it
        container = nodes.container()
        self.state.nested_parse([grid_directive], 0, container)

        # Sphinx Design outputs a container too, so just use that. Slicing
        # yields an empty list instead of an IndexError if nothing was parsed.
        return container.children[:1]


def setup(app: Sphinx) -> Dict[str, Any]:
    """Register the ``gallery-grid`` directive on the Sphinx application.

    Args:
        app: the Sphinx application

    Returns:
        A mapping with ``parallel_read_safe`` and ``parallel_write_safe``
        both set to ``True``.
    """
    app.add_directive("gallery-grid", GalleryGridDirective)

    parallel_flags = ("parallel_read_safe", "parallel_write_safe")
    return dict.fromkeys(parallel_flags, True)
10 changes: 8 additions & 2 deletions sparrow-py/docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
from typing import Any
from typing import Dict

from pathlib import Path
import sys
sys.path.append(str(Path(".").resolve()))

project = "sparrow-py"
author = "Kaskada Contributors"
copyright = "2023, Kaskada Contributors"
Expand All @@ -11,9 +15,11 @@
"sphinx.ext.napoleon",
"sphinx.ext.intersphinx",
"sphinx.ext.todo",
"sphinx_design",
# "myst_parser",
"myst_nb",
"sphinx_copybutton",
"_extensions.gallery_directive",
]
autodoc_typehints = "description"
language = "en"
Expand All @@ -28,6 +34,7 @@
"use_repository_button": True,
"use_source_button": True,
"use_edit_page_button": True,
"home_page_in_toc": True,
"use_issues_button": True,
"repository_branch": "main",
"path_to_docs": "sparrow-py/docs/source",
Expand All @@ -44,8 +51,7 @@
"icon": "fa-brands fa-slack",
},
],
"show_nav_level": 3,
"show_toc_level": 2,
"primary_sidebar_end": ["indices.html"],
}

templates_path = ["_templates"]
Expand Down
38 changes: 38 additions & 0 deletions sparrow-py/docs/source/guide/introduction.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Introduction

Understanding and reacting to the world in real-time requires understanding what is happening _now_ in the context of what happened in the past.
You need the ability to understand if what just happened is unusual, how it relates to what happened previously, and how it relates to other things that are happening at the same time.

Kaskada processes events from streams and historic data sources to answer these questions in real-time.

The power and convenience of Kaskada comes from a new abstraction: the Timestream.
Timestreams provide a declarative API like dataframes over the complete temporal context.
Easily combine multiple streams and reason about the complete sequence of events.
Use time-travel to compute training examples from historic data and understand how results change over time.

## What are "Timestreams"?

A [Timestream](../reference/timestream/index) describes how a value changes over time. In the same way that SQL
queries transform tables and graph queries transform nodes and edges,
Kaskada queries transform Timestreams.

In comparison to a timeseries which often contains simple values (e.g., numeric
observations) defined at fixed, periodic times (i.e., every minute), a Timestream
contains any kind of data (records or collections as well as primitives) and may
be defined at arbitrary times corresponding to when the events occur.

## Getting Started with Timestreams

Getting started with Timestreams is as simple as `pip` installing the Python library, loading some data and running a query.

```python
import timestreams as t

# Read data from a Parquet file.
data = t.sources.Parquet.from_file(
"path_to_file.parquet",
time = "time",
key = "user")
# Get the count of events associated with each user over time, as a dataframe.
data.count().run().to_pandas()
```
105 changes: 81 additions & 24 deletions sparrow-py/docs/source/index.md
Original file line number Diff line number Diff line change
@@ -1,39 +1,88 @@
---
hide-toc: true
html_theme.sidebar_secondary.remove: true
title: Kaskada Timestreams
---

# Kaskada Timestreams
<div class="px-4 py-5 my-5 text-center">
<img class="d-block mx-auto mb-4" src="_static/kaskada.svg" alt="" width="auto">
<h1 class="display-5 fw-bold">Event-processing without the fuss.</h1>
<div class="col-lg-6 mx-auto">
<p class="lead mb-4">Real-time and historic event processing in Python.
</p>
</div>
</div>

```{include} ../../README.md
:start-after: <!-- start elevator-pitch -->
:end-before: <!-- end elevator-pitch -->
```{gallery-grid}
:grid-columns: 1 2 2 3
- header: "{fas}`timeline;pst-color-primary` Real-time processing for all events"
content: "Quickly process structured events so you can respond in real-time."
link: ".#stream"
- header: "{fab}`python;pst-color-primary` Python-native"
content: "Use Python so you can load data, process it, and train and serve models from one place."
link: ".#python"
- header: "{fas}`gauge-high;pst-color-primary` Get started immediately"
content: "No infrastructure to deploy lets you jump right in."
link: ".#get-started"
- header: "{fas}`rocket;pst-color-primary` Local, Remote and Distributed"
content: "Develop and test locally. Deploy to Docker, K8s or a service for production."
link: ".#local-and-distributed"
- header: "{fas}`fast-forward;pst-color-primary` Real-time, Batch and Streaming"
content: "Execute large historic queries or materialize in real-time. Or both."
link: ".#real-time-and-historic"
- header: "{fas}`backward;pst-color-primary` Time-travel"
content: "Generate training examples from the past to predict the future."
link: ".#time-travel"
```

## What are "Timestreams"?
A [Timestream](reference/timestream/index) describes how a value changes over time. In the same way that SQL
queries transform tables and graph queries transform nodes and edges,
Kaskada queries transform Timestreams.
* * *

In comparison to a timeseries which often contains simple values (e.g., numeric
observations) defined at fixed, periodic times (i.e., every minute), a Timestream
contains any kind of data (records or collections as well as primitives) and may
be defined at arbitrary times corresponding to when the events occur.
(stream)=
# Real-time event-processing

## Getting Started with Timestreams
Kaskada is built on Apache Arrow, providing an efficient, columnar representation of data.
The same approach is at the core of many analytic databases as well as Pandas and Polars.

Getting started with Timestreams is as simple as `pip` installing the Python library, loading some data and running a query.
Kaskada goes beyond the columnar representation by introducing the Timestream -- a columnar representation of events, ordered by time and grouped by key.
This representation is a perfect fit for all kinds of events, modern event streams as well as events stored in a database.
Specializing for Timestreams allows Kaskada to optimize temporal queries and execute them much faster.

```python
import timestreams as t
(python)=
# Python-native

# Read data from a Parquet file.
data = t.sources.Parquet.from_file(
"path_to_file.parquet",
time = "time",
key = "user")
# Get the count of events associated with each user over time, as a dataframe.
data.count().run().to_pandas()
```
Connect to existing data in streams or databases, or load data using Python.
Wherever your events are stored, Kaskada can help you process them.

Build temporal queries and process the results using Python.
Connect straight to your visualizations, dashboards or machine learning systems.

Kaskada lets you do it all in one place.

(get_started)=
# Get Started

With no infrastructure to deploy, get started processing events immediately.
Check out the [Quick Start](quickstart) now!

(local-and-distributed)=
# Local, Remote and Distributed

Fast enough to run locally, Kaskada makes it easy to build and test your real-time queries.

Built for the cloud and supporting partitioned and distributed execution, Kaskada scales to the volume and throughput you need.


(real_time_and_historic)=
# Real-time and Historic

Process events in real-time as they arrive.
Backfill materializations by starting with history and switching to the stream.

(time-travel)=
# Time Travel
Compute temporal joins at the correct times, without risk of leakage.

```{toctree}
:hidden:
Expand All @@ -43,6 +92,14 @@ concepts
examples/index
```

```{toctree}
:caption: User Guide
:hidden:
:maxdepth: 1
guide/introduction
```

```{toctree}
:caption: Reference
:hidden:
Expand Down
11 changes: 6 additions & 5 deletions sparrow-py/noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,15 @@ def xdoctest(session: Session) -> None:
session.run("python", "-m", "xdoctest", *args)

# Dependencies for the documentation tooling.
# NOTE(review): presumably installed by the docs sessions defined below; the
# usage is not visible in this excerpt -- confirm.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated, which would merge two package names into one.
DOCS_DEPS = [
    "myst-nb",
    "myst-parser",
    "pandas",
    "pyarrow",
    "sphinx-autobuild",
    "sphinx-book-theme",
    "sphinx-copybutton",
    "sphinx-design",
    "sphinx",
]

@session(name="docs-build", python=python_versions[0])
Expand Down
Loading

0 comments on commit 6bd484c

Please sign in to comment.