diff --git a/sparrow-py/docs/source/_extensions/gallery_directive.py b/sparrow-py/docs/source/_extensions/gallery_directive.py
new file mode 100644
index 000000000..878c17c33
--- /dev/null
+++ b/sparrow-py/docs/source/_extensions/gallery_directive.py
@@ -0,0 +1,156 @@
+"""A directive to generate a gallery of images from structured data.
+
+Generating a gallery of images that are all the same size is a common
+pattern in documentation, and this can be cumbersome if the gallery is
+generated programmatically. This directive wraps this particular use-case
+in a helper-directive to generate it with a single YAML configuration file.
+
+It currently exists for maintainers of the pydata-sphinx-theme,
+but might be abstracted into a standalone package if it proves useful.
+"""
+from pathlib import Path
+from typing import Any, Dict, List
+
+from docutils import nodes
+from docutils.parsers.rst import directives
+from sphinx.application import Sphinx
+from sphinx.util import logging
+from sphinx.util.docutils import SphinxDirective
+from yaml import safe_load
+
+logger = logging.getLogger(__name__)
+
+
+TEMPLATE_GRID = """
+`````{{grid}} {columns}
+{options}
+
+{content}
+
+`````
+"""
+
+GRID_CARD = """
+````{{grid-item-card}} {title}
+{options}
+
+{content}
+````
+"""
+
+
+class GalleryGridDirective(SphinxDirective):
+    """A directive to show a gallery of images and links in a Bootstrap grid.
+
+    The grid can be generated from a YAML file that contains a list of items, or
+    from the content of the directive (also formatted in YAML). Use the parameter
+    "class-card" to add an additional CSS class to all cards. When specifying the grid
+    items, you can use all parameters from "grid-item-card" directive to customize
+    individual cards + ["image", "header", "content", "title"].
+
+    Danger:
+        This directive can only be used in the context of a Myst documentation page as
+        the templates use Markdown flavored formatting.
+    """
+
+    name = "gallery-grid"
+    has_content = True
+    required_arguments = 0
+    optional_arguments = 1
+    final_argument_whitespace = True
+    option_spec = {
+        # A class to be added to the resulting container
+        "grid-columns": directives.unchanged,
+        "class-container": directives.unchanged,
+        "class-card": directives.unchanged,
+    }
+
+    def run(self) -> List[nodes.Node]:
+        """Create the gallery grid.
+
+        The card definitions are read from the YAML file named by the optional
+        directive argument (resolved relative to the current document), or from
+        the directive content when no argument is given. Each item becomes one
+        ``grid-item-card`` and all cards are wrapped in a single ``grid``.
+
+        Returns:
+            A one-element list holding the node produced by parsing the
+            generated grid markup, or a paragraph reporting the problem when
+            the YAML file cannot be found.
+        """
+        if self.arguments:
+            # If an argument is given, assume it's a path to a YAML file
+            # Parse it and load it into the directive content
+            path_data_rel = Path(self.arguments[0])
+            path_doc, _ = self.get_source_info()
+            path_doc = Path(path_doc).parent
+            path_data = (path_doc / path_data_rel).resolve()
+            if not path_data.exists():
+                # A directive must return a list of nodes, so report the missing
+                # file in the rendered page as well as in the build log.
+                logger.warning("Could not find grid data at %s.", path_data)
+                return [nodes.paragraph(text=f"No grid data found at {path_data}.")]
+            yaml_string = path_data.read_text(encoding="utf-8")
+        else:
+            yaml_string = "\n".join(self.content)
+
+        # Generate a card item for each entry of the YAML list; an empty
+        # document loads as ``None``, which is treated as "no cards".
+        grid_items = []
+        for item in safe_load(yaml_string) or []:
+
+            # remove parameters that are not needed for the card options
+            title = item.pop("title", "")
+
+            # build the content of the card using some extra parameters
+            header = f"{item.pop('header')} \n^^^ \n" if "header" in item else ""
+            image = f"![image]({item.pop('image')}) \n" if "image" in item else ""
+            content = f"{item.pop('content')} \n" if "content" in item else ""
+
+            # optional parameter that influence all cards
+            if "class-card" in self.options:
+                item["class-card"] = self.options["class-card"]
+
+            loc_options_str = "\n".join(f":{k}: {v}" for k, v in item.items()) + " \n"
+
+            card = GRID_CARD.format(
+                options=loc_options_str, content=header + image + content, title=title
+            )
+            grid_items.append(card)
+
+        # Parse the template with Sphinx Design to create an output container
+        # Prep the options for the template grid
+        class_ = "gallery-directive" + f' {self.options.get("class-container", "")}'
+        options = {"gutter": 2, "class-container": class_}
+        options_str = "\n".join(f":{k}: {v}" for k, v in options.items())
+
+        # Create the directive string for the grid
+        grid_directive = TEMPLATE_GRID.format(
+            columns=self.options.get("grid-columns", "1 2 3 4"),
+            options=options_str,
+            content="\n".join(grid_items),
+        )
+
+        # Parse content as a directive so Sphinx Design processes it
+        container = nodes.container()
+        self.state.nested_parse([grid_directive], 0, container)
+
+        # Sphinx Design outputs a container too, so just use that
+        return [container.children[0]]
+
+
+def setup(app: Sphinx) -> Dict[str, Any]:
+    """Add custom configuration to sphinx app.
+
+    Args:
+        app: the Sphinx application
+
+    Returns:
+        the 2 parallel parameters set to ``True``.
+    """
+    app.add_directive("gallery-grid", GalleryGridDirective)
+
+    return {
+        "parallel_read_safe": True,
+        "parallel_write_safe": True,
+    }
\ No newline at end of file
diff --git a/sparrow-py/docs/source/conf.py b/sparrow-py/docs/source/conf.py index 1fd5bcb8c..01b7667d5 100644 --- a/sparrow-py/docs/source/conf.py +++ b/sparrow-py/docs/source/conf.py @@ -2,6 +2,10 @@ from typing import Any from typing import Dict +from pathlib import Path +import sys +sys.path.append(str(Path(".").resolve())) + project = "sparrow-py" author = "Kaskada Contributors" copyright = "2023, Kaskada Contributors" @@ -11,9 +15,11 @@ "sphinx.ext.napoleon", "sphinx.ext.intersphinx", "sphinx.ext.todo", + "sphinx_design", # "myst_parser", "myst_nb", "sphinx_copybutton", + "_extensions.gallery_directive", ] autodoc_typehints = "description" language = "en" @@ -28,6 +34,7 @@ "use_repository_button": True, "use_source_button": True, "use_edit_page_button": True, + "home_page_in_toc": True, "use_issues_button": True, "repository_branch": "main", "path_to_docs": "sparrow-py/docs/source", @@ -44,8 +51,7 @@ "icon": "fa-brands fa-slack", }, ], - "show_nav_level": 3, - "show_toc_level": 2, +
"primary_sidebar_end": ["indices.html"], } templates_path = ["_templates"] diff --git a/sparrow-py/docs/source/guide/introduction.md b/sparrow-py/docs/source/guide/introduction.md new file mode 100644 index 000000000..8317214c2 --- /dev/null +++ b/sparrow-py/docs/source/guide/introduction.md @@ -0,0 +1,38 @@ +# Introduction + +Understanding and reacting to the world in real-time requires understanding what is happening _now_ in the context of what happened in the past. +You need the ability to understand if what just happened is unusual, how it relates to what happened previously, and how it relates to other things that are happening at the same time. + +Kaskada processes events from streams and historic data sources to answer these questions in real-time. + +The power and convenience of Kaskada comes from a new abstraction: the Timestream. +Timestreams provide a declarative API like dataframes over the complete temporal context. +Easily combine multiple streams and reason about the complete sequence of events. +Use time-travel to compute training examples from historic data and understand how results change over time. + +## What are "Timestreams"? + +A [Timestream](../reference/timestream/index) describes how a value changes over time. In the same way that SQL +queries transform tables and graph queries transform nodes and edges, +Kaskada queries transform Timestreams. + +In comparison to a timeseries which often contains simple values (e.g., numeric +observations) defined at fixed, periodic times (i.e., every minute), a Timestream +contains any kind of data (records or collections as well as primitives) and may +be defined at arbitrary times corresponding to when the events occur. + +## Getting Started with Timestreams + +Getting started with Timestreams is as simple as `pip` installing the Python library, loading some data and running a query. + +```python +import timestreams as t + +# Read data from a Parquet file. 
+data = t.sources.Parquet.from_file( + "path_to_file.parquet", + time = "time", + key = "user") +# Get the count of events associated with each user over time, as a dataframe. +data.count().run().to_pandas() +``` \ No newline at end of file diff --git a/sparrow-py/docs/source/index.md b/sparrow-py/docs/source/index.md index c2431be87..9853e4aa1 100644 --- a/sparrow-py/docs/source/index.md +++ b/sparrow-py/docs/source/index.md @@ -1,39 +1,88 @@ --- hide-toc: true +html_theme.sidebar_secondary.remove: true +title: Kaskada Timestreams --- -# Kaskada Timestreams +
+ +

Event-processing without the fuss.

+
+

Real-time and historic event processing in Python. +

+
+
-```{include} ../../README.md -:start-after: -:end-before: +```{gallery-grid} +:grid-columns: 1 2 2 3 + +- header: "{fas}`timeline;pst-color-primary` Real-time processing for all events" + content: "Quickly process structured events so you can respond in real-time." + link: ".#stream" +- header: "{fab}`python;pst-color-primary` Python-native" + content: "Use Python so you can load data, process it, and train and serve models from one place." + link: ".#python" +- header: "{fas}`gauge-high;pst-color-primary` Get started immediately" + content: "No infrastructure to deploy lets you jump right in." + link: ".#get-started" + +- header: "{fas}`rocket;pst-color-primary` Local, Remote and Distributed" + content: "Develop and test locally. Deploy to Docker, K8s or a service for production." + link: ".#local-and-distributed" +- header: "{fas}`fast-forward;pst-color-primary` Real-time, Batch and Streaming" + content: "Execute large historic queries or materialize in real-time. Or both." + link: ".#real-time-and-historic" +- header: "{fas}`backward;pst-color-primary` Time-travel" + content: "Generate training examples from the past to predict the future." + link: ".#time-travel" ``` -## What are "Timestreams"? -A [Timestream](reference/timestream/index) describes how a value changes over time. In the same way that SQL -queries transform tables and graph queries transform nodes and edges, -Kaskada queries transform Timestreams. +* * * -In comparison to a timeseries which often contains simple values (e.g., numeric -observations) defined at fixed, periodic times (i.e., every minute), a Timestream -contains any kind of data (records or collections as well as primitives) and may -be defined at arbitrary times corresponding to when the events occur. +(stream)= +# Real-time event-processing -## Getting Started with Timestreams +Kaskada is built on Apache Arrow, providing an efficient, columnar representation of data. 
+The same approach is at the core of many analytic databases as well as Pandas and Polars. -Getting started with Timestreams is as simple as `pip` installing the Python library, loading some data and running a query. +Kaskada goes beyond the columnar representation by introducing the Timestream -- a columnar representation of events, ordered by time and grouped by key. +This representation is a perfect fit for all kinds of events, modern event streams as well as events stored in a database. +Specializing for Timestreams allows Kaskada to optimize temporal queries and execute them much faster. -```python -import timestreams as t +(python)= +# Python-native -# Read data from a Parquet file. -data = t.sources.Parquet.from_file( - "path_to_file.parquet", - time = "time", - key = "user") -# Get the count of events associated with each user over time, as a dataframe. -data.count().run().to_pandas() -``` +Connect to existing data in streams or databases, or load data using Python. +Wherever your events are stored, Kaskada can help you process them. + +Build temporal queries and process the results using Python. +Connect straight to your visualizations, dashboards or machine learning systems. + +Kaskada lets you do it all in one place. + +(get_started)= +# Get Started + +With no infrastructure to deploy, get started processing events immediately. +Check out the [Quick Start](quickstart) now! + +(local-and-distributed)= +# Local, Remote and Distributed + +Fast enough to run locally, Kaskada makes it easy to build and test your real-time queries. + +Built for the cloud and supporting partitioned and distributed execution, Kaskada scales to the volume and throughput you need. + + +(real_time_and_historic)= +# Real-time and Historic + +Process events in real-time as they arrive. +Backfill materializations by starting with history and switching to the stream. + +(time-travel)= +# Time Travel +Compute temporal joins at the correct times, without risk of leakage. 
```{toctree} :hidden: @@ -43,6 +92,14 @@ concepts examples/index ``` +```{toctree} +:caption: User Guide +:hidden: +:maxdepth: 1 + +guide/introduction +``` + ```{toctree} :caption: Reference :hidden: diff --git a/sparrow-py/noxfile.py b/sparrow-py/noxfile.py index 7a261d447..e95e870a8 100644 --- a/sparrow-py/noxfile.py +++ b/sparrow-py/noxfile.py @@ -153,14 +153,15 @@ def xdoctest(session: Session) -> None: session.run("python", "-m", "xdoctest", *args) DOCS_DEPS = [ - "sphinx", + "myst-nb", + "myst-parser", + "pandas", + "pyarrow", "sphinx-autobuild", "sphinx-book-theme", "sphinx-copybutton", - "myst-parser", - "myst-nb", - "pandas", - "pyarrow" + "sphinx-design", + "sphinx", ] @session(name="docs-build", python=python_versions[0]) diff --git a/sparrow-py/poetry.lock b/sparrow-py/poetry.lock index 4264e9ba2..2a16470b2 100644 --- a/sparrow-py/poetry.lock +++ b/sparrow-py/poetry.lock @@ -2402,6 +2402,29 @@ sphinx = ">=1.8" code-style = ["pre-commit (==2.12.1)"] rtd = ["ipython", "myst-nb", "sphinx", "sphinx-book-theme", "sphinx-examples"] +[[package]] +name = "sphinx-design" +version = "0.5.0" +description = "A sphinx extension for designing beautiful, view size responsive web components." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "sphinx_design-0.5.0-py3-none-any.whl", hash = "sha256:1af1267b4cea2eedd6724614f19dcc88fe2e15aff65d06b2f6252cee9c4f4c1e"}, + {file = "sphinx_design-0.5.0.tar.gz", hash = "sha256:e8e513acea6f92d15c6de3b34e954458f245b8e761b45b63950f65373352ab00"}, +] + +[package.dependencies] +sphinx = ">=5,<8" + +[package.extras] +code-style = ["pre-commit (>=3,<4)"] +rtd = ["myst-parser (>=1,<3)"] +testing = ["myst-parser (>=1,<3)", "pytest (>=7.1,<8.0)", "pytest-cov", "pytest-regressions"] +theme-furo = ["furo (>=2023.7.0,<2023.8.0)"] +theme-pydata = ["pydata-sphinx-theme (>=0.13.0,<0.14.0)"] +theme-rtd = ["sphinx-rtd-theme (>=1.0,<2.0)"] +theme-sbt = ["sphinx-book-theme (>=1.0,<2.0)"] + [[package]] name = "sphinxcontrib-applehelp" version = "1.0.4" @@ -2786,4 +2809,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.8,<4.0" -content-hash = "fe9b719716756057987c023df4d1d827057bb6825edb8fe82ef87fe3db1a108c" +content-hash = "308185d6b8056288b53c0a72689d6340515c1fe3ba9bbc5b3e7374829054e877" diff --git a/sparrow-py/pyproject.toml b/sparrow-py/pyproject.toml index 31c7590ce..fa87c6fc7 100644 --- a/sparrow-py/pyproject.toml +++ b/sparrow-py/pyproject.toml @@ -31,6 +31,7 @@ sphinx = ">=6.0.0" sphinx-autobuild = ">=2021.3.14" sphinx-book-theme = "^1.0.1" sphinx-copybutton = "^0.5.2" +sphinx-design = "^0.5.0" typeguard = ">=2.13.3" xdoctest = {extras = ["colors"], version = ">=0.15.10"} myst-parser = {version = ">=0.16.1"}