From 1fb920f282151c95e57c467dc7021b73fcaafa4d Mon Sep 17 00:00:00 2001 From: Ben Chambers <35960+bjchambers@users.noreply.github.com> Date: Thu, 17 Aug 2023 02:30:06 -0700 Subject: [PATCH] docs: Stub more doc content (#668) Pulling in stuff from existing docs and past drafts. --- python/docs/source/concepts.md | 1 - python/docs/source/conf.py | 5 + python/docs/source/guide/aggregation.md | 3 + python/docs/source/guide/data_types.md | 108 ++++++++++++++++++ python/docs/source/guide/entities.md | 61 ++++++++++ .../guide/{introduction.md => index.md} | 25 ++-- python/docs/source/guide/installation.md | 25 ++++ python/docs/source/guide/joins.md | 18 +++ python/docs/source/guide/sources.md | 1 + python/docs/source/guide/timestreams.md | 3 + python/docs/source/index.md | 13 +-- python/docs/source/quickstart.md | 12 +- .../docs/source/reference/timestream/index.md | 1 - python/docs/source/tour.md | 86 ++++++++++++++ python/docs/source/why.md | 15 +++ python/noxfile.py | 2 +- 16 files changed, 353 insertions(+), 26 deletions(-) delete mode 100644 python/docs/source/concepts.md create mode 100644 python/docs/source/guide/aggregation.md create mode 100644 python/docs/source/guide/data_types.md create mode 100644 python/docs/source/guide/entities.md rename python/docs/source/guide/{introduction.md => index.md} (69%) create mode 100644 python/docs/source/guide/installation.md create mode 100644 python/docs/source/guide/joins.md create mode 100644 python/docs/source/guide/sources.md create mode 100644 python/docs/source/guide/timestreams.md create mode 100644 python/docs/source/tour.md create mode 100644 python/docs/source/why.md diff --git a/python/docs/source/concepts.md b/python/docs/source/concepts.md deleted file mode 100644 index 639d88086..000000000 --- a/python/docs/source/concepts.md +++ /dev/null @@ -1 +0,0 @@ -# Concepts \ No newline at end of file diff --git a/python/docs/source/conf.py b/python/docs/source/conf.py index c429967ea..5f2826bb5 100644 --- 
a/python/docs/source/conf.py +++ b/python/docs/source/conf.py @@ -39,6 +39,10 @@ "use_issues_button": True, "repository_branch": "main", "path_to_docs": "kaskada/docs/source", + "announcement": ( + "This describes the next version of Kaskada. " + "It is currently available as an alpha release." + ), "icon_links": [ { "name": "GitHub", @@ -54,6 +58,7 @@ ], "primary_sidebar_end": ["indices.html"], "show_toc_level": 2, + "show_nav_level": 2, } templates_path = ["_templates"] diff --git a/python/docs/source/guide/aggregation.md b/python/docs/source/guide/aggregation.md new file mode 100644 index 000000000..79eba855a --- /dev/null +++ b/python/docs/source/guide/aggregation.md @@ -0,0 +1,3 @@ +# Aggregation + +## Windowing \ No newline at end of file diff --git a/python/docs/source/guide/data_types.md b/python/docs/source/guide/data_types.md new file mode 100644 index 000000000..bb60a7dd0 --- /dev/null +++ b/python/docs/source/guide/data_types.md @@ -0,0 +1,108 @@ +# Data Types + +Kaskada operates on typed Timestreams. +Similar to how every Pandas `DataFrame` has an associated `dtype`, every Kaskada `Timestream` has an associated type. +The set of supported types is based on the types supported by [Apache Arrow](https://arrow.apache.org/). + +Each `Timestream` contains points of the corresponding type. +We'll often say that the "type" of a `Timestream` is the type of the values it contains. + +Kaskada's type system describes several kinds of values. +Scalar types correspond to simple values, such as the string `"hello"` or the integer `57`. +They correspond to a stream containing values of the given type, or `null`. +Composite types are created from other types. +For instance, records may be created using scalar and other composite types as fields. +An expression producing a record type is a stream that produces a value of the given record type or `null`. + +## Scalar Types + +Scalar types include booleans, numbers, strings, timestamps, durations and calendar intervals. 
+ +:::{list-table} Scalar Types +:widths: 1, 3 +:header-rows: 1 + +- * Types + * Description +- * `bool` + * Booleans represent true or false. + + Examples: `true`, `false`. +- * `u8`, `u16`, `u32`, `u64` + * Unsigned integer numbers of the specified bit width. + + Examples: `0`, `1`, `1000` +- * `i8`, `i16`, `i32`, `i64` + * Signed integer numbers of the specified bit width. + + Examples: `0`, `1`, `-100` +- * `f32`, `f64` + * Floating point numbers of the specified bit width. + + Examples: `0`, `1`, `-100`, `1000`, `0.0`, `-1.0`, `-100837.631`. +- * `str` + * Unicode strings. + + Examples: `"hello", "hi 'bob'"`. + +- * `timestamp_s`, `timestamp_ms`, `timestamp_us`, `timestamp_ns` + * Points in time relative to the Unix Epoch (00:00:00 UTC on January 1, 1970). + Time unit may be seconds (s), milliseconds (ms), microseconds (us) or nanoseconds (ns). + + Examples: `1639595174 as timestamp_s` +- * `duration_s`, `duration_ms`, `duration_us`, `duration_ns` + * A duration of a fixed amount of a specific time unit. + Time unit may be seconds (s), milliseconds (ms), microseconds (us) or nanoseconds (ns). + + Examples: `-100 as duration_ms` +- * `interval_days`, `interval_months` + * A calendar interval corresponding to the given amount of the corresponding time. + The length of an interval depends on the point in time it is added to. + For instance, adding 1 `interval_month` to a timestamp will shift to the same day of the next month. + + Examples: `1 as interval_days`, `-100 as interval_months` +::: + +## Record Types + +Records allow combining 1 or more values of potentially different types into a single value. +Records are unnamed - any two records with the same set of field names and value types are considered equal. Fields within a record may have different types. +Field names must start with a letter. + +For example, `{name: string, age: u32 }` is a record type with two fields and `{name: 'Ben', age: 33 }` is a corresponding value. + +NOTE: Record types may be nested. 
+ +## Type Coercion +Kaskada implicitly coerces numeric types when different kinds of numbers are combined. +For example, adding a 64-bit signed integer value to a 32-bit floating point value produces a 64-bit floating point value. + +Type coercion will never produce an integer overflow or reduction in numeric precision. +If needed, such conversions must be explicitly specified using `as`. + +The coercion rules can be summarized with the following rules: + +1. Unsigned integers can be widened: `u8` ⇨ `u16` ⇨ `u32` ⇨ `u64`. +2. Integers can be widened: `i8` ⇨ `i16` ⇨ `i32` ⇨ `i64`. +3. Floating point numbers can be widened: `f16` ⇨ `f32` ⇨ `f64`. +4. Unsigned integers can be promoted to the next wider integer `u8` ⇨ `i16`, `u16` ⇨ `i32`, `u32` ⇨ `i64`. +5. All numbers may be converted to `f64`. +6. Strings may be implicitly converted to timestamps by attempting to parse them as RFC3339 values. +The timestamp will be null for strings that don't successfully parse. + +One aspect of the coercion rules is that when an operation is applied to two different numeric types the result may be a third type which they may both be coerced to. +The type promotion table shows the type resulting from a binary operation involving two different numeric types. 
+ +| | `u8` | `u16` | `u32` | `u64` | `i8` | `i16` | `i32` | `i64` | `f16` | `f32` | `f64` | +| --------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | +| **`u8`** | `u8` | `u16` | `u32` | `u64` | `i16` | `i16` | `i32` | `i64` | `f16` | `f32` | `f64` | +| **`u16`** | `u16` | `u16` | `u32` | `u64` | `i32` | `i32` | `i32` | `i64` | `f16` | `f32` | `f64` | +| **`u32`** | `u32` | `u32` | `u32` | `u64` | `i64` | `i64` | `i64` | `i64` | `f32` | `f32` | `f64` | +| **`u64`** | `u64` | `u64` | `u64` | `u64` | `f64` | `f64` | `f64` | `f64` | `f64` | `f64` | `f64` | +| **`i8`** | `i16` | `i32` | `i64` | `f64` | `i8` | `i16` | `i32` | `i64` | `f16` | `f32` | `f64` | +| **`i16`** | `i16` | `i32` | `i64` | `f64` | `i16` | `i16` | `i32` | `i64` | `f16` | `f32` | `f64` | +| **`i32`** | `i32` | `i32` | `i64` | `f64` | `i32` | `i32` | `i32` | `i64` | `f16` | `f32` | `f64` | +| **`i64`** | `i64` | `i64` | `i64` | `f64` | `i64` | `i64` | `i64` | `i64` | `f16` | `f32` | `f64` | +| **`f16`** | `f16` | `f16` | `f16` | `f16` | `f16` | `f16` | `f16` | `f16` | `f16` | `f32` | `f64` | +| **`f32`** | `f32` | `f32` | `f32` | `f32` | `f32` | `f32` | `f32` | `f32` | `f32` | `f32` | `f64` | +| **`f64`** | `f64` | `f64` | `f64` | `f64` | `f64` | `f64` | `f64` | `f64` | `f64` | `f64` | `f64` | \ No newline at end of file diff --git a/python/docs/source/guide/entities.md b/python/docs/source/guide/entities.md new file mode 100644 index 000000000..79f4a9cf9 --- /dev/null +++ b/python/docs/source/guide/entities.md @@ -0,0 +1,61 @@ +# Entities and Grouping + +Entities organize data for use in feature engineering. +They describe the particular objects that a prediction will be made for. +The result of a feature computation is a _feature vector_ for each entity at various points in time. + +## What is an Entity? +Entities represent the categories or "nouns" associated with the data. 
+They can generally be thought of as any category of object related to the events being processed. +For example, when manipulating purchase events, there may be entities for the customers, vendors and items being purchased. +Each purchase event may be related to a customer, a vendor, and one or more items. + +If something can be given a name or other unique identifier, it can likely be used as an entity. +In a relational database, an entity would be anything that is identified by the same key in a set of tables. + +## What is an Entity Key? +An entity kind is a category of objects, for example customer or vendor. +An entity key identifies a unique instance of that category -- a `customer_id` or a `vendor_id`. + +One may think of an entity as a table containing instances -- or rows -- of that type of entity. +The entity key would be the primary key of that table. + +The following table shows some example entities and possible keys. +Many of the example instances may not be suitable for use as the entity key, for the same reason you wouldn't use them as a primary key. +For example, using `Vancouver` to identify cities would lead to ambiguity between Vancouver in British Columbia and Vancouver in Washington State. +In these cases, you'd likely use some other identifier for instances. +Others may be useful, such as using the airport code. + +:::{list-table} Example Entities and corresponding keys. +:header-rows: 1 + +* - Example Entity + - Example Entity Instance +* - Houses + - 1600 Pennsylvania Avenue +* - Airports + - SEA +* - Customers + - John Doe +* - City + - Vancouver +* - State + - Washington +::: + +## Entities and Aggregation + +Many, if not all, Kaskada queries involve aggregating events to produce values. +Entities provide an implicit grouping for the aggregation. +When we write `sum(Purchases.amount)` it is an aggregation that returns the sum of purchases made _by each entity_. 
+This is helpful since the _feature vector_ for an entity will depend only on events related to that entity. + +```{todo} +Example of grouped streams and aggregation +``` + +## Joining + +Joining with the same entity happens automatically. +Joining with other entities (and even other kinds of entities) is done using `lookup`. +See [Joins](joins.md) for more information. \ No newline at end of file diff --git a/python/docs/source/guide/introduction.md b/python/docs/source/guide/index.md similarity index 69% rename from python/docs/source/guide/introduction.md rename to python/docs/source/guide/index.md index 8317214c2..bbccb4e5c 100644 --- a/python/docs/source/guide/introduction.md +++ b/python/docs/source/guide/index.md @@ -1,4 +1,4 @@ -# Introduction +# User Guide Understanding and reacting to the world in real-time requires understanding what is happening _now_ in the context of what happened in the past. You need the ability to understand if what just happened is unusual, how it relates to what happened previously, and how it relates to other things that are happening at the same time. @@ -12,14 +12,10 @@ Use time-travel to compute training examples from historic data and understand h ## What are "Timestreams"? -A [Timestream](../reference/timestream/index) describes how a value changes over time. In the same way that SQL -queries transform tables and graph queries transform nodes and edges, -Kaskada queries transform Timestreams. +A [Timestream](timestreams) describes how a value changes over time. +In the same way that SQL queries transform tables and graph queries transform nodes and edges, Kaskada queries transform Timestreams. -In comparison to a timeseries which often contains simple values (e.g., numeric -observations) defined at fixed, periodic times (i.e., every minute), a Timestream -contains any kind of data (records or collections as well as primitives) and may -be defined at arbitrary times corresponding to when the events occur. 
+In comparison to a timeseries which often contains simple values (e.g., numeric observations) defined at fixed, periodic times (e.g., every minute), a Timestream contains any kind of data (records or collections as well as primitives) and may be defined at arbitrary times corresponding to when the events occur. ## Getting Started with Timestreams @@ -35,4 +31,17 @@ data = t.sources.Parquet.from_file( key = "user") # Get the count of events associated with each user over time, as a dataframe. data.count().run().to_pandas() +``` + +```{toctree} +:hidden: +:maxdepth: 2 + +installation +timestreams +data_types +entities +aggregation +joins +sources ``` \ No newline at end of file diff --git a/python/docs/source/guide/installation.md b/python/docs/source/guide/installation.md new file mode 100644 index 000000000..5882f92f2 --- /dev/null +++ b/python/docs/source/guide/installation.md @@ -0,0 +1,25 @@ +# Installation + +To install Kaskada, you need to be using Python >= 3.8. +We suggest using 3.11 or newer, since that provides more precise error locations. + +```{code-block} bash +:caption: Installing Kaskada +pip install "kaskada>=0.6.0-a.0" +``` + +```{warning} +This version of Kaskada is currently a pre-release, as indicated by the `-a.0` suffix. +It will not be installed by default if you `pip install kaskada`. +You need to either use `pip install --pre kaskada` or specify a specific version, as shown in the example. +``` + +```{admonition} Pip and pip3 and permissions +:class: tip + +Depending on your Python installation and configuration you may have `pip3` instead of `pip` available in your terminal. +If you do have `pip3` replace pip with `pip3` in your command, i.e., `pip3 install kaskada`. + +If you get a permission error when running the `pip` command, you may need to run as an administrator using `sudo pip install kaskada`. 
+If you don't have administrator access (e.g., in Google Colab, or other hosted environments) you may use `pip`’s `--user` flag to install the package in your user directory. +``` diff --git a/python/docs/source/guide/joins.md b/python/docs/source/guide/joins.md new file mode 100644 index 000000000..ed46ab1c9 --- /dev/null +++ b/python/docs/source/guide/joins.md @@ -0,0 +1,18 @@ +# Joins + + +## Domains and Implicit Joins + +It is sometimes useful to consider the _domain_ of an expression. +This corresponds to the points in time and entities associated with the points in the expression. +For discrete timestreams, this corresponds to the points at which those values occur. +For continuous timestreams, this corresponds to the points at which the value changes. + +Whenever expressions with two (or more) different domains are used in the same expression they are implicitly joined. +The join is an outer join that contains an event if either (any) of the input domains contained an event. +For any input table that is continuous, the join is `as of` the time of the output, taking the latest value from that input. + + +## Implicit Joins + +## Explicit Lookups \ No newline at end of file diff --git a/python/docs/source/guide/sources.md b/python/docs/source/guide/sources.md new file mode 100644 index 000000000..24cf12ed2 --- /dev/null +++ b/python/docs/source/guide/sources.md @@ -0,0 +1 @@ +# Sources \ No newline at end of file diff --git a/python/docs/source/guide/timestreams.md b/python/docs/source/guide/timestreams.md new file mode 100644 index 000000000..e11a86a64 --- /dev/null +++ b/python/docs/source/guide/timestreams.md @@ -0,0 +1,3 @@ +# Timestreams + +## Continuity \ No newline at end of file diff --git a/python/docs/source/index.md b/python/docs/source/index.md index 1f7877532..a3820c44f 100644 --- a/python/docs/source/index.md +++ b/python/docs/source/index.md @@ -86,18 +86,13 @@ Compute temporal joins at the correct times, without risk of leakage. 
```{toctree} :hidden: +:maxdepth: 3 +why +tour quickstart -concepts examples/index -``` - -```{toctree} -:caption: User Guide -:hidden: -:maxdepth: 1 - -guide/introduction +guide/index ``` ```{toctree} diff --git a/python/docs/source/quickstart.md b/python/docs/source/quickstart.md index 742766ffe..95e019f51 100644 --- a/python/docs/source/quickstart.md +++ b/python/docs/source/quickstart.md @@ -20,12 +20,12 @@ kd.init_session() content = "\n".join( [ "time,key,m,n", - "1996-12-19T16:39:57-08:00,A,5,10", - "1996-12-19T16:39:58-08:00,B,24,3", - "1996-12-19T16:39:59-08:00,A,17,6", - "1996-12-19T16:40:00-08:00,A,,9", - "1996-12-19T16:40:01-08:00,A,12,", - "1996-12-19T16:40:02-08:00,A,,", + "1996-12-19T16:39:57,A,5,10", + "1996-12-19T16:39:58,B,24,3", + "1996-12-19T16:39:59,A,17,6", + "1996-12-19T16:40:00,A,,9", + "1996-12-19T16:40:01,A,12,", + "1996-12-19T16:40:02,A,,", ] ) source = kd.sources.CsvString(content, time_column_name="time", key_column_name="key") diff --git a/python/docs/source/reference/timestream/index.md b/python/docs/source/reference/timestream/index.md index 17a020ca0..a6fc03c99 100644 --- a/python/docs/source/reference/timestream/index.md +++ b/python/docs/source/reference/timestream/index.md @@ -10,7 +10,6 @@ ```{toctree} :hidden: -:maxdepth: 3 aggregation arithmetic diff --git a/python/docs/source/tour.md b/python/docs/source/tour.md new file mode 100644 index 000000000..9db2639a7 --- /dev/null +++ b/python/docs/source/tour.md @@ -0,0 +1,86 @@ +% Level: Beginner +% Goal: Overview of the key features of Kaskada focused on explaining *why* you want them. +% Audience: Someone who has read the landing page and wants to understand what Kaskada can do for them. + +# Tour of Kaskada + +This provides an overview of the key features in Kaskada that enable feature engineering on event-based data. +The [Quick Start](quickstart) has details on how you can quickly get started running Kaskada queries. +For a more complete explanation, see the User Guide. 
+ +## Events and Aggregations + +Every Kaskada query operates on one or more _sources_ containing events. +Every event in a source happens at a specific point in time and relates to a specific entity. +A source contains events with the same schema. +Often, each source represents a specific kind of event, such as a login event or purchase. + +It is often convenient to picture temporal data as a sequence of timestamped events. +A natural question to ask about the purchases is the total--or `sum`--of all purchases made. +This is accomplished by _aggregating_ the events. +The results of an aggregation change over time as additional events occur. + +```{todo} +Port an example showing timestreams and aggregations. +``` + +The User Guide has [more details on aggregation](guide/aggregation.md), including how to use windows to control which events are aggregated. + +## Discrete and Continuous +We say that events (and values derived from them) are _discrete_ because they occur at specific points in time, +and the results of the aggregation are [_continuous_](guide/timestreams.md#continuity). +In the example, after the purchase with amount 13 the sum was 20. +And it _continued_ to be 20 at every point in time until the next purchase was made, with amount 4. +A continuous value is inclusive of the event that causes the value to change and exclusive of the next change. + +Thus, an aggregation at a given point in time reflects all events that have happened up to (and including) that point in time. +The concept of continuity applies to many other operations in Kaskada, not just aggregations. +This is part of what we mean when we say that Kaskada is a temporal query language. + +## Grouping +Another property of Kaskada is that events are implicitly grouped by _entity_. +In the previous example, we assumed that all purchases were made by the same user. +When the purchases are made by multiple users, there is a natural grouping for the aggregation. 
+When computing a machine learning feature such as "total purchases", we usually wish to aggregate the events related to a specific user or entity. + +One way to understand this grouping is as a separate stream associated with each entity. +The stream of purchases for each user may be shown separately, as we do here, or it may be pictured flattened into a single stream keyed by user. +The idea of grouped streams as separate, per-entity streams is often useful for understanding the behavior of Kaskada Timestreams. + +```{todo} +Add example of multiple entity aggregation. +``` + +The User Guide has [more details on grouping](guide/entities.md), including how to change the grouping of a Timestream. + +## History and Snapshots + +Since the Timestream describes how values are computed at every point in time, there are several useful ways they may be output. + +For training a model, it is often useful to output historic values matching some `filter`. +These historic points can then be used as training examples, allowing the model to be trained on past points. +This historic output is also useful for visualizing a Timestream at multiple points. + +For serving a model, it is often useful to output the value of a Timestream for every entity at a specific point in time. +This is most often used to output a snapshot at the current time. + +For both kinds of output, it is also useful to be able to select only the points after a specific time. +This would filter out points from the history, or limit the snapshot to only those entities which have changed. + +## Windowed Aggregation + +```{todo} +Update to reflect actual syntax. Include example. +``` + +In addition to the default behavior of aggregating over all events up to a given time, aggregations may be performed over specific windows. +For example, `hourly()` describes periodic windows of an hour. 
+The aggregation, `sum(Purchases, window=hourly())` would produce the cumulative sum of purchases made since the beginning of the hour. +For example, if there were purchases at 8:45 AM, 9:15 AM and 9:25 AM and 10:02 AM, then the result at 9:25 AM is the sum from 9:00 AM to 9:25 AM, which would include only the events at 9:15 AM and 9:25 AM. + +A non-cumulative windowed aggregation produces values only at the end of a window. +For instance, `sum(Purchases, window=hourly(), cumulative=false)` will produce the sum for the past hour. +With the purchases in the previous example, this would mean that at 9:00 AM an event is produced containing the amount of the purchase at 8:45 AM, and at 10:00 AM an event is produced containing the sum of the purchases at 9:15 AM and 9:25 AM. +A window must be specified when using a non-cumulative aggregation. + +The User Guide [on Aggregation](guide/aggregation.md#windowing) has more information on windowing. \ No newline at end of file diff --git a/python/docs/source/why.md b/python/docs/source/why.md new file mode 100644 index 000000000..ff2e17b6d --- /dev/null +++ b/python/docs/source/why.md @@ -0,0 +1,15 @@ +# Why Kaskada? + +Kaskada is a library for executing temporal queries over event-based data. +An "event" can be any fact about the world associated with a time. +For example, a user signing up for a service, or a customer purchasing a product. +As additional events occur the computed values may change as well. + +Traditional data processing systems are designed to answer questions about the current state of a dataset. +For instance, "how many purchases has a given user made?" +Over time, the user makes additional purchases and the answer *should* change. +With these traditional data processing systems, the answer changes based on when it is asked. + +With Kaskada, the query "how many purchases has a given user made?" is expressed as a _Timestream_. +This represents how the result of that query changes over time for each user. 
+Kaskada makes it easy to combine Timestreams to produce a new Timestream -- joining points from each input as needed. \ No newline at end of file diff --git a/python/noxfile.py b/python/noxfile.py index a904b0b6f..bc9b0d3cd 100644 --- a/python/noxfile.py +++ b/python/noxfile.py @@ -119,7 +119,7 @@ def xdoctest(session: nox.Session) -> None: @nox.session(name="docs-build", python=python_versions[0]) def docs_build(session: nox.Session) -> None: """Build the documentation.""" - args = session.posargs or ["docs/source", "docs/_build", "-j", "auto"] + args = session.posargs or ["docs/source", "docs/_build", "-j", "auto", "-W"] if not session.posargs and "FORCE_COLOR" in os.environ: args.insert(0, "--color")