diff --git a/docs/conf.py b/docs/conf.py
index 2be3b0185f..49a20baf53 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -96,6 +96,10 @@
 # redirects
 redirects = {
+    "core_use_cases/index": "../flytesnacks/tutorials.html",
+    "core_use_cases/analytics": "../flytesnacks/tutorials.html",
+    "core_use_cases/data_engineering": "../flytesnacks/tutorials.html",
+    "core_use_cases/machine_learning": "../flytesnacks/tutorials.html",
     "flytesnacks/deprecated_integrations": "../deprecated_integrations/index.html",
     "flytesnacks/examples/bigquery_plugin/index": "../../../deprecated_integrations/bigquery_plugin/index.html",
     "flytesnacks/examples/bigquery_plugin/bigquery_plugin_example": "../../../deprecated_integrations/bigquery_plugin/biquery_plugin_example.html",
diff --git a/docs/core_use_cases/analytics.md b/docs/core_use_cases/analytics.md
deleted file mode 100644
index 71b5530c03..0000000000
--- a/docs/core_use_cases/analytics.md
+++ /dev/null
@@ -1,179 +0,0 @@
----
-kernelspec:
-  display_name: Python 3
-  language: python
-  name: python3
----
-
-(getting_started_analytics)=
-
-# Analytics
-
-Flyte is ideal for data cleaning, statistical summarization, and plotting
-because with `flytekit` you can leverage the rich Python ecosystem of data
-processing and visualization tools.
-
-## Cleaning data
-
-In this example, we analyze some COVID-19 vaccination data:
-
-```{code-cell} ipython3
-import pandas as pd
-import plotly
-import plotly.graph_objects as go
-from flytekit import Deck, task, workflow, Resources
-
-
-@task(requests=Resources(mem="1Gi"))
-def clean_data() -> pd.DataFrame:
-    """Clean the dataset."""
-    df = pd.read_csv("https://covid.ourworldindata.org/data/owid-covid-data.csv")
-    filled_df = (
-        df.sort_values(["people_vaccinated"], ascending=False)
-        .groupby("location")
-        .first()
-        .reset_index()
-    )[["location", "people_vaccinated", "population", "date"]]
-    return filled_df
-```
-
-As you can see, we're using `pandas` for data processing, and in the task
-below we use `plotly` to create a choropleth map of the percent of a country's
-population that has received at least one COVID-19 vaccination.
-
-## Rendering plots
-
-We can use {ref}`Flyte Decks ` to render a static HTML report
-of the map. In this case, we normalize `people_vaccinated` by the
-`population` count of each country:
-
-```{code-cell} ipython3
-@task(enable_deck=True)
-def plot(df: pd.DataFrame):
-    """Render a Choropleth map."""
-    df["text"] = df["location"] + "<br>" + "Last updated on: " + df["date"]
-    fig = go.Figure(
-        data=go.Choropleth(
-            locations=df["location"],
-            z=df["people_vaccinated"].astype(float) / df["population"].astype(float),
-            text=df["text"],
-            locationmode="country names",
-            colorscale="Blues",
-            autocolorscale=False,
-            reversescale=False,
-            marker_line_color="darkgray",
-            marker_line_width=0.5,
-            zmax=1,
-            zmin=0,
-        )
-    )
-
-    fig.update_layout(
-        title_text=(
-            "Percent of population with at least one dose of COVID-19 vaccine"
-        ),
-        geo_scope="world",
-        geo=dict(
-            showframe=False, showcoastlines=False, projection_type="equirectangular"
-        ),
-    )
-    Deck("Choropleth Map", plotly.io.to_html(fig))
-
-
-@workflow
-def analytics_workflow():
-    """Prepare a data analytics workflow."""
-    plot(df=clean_data())
-```
-
-Running this workflow, we get an interactive plot, courtesy of `plotly`:
-
-```{code-cell} ipython3
----
-tags: [remove-input]
----
-
-# This is an unrendered cell, used to capture the logs in order to render the
-# Flyte Decks directly in the docs.
-import logging
-import os
-import re
-from pythonjsonlogger import jsonlogger
-from IPython.display import HTML
-
-
-class DeckFilter(logging.Filter):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.formatter = jsonlogger.JsonFormatter(
-            fmt="%(asctime)s %(name)s %(levelname)s %(message)s"
-        )
-        self.logs = []
-        self.deck_files = {}
-
-    def filter(self, record):
-        patt = "(.+) task creates flyte deck html to (.+/deck.html)"
-        msg = record.getMessage()
-        matches = re.match(patt, msg)
-
-        if msg == "Connection error. Skip stats collection.":
-            return False
-
-        if matches:
-            task, filepath = matches.group(1), matches.group(2)
-            self.logs.append(self.formatter.format(record))
-            self.deck_files[task] = re.sub("^file://", "", filepath)
-        return False
-
-
-logger = logging.getLogger("flytekit")
-logger.setLevel(20)  # 20 == logging.INFO
-
-deck_filter = DeckFilter()
-logger.addFilter(deck_filter)
-```
-
-```{code-cell} ipython3
-analytics_workflow()
-```
-
-```{code-cell} ipython3
----
-tags: [remove-input]
----
-
-import os
-import shutil
-from pathlib import Path
-
-def cp_deck(src):
-    src = Path(src)
-    target = Path.cwd() / "_flyte_decks" / src.parent.name
-    target.mkdir(parents=True, exist_ok=True)
-    shutil.copy(src, target)
-    return target / "deck.html"
-
-logger.removeFilter(deck_filter)
-HTML(filename=cp_deck(deck_filter.deck_files["plot"]))
-```
-
-## Custom Flyte deck renderers
-
-You can also create your own {ref}`custom Flyte Deck renderers `
-to visualize data with any plotting/visualization library of your choice, as
-long as you can render HTML for the objects of interest.
-
-```{important}
-Prefer other data processing frameworks? Flyte ships with
-[Polars](https://github.com/flyteorg/flytekit/tree/master/plugins/flytekit-polars),
-{ref}`Dask `, {ref}`Modin `, {ref}`Spark `,
-[Vaex](https://github.com/flyteorg/flytekit/tree/master/plugins/flytekit-vaex),
-and [DBT](https://github.com/flyteorg/flytekit/tree/master/plugins/flytekit-dbt)
-integrations.
-
-If you need to connect to a database, Flyte provides first-party
-support for {ref}`AWS Athena `, {ref}`Google BigQuery `,
-{ref}`Snowflake `, {ref}`SQLAlchemy `, and
-{ref}`SQLite3 `.
-```
diff --git a/docs/core_use_cases/data_engineering.md b/docs/core_use_cases/data_engineering.md
deleted file mode 100644
index 9cbfca430c..0000000000
--- a/docs/core_use_cases/data_engineering.md
+++ /dev/null
@@ -1,175 +0,0 @@
----
-kernelspec:
-  display_name: Python 3
-  language: python
-  name: python3
----
-
-(getting_started_data_engineering)=
-
-# Data engineering
-
-Flyte is well-suited for data engineering use cases, where you can interleave
-SQL queries with data processing logic implemented in Python, using whichever
-data processing tools you prefer.
-
-In this example, we create an ETL workflow that extracts data from a public
-[RNA database](https://rnacentral.org/help/public-database), performs some simple
-transforms on the data, and loads it into a CSV file.
-
-## Extract
-
-First, we define an `extract_task` task using the
-{ref}`flytekitplugins-sqlalchemy ` plugin, which provides an
-interface for performing SQL queries via the
-{py:class}`~flytekitplugins.sqlalchemy.SQLAlchemyTask`
-and {py:class}`~flytekitplugins.sqlalchemy.SQLAlchemyConfig` classes.
-
-```{code-cell} ipython3
-import os
-
-import flytekit
-import pandas as pd
-from flytekit import Resources, kwtypes, task, workflow
-from flytekit.types.file import CSVFile
-from flytekitplugins.sqlalchemy import SQLAlchemyConfig, SQLAlchemyTask
-
-DATABASE_URI = (
-    "postgresql://reader:NWDMCE5xdipIjRrp@hh-pgsql-public.ebi.ac.uk:5432/pfmegrnargs"
-)
-
-extract_task = SQLAlchemyTask(
-    "extract_rna",
-    query_template=(
-        "select len as sequence_length, timestamp from rna "
-        "where len >= {{ .inputs.min_length }} and len <= {{ .inputs.max_length }} "
-        "limit {{ .inputs.limit }}"
-    ),
-    inputs=kwtypes(min_length=int, max_length=int, limit=int),
-    output_schema_type=pd.DataFrame,
-    task_config=SQLAlchemyConfig(uri=DATABASE_URI),
-)
-```
-
-You can parameterize the query by formatting the `query_template` with
-`{{ .inputs.<input_name> }}` placeholders; the `inputs` keyword-type
-specification maps task argument names to their expected types.
-
-```{important}
-You can request access to secrets via the `secret_requests` argument of the
-{py:class}`~flytekitplugins.sqlalchemy.SQLAlchemyTask` constructor, then
-pass a `secret_connect_args` argument to the
-{py:class}`~flytekitplugins.sqlalchemy.SQLAlchemyConfig` constructor, assuming
-that you have connection credentials available in the configured
-{ref}`Secrets Management System `, which is K8s by default.
-```
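
The important-block above names `secret_requests` and `secret_connect_args` but shows no code, so here is a minimal sketch of credential-backed extraction for reviewers. It assumes only the two parameters described in that block; the secret group/key and the database URI are hypothetical placeholders.

```python
import pandas as pd
from flytekit import Secret, kwtypes
from flytekitplugins.sqlalchemy import SQLAlchemyConfig, SQLAlchemyTask

# Hypothetical secret coordinates -- substitute the group/key configured in
# your cluster's secrets management system.
db_password = Secret(group="db-creds", key="password")

secure_extract = SQLAlchemyTask(
    "extract_rna_secure",
    query_template="select len as sequence_length from rna limit {{ .inputs.limit }}",
    inputs=kwtypes(limit=int),
    output_schema_type=pd.DataFrame,
    secret_requests=[db_password],
    task_config=SQLAlchemyConfig(
        # Hypothetical URI: no password inline; it is injected at run time
        # from the requested secret.
        uri="postgresql://reader@db.example.com:5432/rnacentral",
        secret_connect_args={"password": db_password},
    ),
)
```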
-
-## Transform
-
-Next, we parse the raw `timestamp`s and represent the time as separate `date`
-and `time` columns. Notice that we can encode the assumptions we have about this
-task's resource requirements with the {py:class}`~flytekit.Resources` object.
-If those assumptions ever change, we can update the resource request here, or
-override it at the workflow level with the {ref}`with_overrides ` method.
-
-```{code-cell} ipython3
-@task(requests=Resources(mem="700Mi"))
-def transform(df: pd.DataFrame) -> pd.DataFrame:
-    """Add date and time columns; drop timestamp column."""
-    timestamp = pd.to_datetime(df["timestamp"])
-    df["date"] = timestamp.dt.date
-    df["time"] = timestamp.dt.time
-    df.drop("timestamp", axis=1, inplace=True)
-    return df
-```
-
-## Load
-
-Finally, we load the transformed data into its final destination: a CSV file in
-blob storage. Flyte has a built-in `CSVFile` type that automatically handles
-serializing/deserializing and uploading/downloading the file as it's passed from
-one task to the next. All you need to do is write the file to a local location
-and pass that location to the `path` argument of `CSVFile`.
-
-```{code-cell} ipython3
-@task(requests=Resources(mem="700Mi"))
-def load(df: pd.DataFrame) -> CSVFile:
-    """Load the dataframe to a csv file."""
-    csv_file = os.path.join(flytekit.current_context().working_directory, "rna_df.csv")
-    df.to_csv(csv_file, index=False)
-    return CSVFile(path=csv_file)
-```
-
-## ETL workflow
-
-Putting all the pieces together, we create an `etl_workflow` that produces a
-dataset based on the parameters you give it.
-
-```{code-cell} ipython3
-@workflow
-def etl_workflow(
-    min_length: int = 50, max_length: int = 200, limit: int = 10
-) -> CSVFile:
-    """Build an extract, transform and load pipeline."""
-    return load(
-        df=transform(
-            df=extract_task(min_length=min_length, max_length=max_length, limit=limit)
-        )
-    )
-```
-
-During local execution, this CSV file lives in a random local
-directory, but when the workflow runs on a Flyte cluster, it lives in
-the configured blob store, like S3 or GCS.
-
-Running this workflow locally, we can access the CSV file and read it into
-a `pandas.DataFrame`:
-
-```{code-cell} ipython3
-csv_file = etl_workflow(limit=5)
-pd.read_csv(csv_file)
-```
-
-## Workflows as reusable components
-
-Because Flyte tasks and workflows are simply functions, we can embed
-`etl_workflow` as part of a larger workflow, where it's used to create a
-CSV file that's then consumed by downstream tasks or subworkflows:
-
-```{code-cell} ipython3
-@task
-def aggregate(file: CSVFile) -> pd.DataFrame:
-    data = pd.read_csv(file)
-    ...  # process the data further
-
-
-@task
-def plot(data: pd.DataFrame):
-    ...  # create a plot
-
-
-@workflow
-def downstream_workflow(
-    min_length: int = 50, max_length: int = 200, limit: int = 10
-):
-    """A downstream workflow that visualizes an aggregation of the data."""
-    csv_file = etl_workflow(
-        min_length=min_length,
-        max_length=max_length,
-        limit=limit,
-    )
-    return plot(data=aggregate(file=csv_file))
-```
-
-```{important}
-Prefer other data processing frameworks? Flyte ships with
-[Polars](https://github.com/flyteorg/flytekit/tree/master/plugins/flytekit-polars),
-{ref}`Dask `, {ref}`Modin `, {ref}`Spark `,
-[Vaex](https://github.com/flyteorg/flytekit/tree/master/plugins/flytekit-vaex),
-and [DBT](https://github.com/flyteorg/flytekit/tree/master/plugins/flytekit-dbt)
-integrations.
-
-For database connectors, Flyte provides first-party support for {ref}`AWS Athena `,
-{ref}`Google BigQuery `, {ref}`Snowflake `,
-{ref}`SQLAlchemy `, and {ref}`SQLite3 `.
-```
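
The Transform section above mentions overriding a task's resource request at the workflow level via `with_overrides`, but the deleted page never demonstrates it. A minimal sketch, reusing `extract_task`, `transform`, `load`, and `CSVFile` as defined above; `etl_workflow_large` is a hypothetical name, and the sketch assumes flytekit's node-level `with_overrides(requests=...)` call.

```python
from flytekit import Resources, workflow


@workflow
def etl_workflow_large(
    min_length: int = 50, max_length: int = 200, limit: int = 1000
) -> CSVFile:
    """Same pipeline, but the transform step gets more memory for a bigger pull."""
    df = extract_task(min_length=min_length, max_length=max_length, limit=limit)
    # Override the 700Mi request declared on the task itself, at the call site.
    transformed = transform(df=df).with_overrides(requests=Resources(mem="2Gi"))
    return load(df=transformed)
```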
diff --git a/docs/core_use_cases/index.md b/docs/core_use_cases/index.md
deleted file mode 100644
index 4ab3fa16af..0000000000
--- a/docs/core_use_cases/index.md
+++ /dev/null
@@ -1,42 +0,0 @@
-(getting_started_core_use_cases)=
-
-# Core use cases
-
-This section of the documentation takes you through the
-core use cases for which Flyte is designed. Within the context of these guides,
-we're going to assume that the discipline of data science can be broken down into
-at least three specializations: data engineering, machine learning (or
-statistical modeling more broadly), and analytics.
-
-The purpose of these guides is to provide you, the data and ML practitioner,
-with some practical and simple examples of how Flyte can help you in your daily
-practice.
-
-```{list-table}
-:header-rows: 0
-:widths: 10 30
-
-* - {doc}`🛠 Data engineering <data_engineering>`
-  - Create an ETL workflow for processing data with SQLAlchemy and Pandas.
-* - {doc}`🤖 Machine learning <machine_learning>`
-  - Train a classifier with Scikit-Learn and Pandas.
-* - {doc}`📈 Analytics <analytics>`
-  - Develop a data cleaning and plotting pipeline with Plotly and Pandas.
-```
-
-```{admonition} Learn more
-:class: important
-
-Check out more examples in the {ref}`Tutorials ` section, which
-includes examples of Flyte in specific domains like
-{ref}`bioinformatics `.
-```
-
-```{toctree}
-:maxdepth: -1
-:hidden:
-
-data_engineering
-machine_learning
-analytics
-```
diff --git a/docs/core_use_cases/machine_learning.md b/docs/core_use_cases/machine_learning.md
deleted file mode 100644
index 6368b0aa54..0000000000
--- a/docs/core_use_cases/machine_learning.md
+++ /dev/null
@@ -1,125 +0,0 @@
----
-kernelspec:
-  display_name: Python 3
-  language: python
-  name: python3
----
-
-(getting_started_machine_learning)=
-
-# Machine learning
-
-Flyte can handle a full spectrum of machine learning workloads, from
-training small models to GPU-accelerated deep learning and hyperparameter
-optimization.
-
-## Getting the data
-
-In this simple example, we train a binary classification model on the
-[wine dataset](https://scikit-learn.org/stable/datasets/toy_dataset.html#wine-dataset)
-that is available through the `scikit-learn` package:
-
-```{code-cell} ipython3
-import pandas as pd
-from flytekit import Resources, task, workflow
-from sklearn.datasets import load_wine
-from sklearn.linear_model import LogisticRegression
-
-# Importing this module registers flytekit's type handler for sklearn models.
-import flytekit.extras.sklearn
-
-
-@task(requests=Resources(mem="500Mi"))
-def get_data() -> pd.DataFrame:
-    """Get the wine dataset."""
-    return load_wine(as_frame=True).frame
-```
-
-## Define a training workflow
-
-Then, we define `process_data` and `train_model` tasks along with a
-`training_workflow` to put all the pieces together for a model-training
-pipeline.
-
-```{code-cell} ipython3
-@task
-def process_data(data: pd.DataFrame) -> pd.DataFrame:
-    """Simplify the task from a 3-class to a binary classification problem."""
-    return data.assign(target=lambda x: x["target"].where(x["target"] == 0, 1))
-
-
-@task
-def train_model(data: pd.DataFrame, hyperparameters: dict) -> LogisticRegression:
-    """Train a model on the wine dataset."""
-    features = data.drop("target", axis="columns")
-    target = data["target"]
-    return LogisticRegression(max_iter=5000, **hyperparameters).fit(features, target)
-
-
-@workflow
-def training_workflow(hyperparameters: dict) -> LogisticRegression:
-    """Put all of the steps together into a single workflow."""
-    data = get_data()
-    processed_data = process_data(data=data)
-    return train_model(
-        data=processed_data,
-        hyperparameters=hyperparameters,
-    )
-```
-
-```{important}
-Even though you can use a `dict` type to represent the model's hyperparameters,
-we recommend using {ref}`dataclasses ` to define a custom
-`Hyperparameter` Python object that provides more type information to the Flyte
-compiler. For example, Flyte uses this type information to auto-generate
-type-safe launch forms on the Flyte UI. Learn more in the
-{ref}`Extending Flyte ` guide.
-```
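
Since the deleted page recommends the dataclass approach without showing it, here is a minimal sketch under the same setup. The field names are illustrative; recent flytekit releases accept plain `@dataclass` inputs, while older ones required a `dataclass_json`-style mixin.

```python
from dataclasses import dataclass

import pandas as pd
from flytekit import task
from sklearn.linear_model import LogisticRegression


@dataclass
class Hyperparameters:
    """Typed hyperparameters; Flyte can render these as a typed launch form."""
    C: float = 0.1
    max_iter: int = 5000


@task
def train_model_typed(data: pd.DataFrame, hp: Hyperparameters) -> LogisticRegression:
    """Same as train_model above, but with a typed hyperparameter object."""
    features = data.drop("target", axis="columns")
    target = data["target"]
    return LogisticRegression(C=hp.C, max_iter=hp.max_iter).fit(features, target)
```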
-
-## Computing predictions
-
-Executing this workflow locally returns the trained model, which we can then
-use to compute predictions on a feature matrix via `model.predict`:
-
-```{code-cell} ipython3
-model = training_workflow(hyperparameters={"C": 0.01})
-X, _ = load_wine(as_frame=True, return_X_y=True)
-model.predict(X.sample(10, random_state=41))
-```
-
-## Extending your ML workloads
-
-There are many ways to extend your workloads:
-
-```{list-table}
-:header-rows: 0
-:widths: 20 30
-
-* - **🏔 Vertical Scaling**
-  - Use the {py:class}`~flytekit.Resources` task keyword argument to request
-    additional CPUs, GPUs, and/or memory.
-* - **🗺 Horizontal Scaling**
-  - With constructs like {py:func}`~flytekit.dynamic` workflows and
-    {py:func}`~flytekit.map_task`s, implement grid search, random search,
-    and even [Bayesian optimization](https://github.com/flyteorg/flytekit-python-template/tree/main/bayesian-optimization/%7B%7Bcookiecutter.project_name%7D%7D).
-* - **🔧 Specialized Tuning Libraries**
-  - Use the {ref}`Ray Integration ` and leverage tools like
-    [Ray Tune](https://docs.ray.io/en/latest/tune/index.html) for hyperparameter
-    optimization, all orchestrated by Flyte as ephemerally provisioned Ray clusters.
-* - **📦 Ephemeral Cluster Resources**
-  - Use the {ref}`MPI Operator `, {ref}`Kubeflow TensorFlow `,
-    {ref}`Kubeflow PyTorch ` and {doc}`more <_tags/DistributedComputing>` to do distributed training.
-* - **🔎 Experiment Tracking**
-  - Auto-capture training logs with the {py:func}`~flytekitplugins.mlflow.mlflow_autolog`
-    decorator, which can be viewed as Flyte Decks with `@task(enable_deck=True)`.
-* - **⏩ Inference Acceleration**
-  - Serialize your models in ONNX format using the {ref}`ONNX plugin `, which
-    supports scikit-learn, TensorFlow, and PyTorch.
-```
-
-```{admonition} Learn more
-:class: important
-
-See the {ref}`Tutorials ` for more machine learning examples.
-```
diff --git a/docs/flyte_fundamentals/extending_flyte.md b/docs/flyte_fundamentals/extending_flyte.md
index 7f0d7e475b..1e09e168f1 100644
--- a/docs/flyte_fundamentals/extending_flyte.md
+++ b/docs/flyte_fundamentals/extending_flyte.md
@@ -153,12 +153,3 @@
 - These tasks plugins require implementing a backend plugin to leverage
   external services like SageMaker, Snowflake, BigQuery, etc.
 ```
-
-## What's next?
-
-Congratulations! 🎉 You've just completed the Flyte Fundamentals tour.
-
-The final section in the getting started section of the docs will provide you
-with some {ref}`core use cases ` for implementing
-your first workflows, whether you're a data scientist, data analyst, data engineer,
-or machine learning engineer.
diff --git a/docs/index.md b/docs/index.md
index 28b9994ec5..c687c916bd 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -75,8 +75,7 @@ contribute its architecture and design. You can also access the
 :widths: 20 30
 
 * - {doc}`🔤 Introduction to Flyte `
-  - Get your first workflow running, learn about the Flyte development lifecycle
-    and core use cases.
+  - Get your first workflow running and learn about the Flyte development lifecycle.
 * - {doc}`📖 User Guide `
   - A comprehensive view of Flyte's functionality for data and ML practitioners.
 * - {doc}`📚 Tutorials `
@@ -137,7 +136,6 @@ Quickstart guide
 Getting started with workflow development
 Flyte fundamentals
 Flyte agents
-Core use cases
 ```
 
 ```{toctree}
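
As a footnote to the machine-learning page removed above: its "Horizontal Scaling" row points at grid search with `map_task` without showing one. A minimal, self-contained sketch of that pattern; `fit_and_score` and its dummy scoring body are placeholders for real training and validation code.

```python
from typing import List

from flytekit import map_task, task, workflow


@task
def fit_and_score(c: float) -> float:
    """Placeholder objective: swap in real model training + validation scoring."""
    return 1.0 / (1.0 + c)  # dummy score so the sketch runs end to end


@workflow
def grid_search(cs: List[float] = [0.01, 0.1, 1.0]) -> List[float]:
    # Fan out one task execution per hyperparameter value; on a Flyte
    # cluster each one runs in its own container.
    return map_task(fit_and_score)(c=cs)
```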