diff --git a/cookbook/integrations/kubernetes/k8s_spark/dataframe_passing.py b/cookbook/integrations/kubernetes/k8s_spark/dataframe_passing.py
index e384bf0b25..33c4f2455a 100644
--- a/cookbook/integrations/kubernetes/k8s_spark/dataframe_passing.py
+++ b/cookbook/integrations/kubernetes/k8s_spark/dataframe_passing.py
@@ -1,8 +1,8 @@
 """
 .. _intermediate_spark_dataframes_passing:
 
-Pass pandas dataframes
-===========================
+Converting a Spark DataFrame to a Pandas DataFrame
+==================================================
 
 This example shows how users can return a spark.Dataset from a task and consume it as a pandas.DataFrame.
 If the dataframe does not fit in memory, it will result in a runtime failure.
@@ -17,12 +17,13 @@
 # .. _df_my_schema_definition:
 #
 # Define my_schema
-# --------------------
+# -----------------
 # This section defines a simple schema type with 2 columns, `name: str` and `age: int`
 my_schema = FlyteSchema[kwtypes(name=str, age=int)]
 
 # %%
-# ``create_spark_df`` is a spark task, that runs within a spark cotext (relies on having spark cluster up and running). This task generates a spark DataFrame whose schema matches the predefined :any:`df_my_schema_definition`
+# ``create_spark_df`` is a spark task that runs within a spark context (and relies on having a spark cluster up and running). This task generates a spark DataFrame whose schema matches the predefined :any:`df_my_schema_definition`.
+#
 # Notice that the task simply returns a pyspark.DataFrame object, even though the return type specifies :any:`df_my_schema_definition`
 # The flytekit type-system will automatically convert the pyspark.DataFrame to Flyte Schema object.
 # FlyteSchema object is an abstract representation of a DataFrame, that can conform to multiple different dataframe formats.
@@ -55,7 +56,7 @@ def create_spark_df() -> my_schema:
 # The task ``sum_of_all_ages`` receives a parameter of type :any:`df_my_schema_definition`. It is important to note that there is no
 # expectation that the schema is a pandas dataframe or a spark dataframe, but just a generic schema object. The Flytekit schema object
 # can be read into multiple formats using the ``open()`` method. Default conversion is to :py:class:`pandas.DataFrame`
-# Refer to :py:class:`flytekit.types.schema.FlyteSchema` for more details
+# Refer to :py:class:`flytekit.types.schema.FlyteSchema` for more details.
 #
 @task(cache_version="1")
 def sum_of_all_ages(s: my_schema) -> int:
@@ -84,8 +85,8 @@ def my_smart_schema() -> int:
 
 
 # %%
-# This program can be executed locally and it should work as expected and this greatly simplifies usage of disparate DataFrame technologies for the end user.
-# Also new DataFrame technologies can be dynamically loaded in flytekit's TypeEngine. More on this later.
+# This program can be executed locally and it should work as expected. This greatly simplifies using disparate DataFrame technologies for the end user.
+# New DataFrame technologies can also be dynamically loaded in flytekit's TypeEngine.
 if __name__ == "__main__":
     """
     This program can be run locally
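
For reviewers who want to try the pattern this patch documents without standing up a Spark cluster, here is a minimal local sketch: one task returns a dataframe typed as a FlyteSchema, and a downstream task reads it back as a pandas.DataFrame via ``open()``. It assumes the flytekit FlyteSchema API used in this file; the names make_df, sum_ages, and wf are illustrative, not from the patch, and pandas stands in for Spark so the example runs locally.

    # A minimal sketch of FlyteSchema round-tripping, assuming the
    # FlyteSchema API shown in the patched file. Task/workflow names
    # are hypothetical; pandas is used instead of Spark for local runs.
    import pandas as pd

    from flytekit import kwtypes, task, workflow
    from flytekit.types.schema import FlyteSchema

    # The same two-column schema the example defines: name: str, age: int.
    my_schema = FlyteSchema[kwtypes(name=str, age=int)]


    @task
    def make_df() -> my_schema:
        # Any supported dataframe type can be returned where the schema
        # type is declared; flytekit converts it to a FlyteSchema object.
        return pd.DataFrame({"name": ["Alice", "Bob"], "age": [30, 35]})


    @task
    def sum_ages(s: my_schema) -> int:
        # open().all() reads the schema back; the default target format
        # is pandas.DataFrame, regardless of how the data was produced.
        df = s.open().all()
        return int(df["age"].sum())


    @workflow
    def wf() -> int:
        return sum_ages(s=make_df())


    if __name__ == "__main__":
        print(wf())  # prints 65 when executed locally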