[BUG] Support creation and reading of StructuredDataset with local or remote uri #2914

25 changes: 25 additions & 0 deletions flytekit/types/structured/structured_dataset.py
@@ -30,6 +30,7 @@
from flytekit.models import types as type_models
from flytekit.models.literals import Binary, Literal, Scalar, StructuredDatasetMetadata
from flytekit.models.types import LiteralType, SchemaType, StructuredDatasetType
from flytekit.utils.asyn import loop_manager

if typing.TYPE_CHECKING:
import pandas as pd
@@ -176,8 +177,32 @@ def all(self) -> DF:  # type: ignore
if self._dataframe_type is None:
raise ValueError("No dataframe type set. Use open() to set the local dataframe type you want to use.")
ctx = FlyteContextManager.current_context()

if self.uri is not None and self.dataframe is None:
expected = TypeEngine.to_literal_type(StructuredDataset)
self._set_literal(ctx, expected)

return flyte_dataset_transformer.open_as(ctx, self.literal, self._dataframe_type, self.metadata)

def _set_literal(self, ctx: FlyteContext, expected: LiteralType) -> None:
"""
Explicitly set the StructuredDataset Literal to handle the following cases:

1. Read a dataframe from a StructuredDataset with a uri, for example:

@task
def return_sd() -> StructuredDataset:
sd = StructuredDataset(uri="s3://my-s3-bucket/s3_flyte_dir/df.parquet", file_format="parquet")
df = sd.open(pd.DataFrame).all()
return df

For details, please refer to this issue: https://github.com/flyteorg/flyte/issues/5954.
"""
Comment on lines +187 to +200

Member: Very nice comment.

Contributor Author: It means a lot to hear it from you!

# Bridge the async transformer into this synchronous helper.
to_literal = loop_manager.synced(flyte_dataset_transformer.async_to_literal)
self._literal_sd = to_literal(ctx, self, StructuredDataset, expected).scalar.structured_dataset
Comment on lines +201 to +202

Member: Not sure if this is the best way to write it.
cc @wild-endeavor @thomasjpfan

Contributor Author: Thanks for pointing this out. I'm also pondering if this is a good practice...

Will be glad to learn more from you guys!
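For context, loop_manager.synced wraps an async function into a blocking callable run on a background event loop, which is what the new _set_literal relies on. A minimal sketch of the pattern, using an illustrative coroutine that is not part of this PR:

from flytekit.utils.asyn import loop_manager

async def double(x: int) -> int:
    # Illustrative stand-in for an async transformer method
    # such as async_to_literal.
    return x * 2

double_sync = loop_manager.synced(double)  # returns a blocking wrapper
assert double_sync(21) == 42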

if self.metadata is None:
self._metadata = self._literal_sd.metadata

def iter(self) -> Generator[DF, None, None]:
if self._dataframe_type is None:
raise ValueError("No dataframe type set. Use open() to set the local dataframe type you want to use.")
@@ -2,6 +2,7 @@
import tempfile
import typing
from collections import OrderedDict
from pathlib import Path

import google.cloud.bigquery
import pytest
@@ -21,6 +22,7 @@
from flytekit.models.literals import StructuredDatasetMetadata
from flytekit.models.types import LiteralType, SchemaType, SimpleType, StructuredDatasetType
from flytekit.tools.translator import get_serializable
from flytekit.types.file import FlyteFile
from flytekit.types.structured.structured_dataset import (
PARQUET,
StructuredDataset,
@@ -59,6 +61,21 @@ def generate_pandas() -> pd.DataFrame:
return pd.DataFrame({"name": ["Tom", "Joseph"], "age": [20, 22]})


@pytest.fixture
def local_tmp_pqt_file():
df = generate_pandas()

# Create a temporary parquet file
with tempfile.NamedTemporaryFile(delete=False, mode="w+b", suffix=".parquet") as pqt_file:
pqt_path = pqt_file.name
df.to_parquet(pqt_path)

yield pqt_path

# Cleanup
Path(pqt_path).unlink(missing_ok=True)


def test_formats_make_sense():
@task
def t1(a: pd.DataFrame) -> pd.DataFrame:
@@ -643,3 +660,70 @@ def wf_with_input() -> pd.DataFrame:

pd.testing.assert_frame_equal(wf_no_input(), default_val)
pd.testing.assert_frame_equal(wf_with_input(), input_val)



def test_read_sd_from_uri(local_tmp_pqt_file):
import os

# Point flytekit at the local Flyte sandbox's minio object store.
os.environ["FLYTE_AWS_ENDPOINT"] = "http://localhost:30002/"
os.environ["FLYTE_AWS_ACCESS_KEY_ID"] = "minio"
os.environ["FLYTE_AWS_SECRET_ACCESS_KEY"] = "miniostorage"

@task
def upload_pqt_to_s3(local_path: str, remote_path: str) -> None:
"""Upload local temp parquet file to s3 object storage"""

with tempfile.TemporaryDirectory() as tmp_dir:
fs = FileAccessProvider(
local_sandbox_dir=tmp_dir,
raw_output_prefix="s3://my-s3-bucket"
)
fs.upload(local_path, remote_path)

@task
def read_sd_from_uri(uri: str) -> pd.DataFrame:
sd = StructuredDataset(uri=uri, file_format="parquet")
df = sd.open(pd.DataFrame).all()

return df

@workflow
def read_sd_from_local_uri(uri: str) -> pd.DataFrame:
df = read_sd_from_uri(uri=uri)

return df

@workflow
def read_sd_from_remote_uri(
local_path: str,
remote_path: str,
read_uri: str
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
# Upload the parquet file to s3
upload_pqt_to_s3(local_path=local_path, remote_path=remote_path)
ff = FlyteFile(path=REMOTE_PATH)
with ff.open(mode="rb") as f:
df_s3 = pd.read_parquet(f)
Member: We shouldn't do this in a workflow; a workflow is like a bridge to build your DAG. A task is the place to write Python code.

https://docs.flyte.org/en/latest/user_guide/concepts/main_concepts/workflows.html#divedeep-workflows
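A hedged sketch of how that read could move out of the workflow body and into a task; the task name read_parquet_from_remote is illustrative, not part of the PR:

@task
def read_parquet_from_remote(remote_path: str) -> pd.DataFrame:
    # Wrap the remote object in a FlyteFile and read it inside a task,
    # where arbitrary Python code is allowed to run.
    ff = FlyteFile(path=remote_path)
    with ff.open(mode="rb") as f:
        return pd.read_parquet(f)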


# Read sd from remote uri
df_remote = read_sd_from_uri(uri=read_uri)

return df_s3, df_remote


REMOTE_PATH = "s3://my-s3-bucket/my-test/df.parquet"
df = generate_pandas()

Member: Why REMOTE_PATH = "s3://my-s3-bucket/my-test/df.parquet" and not REMOTE_PATH = "s3://my-s3-bucket/df.parquet"?

Contributor Author (@JiangJiaWei1103, Nov 14, 2024): I forgot that I had manually created the my-test directory and uploaded df.parquet last time. Hence, the test seemed to pass locally without an error.

You are right. We can just use REMOTE_PATH = "s3://my-s3-bucket/df.parquet". The uploading logic in the previous comment should be modified, too.
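A sketch of the adjusted constant and upload, assuming the bucket root on the sandbox's minio is directly writable (the FileAccessProvider/upload calls mirror the upload_pqt_to_s3 task above):

REMOTE_PATH = "s3://my-s3-bucket/df.parquet"  # no manually created subdirectory

with tempfile.TemporaryDirectory() as tmp_dir:
    fs = FileAccessProvider(
        local_sandbox_dir=tmp_dir,
        raw_output_prefix="s3://my-s3-bucket",
    )
    fs.upload(local_path, REMOTE_PATH)  # local_path: the local parquet file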

Member: Yes, the best way is to test this on your computer:

(dev) future@outlier ~ % pwd
/Users/future-outlier/code/dev/flytekit
(dev) future@outlier ~ % pytest -s test_file.py

This is how flytekit runs the unit tests, or you can just run

make unit_test

under the repo folder.

Member: This is the expected result. [screenshot of the passing test omitted]

# Read sd from local uri
df_local = read_sd_from_local_uri(uri=local_tmp_pqt_file)
pd.testing.assert_frame_equal(df, df_local)

# Read sd from remote uri
df_s3, df_remote = read_sd_from_remote_uri(
local_path=local_tmp_pqt_file,
remote_path=REMOTE_PATH,
read_uri=REMOTE_PATH
)
pd.testing.assert_frame_equal(df, df_s3)
pd.testing.assert_frame_equal(df, df_remote)