Add support for Snowpark Pandas Index (streamlit#9222)

## Describe your changes

Adds support for the Snowpark Pandas Index object. This PR also makes minor changes to our CI workflow so that the Snowflake dependencies are installed only for the Snowflake integration tests, rather than for every run.

## Testing Plan

- Added a Snowpark Pandas Index case to `data_mocks`, so it will be tested across various commands and unit tests.

---

**Contribution License Agreement**

By submitting this pull request you agree that all contributions to this project are made under the Apache 2.0 license.
sfc-gh-ranton · Aug 7, 2024 · 8c39489 · 8c39489
1 parent bae8ffd
commit 8c39489
Show file tree

Hide file tree

Showing 8 changed files with 93 additions and 13 deletions.
diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml
@@ -207,7 +207,7 @@ jobs:
       )
 
     name: >
-      Python 3.8: Python tests for Snowflake
+      py-unit-tests (snowflake)
 
     env:
       USE_CONSTRAINTS_FILE: "${{ fromJson(needs.build_info.outputs.USE_CONSTRAINTS_FILE )}}"
@@ -220,10 +220,10 @@ jobs:
           ref: ${{ inputs.ref }}
           persist-credentials: false
           submodules: "recursive"
-      - name: Set up Python 3.8
+      - name: Set up Python 3.11
         uses: actions/setup-python@v5
         with:
-          python-version: "3.8"
+          python-version: "3.11"
       - name: Decrypt credentials
         run: ./.github/scripts/decrypt_credentials.sh
         env:
@@ -232,7 +232,7 @@ jobs:
         uses: ./.github/actions/make_init
       - name: Run make develop
         run: make develop
-      - name: Run Type Checkers
-        run: scripts/mypy --report
+      - name: Install lib with snowflake dependencies
+        run: uv pip install --editable ./lib[snowflake]
       - name: Run Python Tests for Snowflake
         run: make pytest-snowflake
diff --git a/Makefile b/Makefile
@@ -97,7 +97,7 @@ python-init-test-only: lib/test-requirements.txt
 
 .PHONY: python-init
 python-init:
-	pip_args=("--editable" "./lib[snowflake]");\
+	pip_args=("--editable" "./lib");\
 	if [ "${USE_CONSTRAINTS_FILE}" = "true" ] ; then\
 		pip_args+=(--constraint "${CONSTRAINTS_URL}"); \
 	fi;\

diff --git a/lib/setup.py b/lib/setup.py
@@ -71,7 +71,7 @@
 
 EXTRA_REQUIRES = {
     "snowflake": [
-        "snowflake-snowpark-python>=0.9.0; python_version<'3.12'",
+        "snowflake-snowpark-python[modin]>=1.17.0; python_version<'3.12'",
         "snowflake-connector-python>=2.8.0; python_version<'3.12'",
     ]
 }

diff --git a/lib/streamlit/dataframe_util.py b/lib/streamlit/dataframe_util.py
@@ -72,6 +72,9 @@
 _MODIN_SERIES_TYPE_STR: Final = "modin.pandas.series.Series"
 _SNOWPANDAS_DF_TYPE_STR: Final = "snowflake.snowpark.modin.pandas.dataframe.DataFrame"
 _SNOWPANDAS_SERIES_TYPE_STR: Final = "snowflake.snowpark.modin.pandas.series.Series"
+_SNOWPANDAS_INDEX_TYPE_STR: Final = (
+    "snowflake.snowpark.modin.plugin.extensions.index.Index"
+)
 
 V_co = TypeVar(
     "V_co",
@@ -181,7 +184,7 @@ def is_unevaluated_data_object(obj: object) -> bool:
     - Snowpark DataFrame / Table
     - PySpark DataFrame
     - Modin DataFrame / Series
-    - Snowpandas DataFrame / Series
+    - Snowpandas DataFrame / Series / Index
     - Generator functions
 
     Unevaluated means that the data is not yet in the local memory.
@@ -233,8 +236,10 @@ def is_modin_data_object(obj: object) -> bool:
 
 def is_snowpandas_data_object(obj: object) -> bool:
-    """True if obj is a Snowpark Pandas DataFrame or Series."""
+    """True if obj is a Snowpark Pandas DataFrame, Series, or Index."""
-    return is_type(obj, _SNOWPANDAS_DF_TYPE_STR) or is_type(
-        obj, _SNOWPANDAS_SERIES_TYPE_STR
+    return (
+        is_type(obj, _SNOWPANDAS_DF_TYPE_STR)
+        or is_type(obj, _SNOWPANDAS_SERIES_TYPE_STR)
+        or is_type(obj, _SNOWPANDAS_INDEX_TYPE_STR)
     )
 
 

diff --git a/lib/tests/conftest.py b/lib/tests/conftest.py
@@ -20,7 +20,6 @@
 from __future__ import annotations
 
 import os
-import sys
 from unittest.mock import mock_open, patch
 
 import pytest
@@ -87,8 +86,6 @@ def pytest_configure(config: pytest.Config):
 
     is_require_snowflake = config.getoption("--require-snowflake", default=False)
     if is_require_snowflake:
-        if sys.version_info[0:2] != (3, 8):
-            raise pytest.UsageError("Python 3.8 is required to run Snowflake tests")
         try:
             import snowflake.snowpark  # noqa: F401
         except ImportError:

diff --git a/lib/tests/streamlit/data_mocks.py b/lib/tests/streamlit/data_mocks.py
@@ -32,6 +32,7 @@
 from tests.streamlit.modin_mocks import Series as ModinSeries
 from tests.streamlit.pyspark_mocks import DataFrame as PySparkDataFrame
 from tests.streamlit.snowpandas_mocks import DataFrame as SnowpandasDataFrame
+from tests.streamlit.snowpandas_mocks import Index as SnowpandasIndex
 from tests.streamlit.snowpandas_mocks import Series as SnowpandasSeries
 from tests.streamlit.snowpark_mocks import DataFrame as SnowparkDataFrame
 from tests.streamlit.snowpark_mocks import Row as SnowparkRow
@@ -839,6 +840,21 @@ def data_generator():
             pd.DataFrame,
         ),
     ),
+    (
+        "Snowpandas Index",
+        SnowpandasIndex(
+            pd.Index(["st.text_area", "st.markdown"]),
+        ),
+        CaseMetadata(
+            2,
+            1,
+            DataFormat.SNOWPANDAS_OBJECT,
+            ["st.text_area", "st.markdown"],
+            "dataframe",
+            True,
+            pd.DataFrame,
+        ),
+    ),
     (
         "Modin DataFrame",
         ModinDataFrame(

diff --git a/lib/tests/streamlit/dataframe_util_test.py b/lib/tests/streamlit/dataframe_util_test.py
@@ -37,6 +37,7 @@
     TestObject,
 )
 from tests.streamlit.snowpandas_mocks import DataFrame as SnowpandasDataFrame
+from tests.streamlit.snowpandas_mocks import Index as SnowpandasIndex
 from tests.streamlit.snowpandas_mocks import Series as SnowpandasSeries
 from tests.streamlit.snowpark_mocks import DataFrame as SnowparkDataFrame
 from tests.streamlit.snowpark_mocks import Row as SnowparkRow
@@ -387,6 +388,7 @@ def test_is_snowpandas_data_object(self):
             dataframe_util.is_snowpandas_data_object(SnowpandasDataFrame(df))
         )
         self.assertTrue(dataframe_util.is_snowpandas_data_object(SnowpandasSeries(df)))
+        self.assertTrue(dataframe_util.is_snowpandas_data_object(SnowpandasIndex(df)))
 
     def test_is_snowpark_row_list(self):
         class DummyClass:
@@ -480,6 +482,39 @@ def test_verify_snowpark_integration(self):
                 pd.DataFrame,
             )
 
+    @pytest.mark.require_snowflake
+    def test_verify_snowpandas_integration(self):
+        """Integration test for Snowpark pandas DataFrame, Series, and Index handling.
+        This complements the tests using the mocks by verifying that
+        the latest version of the real library is still supported.
+        """
+        import modin.pandas as modin_pd
+
+        # Side-effect import (hence noqa): loads the Snowpark pandas plugin for modin.
+        import snowflake.snowpark.modin.plugin  # noqa: F401
+
+        with create_snowpark_session():
+            snowpandas_df = modin_pd.DataFrame([1, 2, 3], columns=["col1"])
+            assert dataframe_util.is_snowpandas_data_object(snowpandas_df) is True
+            assert isinstance(
+                dataframe_util.convert_anything_to_pandas_df(snowpandas_df),
+                pd.DataFrame,
+            )
+
+            snowpandas_series = snowpandas_df["col1"]
+            assert dataframe_util.is_snowpandas_data_object(snowpandas_series) is True
+            assert isinstance(
+                dataframe_util.convert_anything_to_pandas_df(snowpandas_series),
+                pd.DataFrame,
+            )
+
+            snowpandas_index = snowpandas_df.index
+            assert dataframe_util.is_snowpandas_data_object(snowpandas_index) is True
+            assert isinstance(
+                dataframe_util.convert_anything_to_pandas_df(snowpandas_index),
+                pd.DataFrame,
+            )
+
     @parameterized.expand(
         SHARED_TEST_CASES,
     )

diff --git a/lib/tests/streamlit/snowpandas_mocks.py b/lib/tests/streamlit/snowpandas_mocks.py
@@ -74,3 +74,30 @@ def head(self, n: int) -> Series:
     def __getitem__(self, key: slice | int) -> Series:
         # Allow slicing and integer indexing
         return Series(self._data[key])
+
+
+class Index:
+    """This is a dummy Index class, which imitates the
+    snowflake.snowpark.modin.plugin.extensions.index.Index class for testing purposes.
+    We use this to make sure that our code applies its special handling
+    when it detects a Snowpark Pandas Index.
+    This allows testing of the functionality without having the library installed,
+    but it won't capture changes in the API of the library; that requires
+    integration tests.
+    """
+
+    __module__ = "snowflake.snowpark.modin.plugin.extensions.index"
+
+    def __init__(self, data: pd.Index):
+        self._data: pd.Index = data  # the real pandas Index wrapped by this mock
+
+    def to_pandas(self) -> pd.Index:
+        return self._data  # hands back the wrapped pandas Index unchanged
+
+    def head(self, n: int) -> Index:
+        """Returns the top n elements of a mock version of Snowpark Pandas Index"""
+        return self[:n]  # __getitem__ already wraps the slice; don't wrap it twice
+
+    def __getitem__(self, key: slice | int) -> Index:
+        # Allow slicing and integer indexing; the result is re-wrapped in the mock
+        return Index(self._data[key])