From f8bcf49c9c1e8f354ce0edcab076a65492fefb29 Mon Sep 17 00:00:00 2001 From: orellabac Date: Tue, 17 Oct 2023 11:42:40 -0600 Subject: [PATCH] removing explode_outer --- CHANGE_LOG.txt | 7 ++- README.md | 47 +++++++++++++++++++-- setup.py | 2 +- snowpark_extensions/functions_extensions.py | 4 -- tests/test_dataframe_extensions.py | 2 +- 5 files changed, 51 insertions(+), 11 deletions(-) diff --git a/CHANGE_LOG.txt b/CHANGE_LOG.txt index 57e990f..41d99a9 100644 --- a/CHANGE_LOG.txt +++ b/CHANGE_LOG.txt @@ -171,4 +171,9 @@ Version 0.0.32 Version 0.0.33 -------------- -- Falling back to builtin applyInPandas implementation \ No newline at end of file +- Falling back to builtin applyInPandas implementation + +Version 0.0.34 +-------------- +- explode_outer has been removed from this library as it is supported natively by snowpark. +- updated README providing information on how to use default `connections.toml` diff --git a/README.md b/README.md index 4e5120f..df51d60 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,47 @@ import snowpark_extensions new_session = Session.builder.env().appName("app1").create() ``` + +> NOTE: since 1.8.0 the [python connector was updated](https://docs.snowflake.com/en/release-notes/clients-drivers/python-connector-2023#version-3-1-0-july-31-2023) and we provide support for a unified configuration storage for `snowflake-python-connector` and `snowflake-snowpark-python` with this approach. +> +> You can use these connections by leveraging `Session.builder.getOrCreate()` or `Session.builder.create()` +> +> By default, we look for the `connections.toml` file in the location specified in the `SNOWFLAKE_HOME` environment variable (default: `~/.snowflake`). 
If this folder does not exist, the Python connector looks for the file in the `platformdirs` location, as follows: +> +> * On Linux: `~/.config/snowflake/`, but follows XDG settings +> * On Mac: `~/Library/Application Support/snowflake/` +> * On Windows: `%USERPROFILE%\AppData\Local\snowflake\` +> +> The default connection name is 'default' but it can be controlled with the environment variable: `SNOWFLAKE_DEFAULT_CONNECTION_NAME`. +> +> If you don't want to use a file you can set the file contents through the `SNOWFLAKE_CONNECTIONS` environment variable. +> +> A connection file looks like: +> +> ``` +> [default] +> accountname = "myaccount" +> username = "user1" +> password = 'xxxxx' +> rolename = "user_role" +> dbname = "demodb" +> schemaname = "public" +> warehousename = "load_wh" +> +> +> [snowpark] +> accountname = "myaccount" +> username = "user2" +> password = 'yyyyy' +> rolename = "user_role" +> dbname = "demodb" +> schemaname = "public" +> warehousename = "load_wh" +> +> ``` + + + The `appName` can use to setup a query_tag like `APPNAME=tag;execution_id=guid` which can then be used to track job actions with a query like You can then use a query like: @@ -192,7 +233,6 @@ df.group_by("ID").applyInPandas( normalize, schema="id long, v double").show() ``` - ``` ------------------------------ |"ID" |"V" | @@ -205,7 +245,6 @@ df.group_by("ID").applyInPandas( ------------------------------ ``` - > NOTE: since snowflake-snowpark-python==1.8.0 applyInPandas is available. This version is kept because: > > 1. 
It supports string schemas @@ -285,7 +324,7 @@ That will return: | functions.format_number | formats numbers using the specified number of decimal places | | ~~functions.reverse~~ | ~~returns a reversed string~~ **Available in snowpark-python >= 1.2.0** | | ~~functions.explode~~ | ~~returns a new row for each element in the given array~~ **Available in snowpark-python >= 1.4.0** | -| functions.explode_outer | returns a new row for each element in the given array or map. Unlike explode, if the array/map is null or empty then null is producedThis | +| ~~functions.explode_outer~~ | ~~returns a new row for each element in the given array or map. Unlike explode, if the array/map is null or empty then null is produced~~
**Available in snowpark-python >= 1.4.0**
There is a breaking change as the explode_outer does not need the map argument anymore. | | functions.arrays_zip | returns a merged array of arrays | | functions.array_sort | sorts the input array in ascending order. The elements of the input array must be orderable. Null elements will be placed at the end of the returned array. | | functions.array_max | returns the maximon value of the array. | @@ -352,7 +391,7 @@ sf_df = session.createDataFrame([(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), ( ``` ``` -# +---+----------+----------+ +# +---+----------+----------+ # | id| an_array| a_map| # +---+----------+----------+ # | 1|[foo, bar]|{x -> 1.0}| diff --git a/setup.py b/setup.py index 0ae51d9..929d082 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ this_directory = Path(__file__).parent long_description = (this_directory / "README.md").read_text() -VERSION = '0.0.33' +VERSION = '0.0.34' setup(name='snowpark_extensions', version=VERSION, diff --git a/snowpark_extensions/functions_extensions.py b/snowpark_extensions/functions_extensions.py index 7917ef4..6dd6385 100644 --- a/snowpark_extensions/functions_extensions.py +++ b/snowpark_extensions/functions_extensions.py @@ -56,9 +56,6 @@ def create_map(*col_names): col_list.append(_to_col_if_str(name,"create_map")) col_list.append(value) return object_construct(*col_list) - - def _explode_outer(col,map=None): - return F.table_function("flatten")(input=col,outer=F.lit(True)) def _array(*cols): return F.array_construct(*cols) @@ -350,7 +347,6 @@ def map_values(obj:dict)->list: F.array_sort = _array_sort F.arrays_zip = _arrays_zip F.create_map = create_map - F.explode_outer = _explode_outer F.format_number = format_number F.flatten = _array_flatten F.map_values = _map_values diff --git a/tests/test_dataframe_extensions.py b/tests/test_dataframe_extensions.py index 1e21858..a22ab89 100644 --- a/tests/test_dataframe_extensions.py +++ b/tests/test_dataframe_extensions.py @@ -164,7 +164,7 @@ def 
test_explode_outer_with_map(): # | 3| null| null| # +---+----------+----------+ - results = sf_df.select("id", "an_array", explode_outer("a_map",map=True)).collect() + results = sf_df.select("id", "an_array", explode_outer("a_map")).collect() # +---+----------+----+-----+ # | id| an_array| KEY| VALUE| # +---+----------+----+-----+