diff --git a/CHANGE_LOG.txt b/CHANGE_LOG.txt index 914a284..d40ddaa 100644 --- a/CHANGE_LOG.txt +++ b/CHANGE_LOG.txt @@ -182,3 +182,9 @@ Version 0.0.34 Version 0.0.35 -------------- - added functions.to_utc_timestamp extension + +Version 0.0.36 +-------------- +- fixing issue with the `%%sql` magics that was causing a double execution +- update README.md for PMML +- update for the `extras/wheel_loader` thanks Karol Tarcak diff --git a/README.md b/README.md index 10c2109..d4010a1 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,10 @@ Snowpark by itself is a powerful library, but still some utility functions can always help. -**NOTE: we have working to integrate some of the snowpark extensions directly into the snowpark-python library. + +BTW, what about `Java`/`Scala`/`SQL`? There is an [additional repo](https://github.com/Snowflake-Labs/snowpark-extensions "Snowpark Extensions for Java, Scala and SQL") where you will also find utility functions and extensions for those technologies. + +**NOTE: we have been working to integrate some of the snowpark extensions directly into the snowpark-python library. In most cases the APIs will be exactly the same, so there should no changes needed in your code. However there might be breaking changes, so consider that before updating. If any of these breaking changes are affecting you, please enter an issue so we can address it.** @@ -59,7 +62,6 @@ import snowpark_extensions new_session = Session.builder.env().appName("app1").create() ``` - > NOTE: since 1.8.0 the [python connector was updated](https://docs.snowflake.com/en/release-notes/clients-drivers/python-connector-2023#version-3-1-0-july-31-2023) and we provide support for an unified configuration storage for `snowflake-python-connector` and `snowflake-snowpark-python` with this approach. 
> > You can use this connections leveraging `Session.builder.getOrCreate()` or `Session.builder.create()` @@ -98,8 +100,6 @@ new_session = Session.builder.env().appName("app1").create() > > ``` - - The `appName` can use to setup a query_tag like `APPNAME=tag;execution_id=guid` which can then be used to track job actions with a query like You can then use a query like: @@ -392,7 +392,7 @@ sf_df = session.createDataFrame([(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), ( ``` ``` -# +---+----------+----------+ +# +---+----------+----------+ # | id| an_array| a_map| # +---+----------+----------+ # | 1|[foo, bar]|{x -> 1.0}| diff --git a/extras/pmml/README.md b/extras/pmml/README.md index 5120179..8659475 100644 --- a/extras/pmml/README.md +++ b/extras/pmml/README.md @@ -4,10 +4,10 @@ Predictive Model Markup Language ([PMML](https://en.wikipedia.org/wiki/Predictiv In order to easily use these models in SnowPark a simple helper has been provided. - NOTE: this helper requires some JAR in order to perform the PMML loading and scoring. 
These libraries can be downloaded from maven and they should be uploaded into an stage: + ``` https://mvnrepository.com/artifact/org.pmml4s/pmml4s https://repo1.maven.org/maven2/org/pmml4s/pmml4s_2.12/1.0.1/pmml4s_2.12-1.0.1.jar @@ -20,6 +20,7 @@ These libraries can be downloaded from maven and they should be uploaded into an ``` And to use it in your code you can use an snippet like: + ``` from pmml_builder import ScoreModelBuilder scorer = ScoreModelBuilder() \ @@ -30,4 +31,22 @@ scorer = ScoreModelBuilder() \ scorer.transform(df).show() +``` + +# NOTE: + +In some scenarios the number of input parameters or the number of output parameters can vary. In order to simplify the usage in those situations, we provide a [helper UDF](https://github.com/Snowflake-Labs/snowpark-extensions-py/blob/main/extras/pmml/PMML_SCORER.sql) + +To use that helper you will first need to upload the following: + +* SCALA_LIB for example `scala-library-2.12.17.jar` into a stage, you can [check maven](https://mvnrepository.com/artifact/org.scala-lang/scala-library) +* PMMLS_LIB for example `pmml4s_2.12-1.0.1.jar` into a stage, you can [check maven](https://mvnrepository.com/artifact/org.pmml4s/pmml4s) +* SPRAY_LIB for example `spray-json_2.12-1.3.6.jar` into a stage, you can [check maven](https://mvnrepository.com/artifact/io.spray/spray-json) + +To use this from Python you can do: + +```python +table_with_data +.select(sql_expr("object_construct(*)").alias("INPUT_DATA")) +.join_table_function("PMML_SCORER",lit("@STAGE/path/to/model"),col("INPUT_DATA")) ``` \ No newline at end of file diff --git a/setup.py b/setup.py index 886b1ba..686f642 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ this_directory = Path(__file__).parent long_description = (this_directory / "README.md").read_text() -VERSION = '0.0.35' +VERSION = '0.0.36' setup(name='snowpark_extensions', version=VERSION, diff --git a/snowpark_extensions/__init__.py b/snowpark_extensions/__init__.py index 135c22b..76e8afa 
100644 --- a/snowpark_extensions/__init__.py +++ b/snowpark_extensions/__init__.py @@ -15,6 +15,7 @@ def register_sql_magic(): from IPython.core.magic import register_cell_magic def sql(line, cell): import IPython + import re user_ns = IPython.get_ipython().user_ns if "session" in user_ns: session = user_ns['session'] @@ -24,7 +25,13 @@ def sql(line, cell): name = None if line and line.strip(): name = line.strip().split(" ")[0] - df = session.sql(res) + # If there are several statements only last will be returned + # also we will remove all ; at the end to avoid issues with empty statements + res = re.sub(r';+$', '', res) + for cursor in session.connection.execute_string(res): + df = session.sql(f"SELECT * FROM TABLE(RESULT_SCAN('{cursor.sfqid}'))") + # to avoid needed to do a count on display + setattr(df,"_cached_rowcount",cursor.rowcount) if name: user_ns[name] = df else: diff --git a/snowpark_extensions/dataframe_extensions.py b/snowpark_extensions/dataframe_extensions.py index 05f2f7c..287f191 100644 --- a/snowpark_extensions/dataframe_extensions.py +++ b/snowpark_extensions/dataframe_extensions.py @@ -47,7 +47,8 @@ def _repr_html_(self): else: from IPython.display import display try: - count = self.count() + count = self._cached_rowcount if hasattr(self,"_cached_rowcount") else self.count() + self.count() if count == 0: return "No rows to display" elif count == 1: