[SPARK-37516][PYTHON][SQL] Uses Python's standard string formatter for SQL API in PySpark

### What changes were proposed in this pull request?

This PR proposes to use [Python's standard string formatter](https://docs.python.org/3/library/string.html#custom-string-formatting) in `SparkSession.sql`; see also apache#34677.

### Why are the changes needed?

To improve usability in PySpark: `SparkSession.sql` now works together with Python's standard string formatter.

### Does this PR introduce _any_ user-facing change?

By default, there is no user-facing change. If `kwargs` is specified, yes.

1. Attribute access on a DataFrame (standard Python formatter support):

    ```python
    mydf = spark.range(10)
    spark.sql("SELECT {tbl.id}, {tbl[id]} FROM {tbl}", tbl=mydf)
    ```

2. Understanding a `DataFrame`:

    ```python
    mydf = spark.range(10)
    spark.sql("SELECT * FROM {tbl}", tbl=mydf)
    ```

3. Understanding a `Column` (explicit column references only):

    ```python
    mydf = spark.range(10)
    spark.sql("SELECT {c} FROM {tbl}", c=col("id"), tbl=mydf)
    ```

4. Leveraging other Python string formatting:

    ```python
    mydf = spark.range(10)
    spark.sql(
        "SELECT {col} FROM {mydf} WHERE id IN {x}",
        col=mydf.id, mydf=mydf, x=tuple(range(4)))
    ```

### How was this patch tested?

Doctests were added.

Closes apache#34774 from HyukjinKwon/SPARK-37516.

Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
1 parent fdc276b, commit 26f4953
Showing 5 changed files with 182 additions and 16 deletions.
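For context, the change relies on the customization hook that Python's built-in `string.Formatter` already provides: a subclass can override `get_field` to intercept every `{field}` lookup before substitution. Below is a minimal, standalone sketch of that standard-library mechanism; the `UpperCaseFormatter` class and the example query are illustrative only and are not part of this commit.

```python
import string


class UpperCaseFormatter(string.Formatter):
    """Toy subclass: intercepts each {field} lookup and post-processes the value."""

    def get_field(self, field_name, args, kwargs):
        # string.Formatter.get_field resolves "tbl", "tbl.id", or "tbl[id]" style
        # lookups against args/kwargs and returns (value, first_key).
        obj, first = super().get_field(field_name, args, kwargs)
        return str(obj).upper(), first


formatter = UpperCaseFormatter()
print(formatter.format("SELECT * FROM {tbl}", tbl="my_table"))
# Prints: SELECT * FROM MY_TABLE
```

The new `SQLStringFormatter` in the diff below uses the same `get_field` hook, but converts PySpark objects (`DataFrame`, `Column`, `str`) into SQL fragments instead of upper-casing them.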
@@ -0,0 +1,84 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import string
import typing
from typing import Any, Optional, List, Tuple, Sequence, Mapping
import uuid

from py4j.java_gateway import is_instance_of

if typing.TYPE_CHECKING:
    from pyspark.sql import SparkSession, DataFrame

from pyspark.sql.functions import lit


class SQLStringFormatter(string.Formatter):
    """
    A standard ``string.Formatter`` in Python that can understand PySpark instances
    with basic Python objects. This formatter has to be cleared after use for a single
    SQL query; it cannot be reused across multiple SQL queries without clearing.
    """

    def __init__(self, session: "SparkSession") -> None:
        self._session: "SparkSession" = session
        self._temp_views: List[Tuple[DataFrame, str]] = []

    def get_field(self, field_name: str, args: Sequence[Any], kwargs: Mapping[str, Any]) -> Any:
        obj, first = super(SQLStringFormatter, self).get_field(field_name, args, kwargs)
        return self._convert_value(obj, field_name), first
    def _convert_value(self, val: Any, field_name: str) -> Optional[str]:
        """
        Converts the given value into a SQL string.
        """
        from pyspark import SparkContext
        from pyspark.sql import Column, DataFrame

        if isinstance(val, Column):
            assert SparkContext._gateway is not None  # type: ignore[attr-defined]

            gw = SparkContext._gateway  # type: ignore[attr-defined]
            jexpr = val._jc.expr()
            if is_instance_of(
                gw, jexpr, "org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute"
            ) or is_instance_of(
                gw, jexpr, "org.apache.spark.sql.catalyst.expressions.AttributeReference"
            ):
                return jexpr.sql()
            else:
                raise ValueError(
                    "%s in %s should be a plain column reference such as `df.col` "
                    "or `col('column')`" % (val, field_name)
                )
        elif isinstance(val, DataFrame):
            # Reuse the temp view if this DataFrame was already registered;
            # otherwise register it under a unique generated name.
            for df, n in self._temp_views:
                if df is val:
                    return n
            df_name = "_pyspark_%s" % str(uuid.uuid4()).replace("-", "")
            self._temp_views.append((val, df_name))
            val.createOrReplaceTempView(df_name)
            return df_name
        elif isinstance(val, str):
            return lit(val)._jc.expr().sql()  # for escaped characters.
        else:
            return val

    def clear(self) -> None:
        # Drop every temp view registered while formatting the query.
        for _, n in self._temp_views:
            self._session.catalog.dropTempView(n)
        self._temp_views = []
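To make the single-use contract concrete, here is a hedged sketch of how `SparkSession.sql` is expected to drive the formatter: format only when `kwargs` are given, then drop the registered temp views once the query has been issued. The helper name `sql_with_formatter` and the import path `pyspark.sql.sql_formatter` are assumptions for illustration; in this commit the actual wiring lives inside `SparkSession.sql` itself.

```python
from typing import Any

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.sql_formatter import SQLStringFormatter  # assumed module path


def sql_with_formatter(session: SparkSession, query: str, **kwargs: Any) -> DataFrame:
    # Without kwargs, behave exactly like the existing API: no formatting at all.
    if len(kwargs) == 0:
        return session.sql(query)
    formatter = SQLStringFormatter(session)
    try:
        # DataFrames are registered as temp views and replaced by their view names,
        # plain column references become SQL attribute names, strings become
        # escaped SQL literals, and any other value is substituted as-is.
        return session.sql(formatter.format(query, **kwargs))
    finally:
        # Honor the single-use contract: drop the temp views the formatter registered.
        formatter.clear()


# Example usage (assuming an active Spark session):
# spark = SparkSession.builder.getOrCreate()
# mydf = spark.range(10)
# sql_with_formatter(spark, "SELECT * FROM {tbl} WHERE id < {n}", tbl=mydf, n=5).show()
```

The `finally` block mirrors the constraint stated in the class docstring: `clear()` must run after the query has been issued so the generated `_pyspark_...` temp views do not leak into the session catalog.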