Skip to content

[SPARK-52788][SQL] Fix error when converting BinaryType values to XML #51470

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import org.apache.hadoop.shaded.com.ctc.wstx.api.WstxOutputProperties

import org.apache.spark.SparkIllegalArgumentException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.ToStringBase
import org.apache.spark.sql.catalyst.util.{ArrayData, DateFormatter, DateTimeUtils, MapData, TimestampFormatter}
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
import org.apache.spark.sql.types._
Expand Down Expand Up @@ -63,6 +64,8 @@ class StaxXmlGenerator(
legacyFormat = FAST_DATE_FORMAT,
isParsing = false)

// Formatter used to render BinaryType values (Array[Byte]) as text before they are
// written out as XML character data; obtained from ToStringBase.getBinaryFormatter.
// NOTE(review): the textual encoding presumably follows the configured binary output
// style (the golden files show several different renderings) — confirm.
private val binaryFormatter = ToStringBase.getBinaryFormatter

private val gen = {
val factory = XMLOutputFactory.newInstance()
// to_xml disables structure validation to allow multiple root tags
Expand Down Expand Up @@ -197,6 +200,7 @@ class StaxXmlGenerator(
case (DecimalType(), v: Decimal) => gen.writeCharacters(v.toString)
case (ByteType, v: Byte) => gen.writeCharacters(v.toString)
case (BooleanType, v: Boolean) => gen.writeCharacters(v.toString)
case (BinaryType, v: Array[Byte]) => gen.writeCharacters(binaryFormatter(v).toString)

// To support round-tripping when reading and writing XML files, [[ArrayType]] cannot have
// [[ArrayType]] as element type. It always wraps the element with [[StructType]]. So,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,10 @@ SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312
-- !query analysis
Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) AS to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x]
+- OneRowRelation


-- !query
select to_xml(named_struct('name', binary('Eason'), 'birth', 2018, 'org', binary('Kindergarten Cop')))
-- !query analysis
Project [to_xml(named_struct(name, cast(Eason as binary), birth, 2018, org, cast(Kindergarten Cop as binary)), Some(America/Los_Angeles)) AS to_xml(named_struct(name, Eason, birth, 2018, org, Kindergarten Cop))#x]
+- OneRowRelation
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,10 @@ SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312
-- !query analysis
Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) AS to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x]
+- OneRowRelation


-- !query
select to_xml(named_struct('name', binary('Eason'), 'birth', 2018, 'org', binary('Kindergarten Cop')))
-- !query analysis
Project [to_xml(named_struct(name, cast(Eason as binary), birth, 2018, org, cast(Kindergarten Cop as binary)), Some(America/Los_Angeles)) AS to_xml(named_struct(name, Eason, birth, 2018, org, Kindergarten Cop))#x]
+- OneRowRelation
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,10 @@ SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312
-- !query analysis
Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) AS to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x]
+- OneRowRelation


-- !query
select to_xml(named_struct('name', binary('Eason'), 'birth', 2018, 'org', binary('Kindergarten Cop')))
-- !query analysis
Project [to_xml(named_struct(name, cast(Eason as binary), birth, 2018, org, cast(Kindergarten Cop as binary)), Some(America/Los_Angeles)) AS to_xml(named_struct(name, Eason, birth, 2018, org, Kindergarten Cop))#x]
+- OneRowRelation
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,10 @@ SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312
-- !query analysis
Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) AS to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x]
+- OneRowRelation


-- !query
select to_xml(named_struct('name', binary('Eason'), 'birth', 2018, 'org', binary('Kindergarten Cop')))
-- !query analysis
Project [to_xml(named_struct(name, cast(Eason as binary), birth, 2018, org, cast(Kindergarten Cop as binary)), Some(America/Los_Angeles)) AS to_xml(named_struct(name, Eason, birth, 2018, org, Kindergarten Cop))#x]
+- OneRowRelation
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,10 @@ SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312
-- !query analysis
Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) AS to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x]
+- OneRowRelation


-- !query
select to_xml(named_struct('name', binary('Eason'), 'birth', 2018, 'org', binary('Kindergarten Cop')))
-- !query analysis
Project [to_xml(named_struct(name, cast(Eason as binary), birth, 2018, org, cast(Kindergarten Cop as binary)), Some(America/Los_Angeles)) AS to_xml(named_struct(name, Eason, birth, 2018, org, Kindergarten Cop))#x]
+- OneRowRelation
3 changes: 2 additions & 1 deletion sql/core/src/test/resources/sql-tests/inputs/binary.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ SELECT X'';
SELECT X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333';
SELECT CAST('Spark' as BINARY);
SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY));
SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'));
SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'));
select to_xml(named_struct('name', binary('Eason'), 'birth', 2018, 'org', binary('Kindergarten Cop')));
12 changes: 12 additions & 0 deletions sql/core/src/test/resources/sql-tests/results/binary.sql.out
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,15 @@ SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312
struct<to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')):string>
-- !query output
1,Eason Yao 2018-11-17:13:33:33


-- !query
select to_xml(named_struct('name', binary('Eason'), 'birth', 2018, 'org', binary('Kindergarten Cop')))
-- !query schema
struct<to_xml(named_struct(name, Eason, birth, 2018, org, Kindergarten Cop)):string>
-- !query output
<ROW>
<name>Eason</name>
<birth>2018</birth>
<org>Kindergarten Cop</org>
</ROW>
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,15 @@ SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312
struct<to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')):string>
-- !query output
1,RWFzb24gWWFvIDIwMTgtMTEtMTc6MTM6MzM6MzM


-- !query
select to_xml(named_struct('name', binary('Eason'), 'birth', 2018, 'org', binary('Kindergarten Cop')))
-- !query schema
struct<to_xml(named_struct(name, Eason, birth, 2018, org, Kindergarten Cop)):string>
-- !query output
<ROW>
<name>RWFzb24</name>
<birth>2018</birth>
<org>S2luZGVyZ2FydGVuIENvcA</org>
</ROW>
12 changes: 12 additions & 0 deletions sql/core/src/test/resources/sql-tests/results/binary_basic.sql.out
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,15 @@ SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312
struct<to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')):string>
-- !query output
1,"[69, 97, 115, 111, 110, 32, 89, 97, 111, 32, 50, 48, 49, 56, 45, 49, 49, 45, 49, 55, 58, 49, 51, 58, 51, 51, 58, 51, 51]"


-- !query
select to_xml(named_struct('name', binary('Eason'), 'birth', 2018, 'org', binary('Kindergarten Cop')))
-- !query schema
struct<to_xml(named_struct(name, Eason, birth, 2018, org, Kindergarten Cop)):string>
-- !query output
<ROW>
<name>[69, 97, 115, 111, 110]</name>
<birth>2018</birth>
<org>[75, 105, 110, 100, 101, 114, 103, 97, 114, 116, 101, 110, 32, 67, 111, 112]</org>
</ROW>
12 changes: 12 additions & 0 deletions sql/core/src/test/resources/sql-tests/results/binary_hex.sql.out
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,15 @@ SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312
struct<to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')):string>
-- !query output
1,4561736F6E2059616F20323031382D31312D31373A31333A33333A3333


-- !query
select to_xml(named_struct('name', binary('Eason'), 'birth', 2018, 'org', binary('Kindergarten Cop')))
-- !query schema
struct<to_xml(named_struct(name, Eason, birth, 2018, org, Kindergarten Cop)):string>
-- !query output
<ROW>
<name>4561736F6E</name>
<birth>2018</birth>
<org>4B696E64657267617274656E20436F70</org>
</ROW>
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,15 @@ SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312
struct<to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')):string>
-- !query output
1,[45 61 73 6F 6E 20 59 61 6F 20 32 30 31 38 2D 31 31 2D 31 37 3A 31 33 3A 33 33 3A 33 33]


-- !query
select to_xml(named_struct('name', binary('Eason'), 'birth', 2018, 'org', binary('Kindergarten Cop')))
-- !query schema
struct<to_xml(named_struct(name, Eason, birth, 2018, org, Kindergarten Cop)):string>
-- !query output
<ROW>
<name>[45 61 73 6F 6E]</name>
<birth>2018</birth>
<org>[4B 69 6E 64 65 72 67 61 72 74 65 6E 20 43 6F 70]</org>
</ROW>