From 851487a64cdf6e7c700f579319536209cfb0bddd Mon Sep 17 00:00:00 2001 From: "mingming.ge@kyligence.io" Date: Thu, 15 Aug 2019 10:01:21 +0800 Subject: [PATCH] upgrade parquet 1.12.0-kylin-r3 --- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/avro/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 4 ++-- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- .../kubernetes/integration-tests/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- .../apache/spark/sql/internal/SQLConf.scala | 5 +++++ sql/core/pom.xml | 2 +- .../parquet/ParquetWriteSupport.scala | 18 ++++++++++++++++-- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 42 files changed, 62 insertions(+), 43 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index c9aef1dd703d5..38aca5b450269 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 1269eb4d9cf38..c0ae7947cdebf 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index f285b3edf37f4..8c05cd9af70e1 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index df08788451fac..a311f2d0bf380 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index d1b6ab1d4539b..0b7dd909f96ad 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index c497e5673b4de..4958528313f0f 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 324cb14b66151..23b63740fea44 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index ef03eb52044e0..02ac2deb5eed1 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index d3defa86a159f..e38ead3ee15de 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../pom.xml diff --git a/examples/pom.xml b/examples/pom.xml index a32bbe3fc60ac..5e33c2f099535 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../pom.xml diff --git a/external/avro/pom.xml b/external/avro/pom.xml index 6fb062cd68520..6935cdc4e483a 100644 --- a/external/avro/pom.xml +++ b/external/avro/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index ccf92dbaac068..60369002628f6 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index f761586cf1e28..18e7874aa470b 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index f9657e75421f8..8899350b8eaa8 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index bfaa7df88830f..915517563b1be 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 8fd6a36f59d39..4774ea1cc5102 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 66aa7877278c6..665dd6a333385 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 4582047b0bce9..69b082c2d159f 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 7915361f4d1ce..67518221b22c1 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 5448156d096b9..34d09c23cec0f 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 6f3ead11da275..8496787e5d24f 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index de16094d22fa2..ad89d4effc6ca 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 28930d077c807..6b1f6ad262ef2 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 8227de9e3c73e..ae2e736b29df0 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 4104e0f52e070..d3f1bd009042b 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 5c8e2d77f4171..8cd7e00a6a659 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index b0c3a680e95cb..209da0c9e6b60 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index d9c3dac1b4bd9..0b9e1d0929135 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../pom.xml diff --git a/pom.xml b/pom.xml index 214842c1232ba..026d305f60230 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 pom Spark Project Parent POM http://spark.apache.org/ @@ -130,7 +130,7 @@ 1.2.1 10.12.1.1 - 1.12.0-kylin-r2 + 1.12.0-kylin-r3 7.0.13 1.5.2 nohive diff --git a/repl/pom.xml b/repl/pom.xml index 1f7135a698bdf..cf21e83277b07 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index eddbd16825f85..71f349c48bd8b 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../../pom.xml diff --git a/resource-managers/kubernetes/integration-tests/pom.xml b/resource-managers/kubernetes/integration-tests/pom.xml index dedae69605ab1..34433db226353 100644 --- a/resource-managers/kubernetes/integration-tests/pom.xml +++ b/resource-managers/kubernetes/integration-tests/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 1d1006f443a29..acb83467baea2 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index d2f9bf0504b9a..96c05b23c425d 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index b68e47df90d17..d85293e9a4c25 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 7d24a5138892d..02c0424c5c1a5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -351,6 +351,11 @@ object SQLConf { val INT96, TIMESTAMP_MICROS, TIMESTAMP_MILLIS = Value } + val PARQUET_CELL_SIZE_LIMIT = buildConf("spark.sql.parquet.cellSizeLimit") + .doc(s"Parquet file cell size limit. default 512 * 1024") + .longConf + .createWithDefault(512 * 1024) + val PARQUET_OUTPUT_TIMESTAMP_TYPE = buildConf("spark.sql.parquet.outputTimestampType") .doc("Sets which Parquet timestamp type to use when Spark writes data to Parquet files. " + "INT96 is a non-standard but commonly used timestamp type in Parquet. TIMESTAMP_MICROS " + diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 1c7e48fd6b3bb..727a8a0c71b08 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala index b40b8c2e61f33..20239ee7af300 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala @@ -68,6 +68,8 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit // Which parquet timestamp type to use when writing. private var outputTimestampType: SQLConf.ParquetOutputTimestampType.Value = _ + private val parquetCellSizeLimit = SQLConf.PARQUET_CELL_SIZE_LIMIT.defaultValue.get + // Reusable byte array used to write timestamps as Parquet INT96 values private val timestampBuffer = new Array[Byte](12) @@ -115,6 +117,14 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit } } + @inline private def checkCellSize(cellLength: Long): Unit = { + if (cellLength >= parquetCellSizeLimit) { + logInfo(s"single cell size: $cellLength ") + logInfo(s"spark.sql.parquet.cellSizeLimit: $parquetCellSizeLimit ") + this.needCheckRowSize = true + } + } + private def writeFields( row: InternalRow, schema: StructType, fieldWriters: Array[ValueWriter]): Unit = { var i = 0 @@ -160,8 +170,10 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit case StringType => (row: SpecializedGetters, ordinal: Int) => + val bytes = row.getUTF8String(ordinal).getBytes + checkCellSize(bytes.length) recordConsumer.addBinary( - Binary.fromReusedByteArray(row.getUTF8String(ordinal).getBytes)) + Binary.fromReusedByteArray(bytes)) case TimestampType => outputTimestampType match { @@ -184,7 +196,9 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit case BinaryType => (row: SpecializedGetters, ordinal: Int) => - recordConsumer.addBinary(Binary.fromReusedByteArray(row.getBinary(ordinal))) + val bytes = row.getBinary(ordinal) + checkCellSize(bytes.length) + recordConsumer.addBinary(Binary.fromReusedByteArray(bytes)) case DecimalType.Fixed(precision, scale) => makeDecimalWriter(precision, scale) diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index babb6f12fc108..2b908c427c19d 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index be9c5aca4c367..3665c7e7d497c 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 8f0c26570ee9e..96effd5b12476 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 73f2293d07b72..4dbbaa7d3e232 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.4.1-kylin-r13 + 2.4.1-kylin-r14 ../pom.xml