diff --git a/.gitignore b/.gitignore index f883fa99..2d3b91cf 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,21 @@ dependency-reduced-pom.xml target .flattened-pom.xml +spark-load/.idea/ +spark-load/target +spark-load/spark-load-core/dependency-reduced-pom.xml +spark-load/spark-load-core/output/ +spark-load/spark-load-core/target/ +spark-load/spark-load-core/.idea/ +spark-load/spark-load-dist/dependency-reduced-pom.xml +spark-load/spark-load-dist/target/ +spark-load/spark-load-dpp/dependency-reduced-pom.xml +spark-load/spark-load-dpp/.flattened-pom.xml +spark-load/spark-load-dpp/target/ +spark-load/spark-load-common/dependency-reduced-pom.xml +spark-load/spark-load-common/target/ + + ### Java template # Compiled class file *.class diff --git a/spark-doris-connector/spark-doris-connector-it/src/test/java/org/apache/doris/spark/sql/TestSparkConnector.scala b/spark-doris-connector/spark-doris-connector-it/src/test/java/org/apache/doris/spark/sql/TestSparkConnector.scala index 1242a9ba..a5e756c1 100644 --- a/spark-doris-connector/spark-doris-connector-it/src/test/java/org/apache/doris/spark/sql/TestSparkConnector.scala +++ b/spark-doris-connector/spark-doris-connector-it/src/test/java/org/apache/doris/spark/sql/TestSparkConnector.scala @@ -19,7 +19,8 @@ package org.apache.doris.spark.sql import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} -import org.junit.{Ignore, Test} +import org.junit.Ignore +import org.junit.Test // This test need real connect info to run. // Set the connect info before comment out this @Ignore diff --git a/spark-load/build.sh b/spark-load/build.sh new file mode 100755 index 00000000..a8ca1c73 --- /dev/null +++ b/spark-load/build.sh @@ -0,0 +1,175 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +############################################################## +# This script is used to compile Spark-Load +# Usage: +# sh build.sh +# +############################################################## + +# Bugzilla 37848: When no TTY is available, don't output to console +have_tty=0 +# shellcheck disable=SC2006 +if [[ "`tty`" != "not a tty" ]]; then + have_tty=1 +fi + +# Bugzilla 37848: When no TTY is available, don't output to console +have_tty=0 +# shellcheck disable=SC2006 +if [[ "`tty`" != "not a tty" ]]; then + have_tty=1 +fi + + # Only use colors if connected to a terminal +if [[ ${have_tty} -eq 1 ]]; then + PRIMARY=$(printf '\033[38;5;082m') + RED=$(printf '\033[31m') + GREEN=$(printf '\033[32m') + YELLOW=$(printf '\033[33m') + BLUE=$(printf '\033[34m') + BOLD=$(printf '\033[1m') + RESET=$(printf '\033[0m') +else + PRIMARY="" + RED="" + GREEN="" + YELLOW="" + BLUE="" + BOLD="" + RESET="" +fi + +echo_r () { + # Color red: Error, Failed + [[ $# -ne 1 ]] && return 1 + # shellcheck disable=SC2059 + printf "[%sDoris%s] %s$1%s\n" $BLUE $RESET $RED $RESET +} + +echo_g () { + # Color green: Success + [[ $# -ne 1 ]] && return 1 + # shellcheck disable=SC2059 + printf "[%sDoris%s] %s$1%s\n" $BLUE $RESET $GREEN $RESET +} + +echo_y () { + # Color yellow: Warning + [[ $# -ne 1 ]] && return 1 + # shellcheck disable=SC2059 + printf "[%sDoris%s] %s$1%s\n" $BLUE $RESET $YELLOW $RESET +} + +echo_w () { + # Color yellow: White + [[ $# -ne 1 ]] && return 1 + # shellcheck disable=SC2059 + printf "[%sDoris%s] %s$1%s\n" $BLUE $RESET $WHITE $RESET +} + +# OS specific support. $var _must_ be set to either true or false. +cygwin=false +os400=false +# shellcheck disable=SC2006 +case "`uname`" in +CYGWIN*) cygwin=true;; +OS400*) os400=true;; +esac + +# resolve links - $0 may be a softlink +PRG="$0" + +while [[ -h "$PRG" ]]; do + # shellcheck disable=SC2006 + ls=`ls -ld "$PRG"` + # shellcheck disable=SC2006 + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + # shellcheck disable=SC2006 + PRG=`dirname "$PRG"`/"$link" + fi +done + +# Get standard environment variables +# shellcheck disable=SC2006 +ROOT=$(cd "$(dirname "$PRG")" &>/dev/null && pwd) +export DORIS_HOME=$(cd "$ROOT/../" &>/dev/null && pwd) + +. "${DORIS_HOME}"/env.sh + +# include custom environment variables +if [[ -f ${DORIS_HOME}/custom_env.sh ]]; then + . "${DORIS_HOME}"/custom_env.sh +fi + +selectSpark() { + echo 'Spark-Load supports multiple versions of spark. Which version do you need ?' + select spark in "2.x" "3.x" "other" + do + case $spark in + "2.x") + return 1 + ;; + "3.x") + return 2 + ;; + *) + echo "invalid selected, exit.." + exit 1 + ;; + esac + done +} + +SPARK_VERSION=0 +selectSpark +SparkVer=$? +if [ ${SparkVer} -eq 1 ]; then + SPARK_VERSION="spark2" + SCALA_VERSION="scala_2.11" +elif [ ${SparkVer} -eq 2 ]; then + SPARK_VERSION="spark3" + SCALA_VERSION="scala_2.12" +fi + +echo_g " spark load run based on : ${SPARK_VERSION} and ${SCALA_VERSION}" +echo_g " build starting..." + +${MVN_BIN} clean package -P${SPARK_VERSION},${SCALA_VERSION} "$@" + +EXIT_CODE=$? +if [ $EXIT_CODE -eq 0 ]; then + DIST_DIR=${DORIS_HOME}/dist + [ ! 
-d "$DIST_DIR" ] && mkdir "$DIST_DIR" + dist_jar=$(ls "${ROOT}"/target | grep "spark-load-") + rm -rf "${DIST_DIR}"/"${dist_jar}" + cp "${ROOT}"/target/"${dist_jar}" "$DIST_DIR" + + echo_g "*****************************************************************" + echo_g "Successfully build Spark-Load" + echo_g "dist: $DIST_DIR/$dist_jar " + echo_g "*****************************************************************" + exit 0; +else + echo_r "Failed build Spark-Load" + exit $EXIT_CODE; +fi diff --git a/spark-load/pom.xml b/spark-load/pom.xml new file mode 100644 index 00000000..480d4f92 --- /dev/null +++ b/spark-load/pom.xml @@ -0,0 +1,418 @@ + + + + 4.0.0 + + org.apache.doris + spark-load + ${revision} + pom + + spark-load-common + spark-load-core + spark-load-dpp + spark-load-dist + + + + 1.8 + 1.8 + UTF-8 + 24.0.0-SNAPSHOT + 1.13 + 3.9 + 3.3.6 + 4.1.104.Final + 1.13.1 + 3.2.2 + 4.0.2 + 32.1.2-jre + 2.14.2 + 1.18.30 + 1.4 + 4.5.13 + 5.8.2 + 1.49 + 2.17.1 + 2.0.7 + 1.2 + 1.12.669 + 0.8.13 + 2.9.1 + + + + + + + commons-codec + commons-codec + ${commons-codec.version} + + + + org.apache.commons + commons-lang3 + ${commons-lang3.version} + + + + + + + + + + + org.apache.spark + spark-core_${scala.major.version} + ${spark.version} + provided + + + org.apache.logging.log4j + log4j-1.2-api + + + org.apache.logging.log4j + log4j-api + + + org.apache.logging.log4j + log4j-core + + + commons-logging + commons-logging + + + org.slf4j + slf4j-api + + + + + + io.netty + netty-all + ${netty-all.version} + + + + + org.apache.spark + spark-sql_${scala.major.version} + ${spark.version} + provided + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + + + org.slf4j + slf4j-api + + + + + org.apache.hadoop + hadoop-client + ${hadoop.version} + + + org.slf4j + slf4j-api + + + + + org.apache.hadoop + hadoop-aws + ${hadoop.version} + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + servlet-api + javax.servlet + + + + com.amazonaws + aws-java-sdk-s3 + + + com.amazonaws + aws-java-sdk-bundle + + + + + com.amazonaws + aws-java-sdk-s3 + ${aws-java-sdk.version} + + + com.amazonaws + aws-java-sdk-glue + ${aws-java-sdk.version} + + + com.amazonaws + aws-java-sdk-dynamodb + ${aws-java-sdk.version} + + + + com.amazonaws + aws-java-sdk-logs + ${aws-java-sdk.version} + + + org.apache.parquet + parquet-column + ${parquet.version} + + + org.apache.parquet + parquet-hadoop + ${parquet.version} + + + org.apache.parquet + parquet-common + ${parquet.version} + + + commons-collections + commons-collections + ${commons-collections.version} + + + org.scala-lang + scala-library + ${scala.version} + provided + + + com.esotericsoftware + kryo-shaded + ${kryo.version} + + + org.apache.spark + spark-catalyst_${scala.major.version} + ${spark.version} + + + org.slf4j + slf4j-api + + + provided + + + com.google.guava + guava + ${guava.version} + + + + com.fasterxml.jackson.core + jackson-databind + ${jackson.version} + + + + org.projectlombok + lombok + ${lombok.veresion} + provided + + + + commons-cli + commons-cli + ${commons-cli.version} + + + org.apache.spark + spark-launcher_${scala.major.version} + ${spark.version} + + + + org.apache.httpcomponents + httpclient + ${httpclient.version} + + + + + org.junit.jupiter + junit-jupiter-engine + ${junit.version} + test + + + + org.junit.jupiter + junit-jupiter-params + ${junit.version} + test + + + + org.jmockit + jmockit + ${jmockit.version} + test + + + + + org.apache.logging.log4j + log4j-core + ${log4j.version} + + + + org.apache.logging.log4j + log4j-api + ${log4j.version} + + + 
+ org.apache.logging.log4j + log4j-slf4j-impl + ${log4j.version} + + + + + org.slf4j + slf4j-api + ${slf4j.version} + + + + commons-logging + commons-logging + ${commons-logging.version} + + + org.roaringbitmap + RoaringBitmap + ${RoaringBitmap.version} + + + + + com.google.code.gson + gson + ${gson.version} + + + ${project.groupId} + spark-load-common + ${project.version} + + + + + + + spark2 + + false + + + 2.4.8 + + + + spark3 + + true + + + 3.4.1 + + + + scala_2.11 + + false + + + 2.11.8 + 2.11 + + + + scala_2.12 + + true + + + 2.12.10 + 2.12 + + + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.1.1 + + + org.apache.maven.plugins + maven-shade-plugin + 3.2.1 + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.2 + + + org.codehaus.mojo + flatten-maven-plugin + 1.4.1 + + + + + + \ No newline at end of file diff --git a/spark-load/spark-load-common/pom.xml b/spark-load/spark-load-common/pom.xml new file mode 100644 index 00000000..4a0e96b7 --- /dev/null +++ b/spark-load/spark-load-common/pom.xml @@ -0,0 +1,67 @@ + + + + + 4.0.0 + + org.apache.doris + spark-load + ${revision} + + + spark-load-common + jar + + 8 + 8 + UTF-8 + + + + + com.fasterxml.jackson.core + jackson-databind + + + com.google.code.gson + gson + + + com.google.guava + guava + + + org.roaringbitmap + RoaringBitmap + + + commons-codec + commons-codec + + + org.junit.jupiter + junit-jupiter-engine + test + + + + \ No newline at end of file diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/DppResult.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/DppResult.java new file mode 100644 index 00000000..7a2a9cb4 --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/DppResult.java @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.common; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +import java.io.Serializable; + +/** + * Copied from Apache Doris org.apache.doris.sparkdpp.DppResult + */ +public class DppResult implements Serializable { + + public boolean isSuccess; + + public String failedReason; + + public long scannedRows; + + public long fileNumber; + + public long fileSize; + + public long normalRows; + + public long abnormalRows; + + public long unselectRows; + + // only part of abnormal rows will be returned + public String partialAbnormalRows; + + public long scannedBytes; + + public DppResult() { + isSuccess = true; + failedReason = ""; + scannedRows = 0; + fileNumber = 0; + fileSize = 0; + normalRows = 0; + abnormalRows = 0; + unselectRows = 0; + partialAbnormalRows = ""; + scannedBytes = 0; + } + + @JsonCreator + public DppResult(@JsonProperty(value = "is_success", required = true) boolean isSuccess, + @JsonProperty(value = "failed_reason", required = true) String failedReason, + @JsonProperty(value = "scanned_rows", required = true) long scannedRows, + @JsonProperty(value = "file_number", required = true) long fileNumber, + @JsonProperty(value = "file_size", required = true) long fileSize, + @JsonProperty(value = "normal_rows", required = true) long normalRows, + @JsonProperty(value = "abnormal_rows", required = true) long abnormalRows, + @JsonProperty(value = "unselect_rows", required = true) long unselectRows, + @JsonProperty("partial_abnormal_rows") String partialAbnormalRows, + @JsonProperty("scanned_bytes") long scannedBytes) { + this.isSuccess = isSuccess; + this.failedReason = failedReason; + this.scannedRows = scannedRows; + this.fileNumber = fileNumber; + this.fileSize = fileSize; + this.normalRows = normalRows; + this.abnormalRows = abnormalRows; + this.unselectRows = unselectRows; + this.partialAbnormalRows = partialAbnormalRows; + this.scannedBytes = scannedBytes; + } + +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java new file mode 100644 index 00000000..db4a65c2 --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/BitmapValue.java @@ -0,0 +1,423 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.common.io; + +import org.roaringbitmap.Util; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +/** + * Copied from Apache Doris + */ +public class BitmapValue { + + public static final int EMPTY = 0; + public static final int SINGLE32 = 1; + public static final int BITMAP32 = 2; + public static final int SINGLE64 = 3; + public static final int BITMAP64 = 4; + + public static final int SINGLE_VALUE = 1; + public static final int BITMAP_VALUE = 2; + + public static final long UNSIGNED_32BIT_INT_MAX_VALUE = 4294967295L; + + private int bitmapType; + private long singleValue; + private Roaring64Map bitmap; + + public BitmapValue() { + bitmapType = EMPTY; + } + + public void add(int value) { + add(Util.toUnsignedLong(value)); + } + + public void add(long value) { + switch (bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + singleValue = value; + bitmapType = SINGLE_VALUE; + break; + case SINGLE_VALUE: + if (this.singleValue != value) { + bitmap = new Roaring64Map(); + bitmap.add(value); + bitmap.add(singleValue); + bitmapType = BITMAP_VALUE; + } + break; + case BITMAP_VALUE: + bitmap.addLong(value); + break; + } + } + + public boolean contains(int value) { + return contains(Util.toUnsignedLong(value)); + } + + public boolean contains(long value) { + switch (bitmapType) { + case EMPTY: + return false; + case SINGLE_VALUE: + return singleValue == value; + case BITMAP_VALUE: + return bitmap.contains(value); + default: + return false; + } + } + + public long cardinality() { + switch (bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + return 0; + case SINGLE_VALUE: + return 1; + case BITMAP_VALUE: + return bitmap.getLongCardinality(); + } + return 0; + } + + public void serialize(DataOutput output) throws IOException { + switch (bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + output.writeByte(EMPTY); + break; + case SINGLE_VALUE: + // is 32-bit enough + // FE is big end but BE is little end. + if (isLongValue32bitEnough(singleValue)) { + output.write(SINGLE32); + output.writeInt(Integer.reverseBytes((int) singleValue)); + } else { + output.writeByte(SINGLE64); + output.writeLong(Long.reverseBytes(singleValue)); + } + break; + case BITMAP_VALUE: + bitmap.serialize(output); + break; + } + } + + public void deserialize(DataInput input) throws IOException { + clear(); + int bitmapType = input.readByte(); + switch (bitmapType) { + case EMPTY: + break; + case SINGLE32: + singleValue = Util.toUnsignedLong(Integer.reverseBytes(input.readInt())); + this.bitmapType = SINGLE_VALUE; + break; + case SINGLE64: + singleValue = Long.reverseBytes(input.readLong()); + this.bitmapType = SINGLE_VALUE; + break; + case BITMAP32: + case BITMAP64: + bitmap = bitmap == null ? new Roaring64Map() : bitmap; + bitmap.deserialize(input, bitmapType); + this.bitmapType = BITMAP_VALUE; + break; + default: + throw new RuntimeException(String.format("unknown bitmap type %s ", bitmapType)); + } + } + + // In-place bitwise AND (intersection) operation. The current bitmap is modified. 
+ public void and(BitmapValue other) { + switch (other.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + clear(); + break; + case SINGLE_VALUE: + switch (this.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + if (this.singleValue != other.singleValue) { + clear(); + } + break; + case BITMAP_VALUE: + if (!this.bitmap.contains(other.singleValue)) { + clear(); + } else { + clear(); + this.singleValue = other.singleValue; + this.bitmapType = SINGLE_VALUE; + } + break; + } + break; + case BITMAP_VALUE: + switch (this.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + if (!other.bitmap.contains(this.singleValue)) { + clear(); + } + break; + case BITMAP_VALUE: + this.bitmap.and(other.bitmap); + convertToSmallerType(); + break; + } + break; + } + } + + // In-place bitwise OR (union) operation. The current bitmap is modified. + public void or(BitmapValue other) { + switch (other.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + add(other.singleValue); + break; + case BITMAP_VALUE: + switch (this.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + // deep copy the bitmap in case of multi-rollups update the bitmap repeatedly + this.bitmap = new Roaring64Map(); + this.bitmap.or(other.bitmap); + this.bitmapType = BITMAP_VALUE; + break; + case SINGLE_VALUE: + this.bitmap = new Roaring64Map(); + this.bitmap.or(other.bitmap); + this.bitmap.add(this.singleValue); + this.bitmapType = BITMAP_VALUE; + break; + case BITMAP_VALUE: + this.bitmap.or(other.bitmap); + break; + } + break; + } + } + + public void remove(long value) { + switch (this.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + if (this.singleValue == value) { + clear(); + } + break; + case BITMAP_VALUE: + this.bitmap.removeLong(value); + convertToSmallerType(); + break; + } + } + + // In-place bitwise ANDNOT (difference) operation. The current bitmap is modified + public void not(BitmapValue other) { + switch (other.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + remove(other.singleValue); + break; + case BITMAP_VALUE: + switch (this.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + if (other.bitmap.contains(this.singleValue)) { + clear(); + } + break; + case BITMAP_VALUE: + this.bitmap.andNot(other.bitmap); + convertToSmallerType(); + break; + } + break; + } + } + + // In-place bitwise XOR (symmetric difference) operation. 
The current bitmap is modified + public void xor(BitmapValue other) { + switch (other.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + switch (this.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + add(other.singleValue); + break; + case SINGLE_VALUE: + if (this.singleValue != other.singleValue) { + add(other.singleValue); + } else { + clear(); + } + break; + case BITMAP_VALUE: + if (!this.bitmap.contains(other.singleValue)) { + this.bitmap.add(other.singleValue); + } else { + this.bitmap.removeLong(other.singleValue); + convertToSmallerType(); + } + break; + } + break; + case BITMAP_VALUE: + switch (this.bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + this.bitmap = other.bitmap; + this.bitmapType = BITMAP_VALUE; + break; + case SINGLE_VALUE: + this.bitmap = other.bitmap; + this.bitmapType = BITMAP_VALUE; + if (this.bitmap.contains(this.singleValue)) { + this.bitmap.removeLong(this.singleValue); + } else { + this.bitmap.add(this.bitmapType); + } + break; + case BITMAP_VALUE: + this.bitmap.xor(other.bitmap); + convertToSmallerType(); + break; + } + break; + } + } + + @Override + public boolean equals(Object other) { + if (other == null || !(other instanceof BitmapValue)) { + return false; + } + boolean ret = false; + if (this.bitmapType != ((BitmapValue) other).bitmapType) { + return false; + } + switch (((BitmapValue) other).bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + ret = true; + break; + case SINGLE_VALUE: + ret = this.singleValue == ((BitmapValue) other).singleValue; + break; + case BITMAP_VALUE: + ret = bitmap.equals(((BitmapValue) other).bitmap); + } + return ret; + } + + /** + * usage note: + * now getSizeInBytes is different from be' impl + * The reason is that java's roaring didn't implement method #shrinkToFit but be's getSizeInBytes need it + * Implementing java's shrinkToFit means refactor roaring whose fields are all unaccess in Doris Fe's package + * That would be an another big project + */ + // TODO(wb): keep getSizeInBytes consistent with be and refactor roaring + public long getSizeInBytes() { + long size = 0; + switch (bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + size = 1; + break; + case SINGLE_VALUE: + if (isLongValue32bitEnough(singleValue)) { + size = 1 + 4; + } else { + size = 1 + 8; + } + break; + case BITMAP_VALUE: + size = 1 + bitmap.getSizeInBytes(); + } + return size; + } + + @Override + public String toString() { + String toStringStr = "{}"; + switch (bitmapType) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case EMPTY: + break; + case SINGLE_VALUE: + toStringStr = String.format("{%s}", singleValue); + break; + case BITMAP_VALUE: + toStringStr = this.bitmap.toString(); + break; + } + return toStringStr; + } + + public void clear() { + this.bitmapType = EMPTY; + this.singleValue = -1; + this.bitmap = null; + } + + private void convertToSmallerType() { + if (bitmapType == BITMAP_VALUE) { + if (bitmap.getLongCardinality() == 0) { + this.bitmap = null; + this.bitmapType = EMPTY; + } else if (bitmap.getLongCardinality() == 1) { + this.singleValue = bitmap.select(0); + this.bitmapType = SINGLE_VALUE; + this.bitmap = null; + } + } + } + + private boolean isLongValue32bitEnough(long value) { + return value <= UNSIGNED_32BIT_INT_MAX_VALUE; + } + + // just for ut + public int getBitmapType() { + return bitmapType; + } + + // just for ut + 
public boolean is32BitsEnough() { + switch (bitmapType) { + case EMPTY: + return true; + case SINGLE_VALUE: + return isLongValue32bitEnough(singleValue); + case BITMAP_VALUE: + return bitmap.is32BitsEnough(); + default: + return false; + } + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Codec.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Codec.java new file mode 100644 index 00000000..3c57a0f1 --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Codec.java @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.common.io; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +/** + * Copied from Apache Doris + */ +public class Codec { + + // not support encode negative value now + public static void encodeVarint64(long source, DataOutput out) throws IOException { + assert source >= 0; + short B = 128; // CHECKSTYLE IGNORE THIS LINE + + while (source >= B) { + out.write((int) (source & (B - 1) | B)); + source = source >> 7; + } + out.write((int) (source & (B - 1))); + } + + // not support decode negative value now + public static long decodeVarint64(DataInput in) throws IOException { + long result = 0; + int shift = 0; + short B = 128; // CHECKSTYLE IGNORE THIS LINE + + while (true) { + int oneByte = in.readUnsignedByte(); + boolean isEnd = (oneByte & B) == 0; + result = result | ((long) (oneByte & B - 1) << (shift * 7)); + if (isEnd) { + break; + } + shift++; + } + + return result; + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java new file mode 100644 index 00000000..a28ea1d8 --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Hll.java @@ -0,0 +1,394 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.common.io; + +import org.apache.commons.codec.binary.StringUtils; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.math.BigInteger; +import java.util.HashSet; +import java.util.Set; + +/** + * Copied from Apache Doris + */ +public class Hll { + + public static final byte HLL_DATA_EMPTY = 0; + public static final byte HLL_DATA_EXPLICIT = 1; + public static final byte HLL_DATA_SPARSE = 2; + public static final byte HLL_DATA_FULL = 3; + + public static final int HLL_COLUMN_PRECISION = 14; + public static final int HLL_ZERO_COUNT_BITS = (64 - HLL_COLUMN_PRECISION); + public static final int HLL_EXPLICIT_INT64_NUM = 160; + public static final int HLL_SPARSE_THRESHOLD = 4096; + public static final int HLL_REGISTERS_COUNT = 16 * 1024; + public static final long M64 = 0xc6a4a7935bd1e995L; + public static final int R64 = 47; + public static final int SEED = 0xadc83b19; + private int type; + private Set hashSet; + private byte[] registers; + + public Hll() { + type = HLL_DATA_EMPTY; + this.hashSet = new HashSet<>(); + } + + public static byte getLongTailZeroNum(long hashValue) { + if (hashValue == 0) { + return 0; + } + long value = 1L; + byte idx = 0; + for (; ; idx++) { + if ((value & hashValue) != 0) { + return idx; + } + value = value << 1; + if (idx == 62) { + break; + } + } + return idx; + } + + private static long getLittleEndianLong(final byte[] data, final int index) { + return (((long) data[index] & 0xff)) + | (((long) data[index + 1] & 0xff) << 8) + | (((long) data[index + 2] & 0xff) << 16) + | (((long) data[index + 3] & 0xff) << 24) + | (((long) data[index + 4] & 0xff) << 32) + | (((long) data[index + 5] & 0xff) << 40) + | (((long) data[index + 6] & 0xff) << 48) + | (((long) data[index + 7] & 0xff) << 56); + } + + public static long hash64(final byte[] data, final int length, final int seed) { + long h = (seed & 0xffffffffL) ^ (length * M64); + final int nblocks = length >> 3; + + // body + for (int i = 0; i < nblocks; i++) { + final int index = (i << 3); + long k = getLittleEndianLong(data, index); + + k *= M64; + k ^= k >>> R64; + k *= M64; + + h ^= k; + h *= M64; + } + + final int index = (nblocks << 3); + switch (length - index) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case 7: + h ^= ((long) data[index + 6] & 0xff) << 48; + case 6: // CHECKSTYLE IGNORE THIS LINE: fall through + h ^= ((long) data[index + 5] & 0xff) << 40; + case 5: // CHECKSTYLE IGNORE THIS LINE: fall through + h ^= ((long) data[index + 4] & 0xff) << 32; + case 4: // CHECKSTYLE IGNORE THIS LINE: fall through + h ^= ((long) data[index + 3] & 0xff) << 24; + case 3: // CHECKSTYLE IGNORE THIS LINE: fall through + h ^= ((long) data[index + 2] & 0xff) << 16; + case 2: // CHECKSTYLE IGNORE THIS LINE: fall through + h ^= ((long) data[index + 1] & 0xff) << 8; + case 1: // CHECKSTYLE IGNORE THIS LINE: fall through + h ^= ((long) data[index] & 0xff); + h *= M64; + } + + h ^= h >>> R64; + h *= M64; + h ^= h >>> R64; + + return h; + } + + private void convertExplicitToRegister() { + assert this.type == HLL_DATA_EXPLICIT; + registers = new byte[HLL_REGISTERS_COUNT]; + for (Long value : hashSet) { + updateRegisters(value); + } + hashSet.clear(); + } + + private void updateRegisters(long hashValue) { + int idx; + // hash value less than zero means we get a unsigned long + // so need to transfer to BigInter to mod + if (hashValue < 0) { + BigInteger unint64HashValue = new BigInteger(Long.toUnsignedString(hashValue)); + unint64HashValue = 
unint64HashValue.mod(new BigInteger(Long.toUnsignedString(HLL_REGISTERS_COUNT))); + idx = unint64HashValue.intValue(); + } else { + idx = (int) (hashValue % HLL_REGISTERS_COUNT); + } + + hashValue >>>= HLL_COLUMN_PRECISION; + hashValue |= (1L << HLL_ZERO_COUNT_BITS); + byte firstOneBit = (byte) (getLongTailZeroNum(hashValue) + 1); + registers[idx] = registers[idx] > firstOneBit ? registers[idx] : firstOneBit; + } + + private void mergeRegisters(byte[] other) { + for (int i = 0; i < HLL_REGISTERS_COUNT; i++) { + this.registers[i] = this.registers[i] > other[i] ? this.registers[i] : other[i]; + } + } + + public void updateWithHash(Object value) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(value)); + update(hash64(v, v.length, SEED)); + } + + public void update(long hashValue) { + switch (this.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EMPTY: + hashSet.add(hashValue); + type = HLL_DATA_EXPLICIT; + break; + case HLL_DATA_EXPLICIT: + if (hashSet.size() < HLL_EXPLICIT_INT64_NUM) { + hashSet.add(hashValue); + break; + } + convertExplicitToRegister(); + type = HLL_DATA_FULL; + case HLL_DATA_SPARSE: // CHECKSTYLE IGNORE THIS LINE: fall through + case HLL_DATA_FULL: + updateRegisters(hashValue); + break; + } + } + + public void merge(Hll other) { + if (other.type == HLL_DATA_EMPTY) { + return; + } + switch (this.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EMPTY: + this.type = other.type; + switch (other.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EXPLICIT: + this.hashSet.addAll(other.hashSet); + break; + case HLL_DATA_SPARSE: + case HLL_DATA_FULL: + this.registers = new byte[HLL_REGISTERS_COUNT]; + System.arraycopy(other.registers, 0, this.registers, 0, HLL_REGISTERS_COUNT); + break; + } + break; + case HLL_DATA_EXPLICIT: + switch (other.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EXPLICIT: + this.hashSet.addAll(other.hashSet); + if (this.hashSet.size() > HLL_EXPLICIT_INT64_NUM) { + convertExplicitToRegister(); + this.type = HLL_DATA_FULL; + } + break; + case HLL_DATA_SPARSE: + case HLL_DATA_FULL: + convertExplicitToRegister(); + mergeRegisters(other.registers); + this.type = HLL_DATA_FULL; + break; + } + break; + case HLL_DATA_SPARSE: + case HLL_DATA_FULL: + switch (other.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EXPLICIT: + for (long value : other.hashSet) { + update(value); + } + break; + case HLL_DATA_SPARSE: + case HLL_DATA_FULL: + mergeRegisters(other.registers); + break; + } + break; + } + } + + public void serialize(DataOutput output) throws IOException { + switch (type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EMPTY: + output.writeByte(type); + break; + case HLL_DATA_EXPLICIT: + output.writeByte(type); + output.writeByte(hashSet.size()); + for (long value : hashSet) { + output.writeLong(Long.reverseBytes(value)); + } + break; + case HLL_DATA_SPARSE: + case HLL_DATA_FULL: + int nonZeroRegisterNum = 0; + for (int i = 0; i < HLL_REGISTERS_COUNT; i++) { + if (registers[i] != 0) { + nonZeroRegisterNum++; + } + } + if (nonZeroRegisterNum > HLL_SPARSE_THRESHOLD) { + output.writeByte(HLL_DATA_FULL); + for (byte value : registers) { + output.writeByte(value); + } + } else { + output.writeByte(HLL_DATA_SPARSE); + output.writeInt(Integer.reverseBytes(nonZeroRegisterNum)); + for (int i = 0; i < HLL_REGISTERS_COUNT; i++) { + if (registers[i] != 0) { + 
output.writeShort(Short.reverseBytes((short) i)); + output.writeByte(registers[i]); + } + } + } + break; + } + } + + public boolean deserialize(DataInput input) throws IOException { + assert type == HLL_DATA_EMPTY; + + if (input == null) { + return false; + } + + this.type = input.readByte(); + switch (this.type) { + case HLL_DATA_EMPTY: + break; + case HLL_DATA_EXPLICIT: + int hashSetSize = input.readUnsignedByte(); + for (int i = 0; i < hashSetSize; i++) { + update(Long.reverseBytes(input.readLong())); + } + assert this.type == HLL_DATA_EXPLICIT; + break; + case HLL_DATA_SPARSE: + int sparseDataSize = Integer.reverseBytes(input.readInt()); + this.registers = new byte[HLL_REGISTERS_COUNT]; + for (int i = 0; i < sparseDataSize; i++) { + int idx = Short.reverseBytes(input.readShort()); + byte value = input.readByte(); + registers[idx] = value; + } + break; + case HLL_DATA_FULL: + this.registers = new byte[HLL_REGISTERS_COUNT]; + for (int i = 0; i < HLL_REGISTERS_COUNT; i++) { + registers[i] = input.readByte(); + } + break; + default: + return false; + } + + return true; + } + + // use strictfp to force java follow IEEE 754 to deal float point strictly + public strictfp long estimateCardinality() { + if (type == HLL_DATA_EMPTY) { + return 0; + } + if (type == HLL_DATA_EXPLICIT) { + return hashSet.size(); + } + + int numStreams = HLL_REGISTERS_COUNT; + float alpha = 0; + + if (numStreams == 16) { + alpha = 0.673f; + } else if (numStreams == 32) { + alpha = 0.697f; + } else if (numStreams == 64) { + alpha = 0.709f; + } else { + alpha = 0.7213f / (1 + 1.079f / numStreams); + } + + float harmonicMean = 0; + int numZeroRegisters = 0; + + for (int i = 0; i < HLL_REGISTERS_COUNT; i++) { + harmonicMean += Math.pow(2.0f, -registers[i]); + + if (registers[i] == 0) { + numZeroRegisters++; + } + } + + harmonicMean = 1.0f / harmonicMean; + double estimate = alpha * numStreams * numStreams * harmonicMean; + + if (estimate <= numStreams * 2.5 && numZeroRegisters != 0) { + estimate = numStreams * Math.log(((float) numStreams) / ((float) numZeroRegisters)); + } else if (numStreams == 16384 && estimate < 72000) { + double bias = 5.9119 * 1.0e-18 * (estimate * estimate * estimate * estimate) + - 1.4253 * 1.0e-12 * (estimate * estimate * estimate) + + 1.2940 * 1.0e-7 * (estimate * estimate) + - 5.2921 * 1.0e-3 * estimate + + 83.3216; + estimate -= estimate * (bias / 100); + } + + return (long) (estimate + 0.5); + } + + public int maxSerializedSize() { + switch (type) { + case HLL_DATA_EMPTY: + default: + return 1; + case HLL_DATA_EXPLICIT: + return 2 + hashSet.size() * 8; + case HLL_DATA_SPARSE: + case HLL_DATA_FULL: + return 1 + HLL_REGISTERS_COUNT; + } + } + + // just for ut + public int getType() { + return type; + } + + // For convert to statistics used Hll128 + public byte[] getRegisters() { + return registers; + } + + // For convert to statistics used Hll128 + public Set getHashSet() { + return hashSet; + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java new file mode 100644 index 00000000..33237983 --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java @@ -0,0 +1,1432 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.common.io; + +import org.roaringbitmap.BitmapDataProvider; +import org.roaringbitmap.BitmapDataProviderSupplier; +import org.roaringbitmap.IntConsumer; +import org.roaringbitmap.IntIterator; +import org.roaringbitmap.InvalidRoaringFormat; +import org.roaringbitmap.RoaringBitmap; +import org.roaringbitmap.RoaringBitmapSupplier; +import org.roaringbitmap.Util; +import org.roaringbitmap.buffer.MutableRoaringBitmap; +import org.roaringbitmap.longlong.ImmutableLongBitmapDataProvider; +import org.roaringbitmap.longlong.LongConsumer; +import org.roaringbitmap.longlong.LongIterator; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.math.BigInteger; +import java.util.AbstractMap; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Iterator; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Objects; +import java.util.SortedMap; +import java.util.TreeMap; + +/** + * Copied from Apache Doris + */ +public class Roaring64Map { + + private static final boolean DEFAULT_ORDER_IS_SIGNED = false; + private static final boolean DEFAULT_CARDINALITIES_ARE_CACHED = true; + /** + * the constant 2^64 + */ + private static final BigInteger TWO_64 = BigInteger.ONE.shiftLeft(64); + // Not final to enable initialization in Externalizable.readObject + private NavigableMap highToBitmap; + // If true, we handle longs a plain java longs: -1 if right before 0 + // If false, we handle longs as unsigned longs: 0 has no predecessor and Long.MAX_VALUE + 1L is + // expressed as a + // negative long + private boolean signedLongs = false; + private BitmapDataProviderSupplier supplier; + // By default, we cache cardinalities + private transient boolean doCacheCardinalities = true; + // Prevent recomputing all cardinalities when requesting consecutive ranks + private transient int firstHighNotValid = highestHigh() + 1; + // This boolean needs firstHighNotValid == Integer.MAX_VALUE to be allowed to be true + // If false, it means nearly all cumulated cardinalities are valid, except high=Integer.MAX_VALUE + // If true, it means all cumulated cardinalities are valid, even high=Integer.MAX_VALUE + private transient boolean allValid = false; + // TODO: I would prefer not managing arrays myself + private transient long[] sortedCumulatedCardinality = new long[0]; + private transient int[] sortedHighs = new int[0]; + // We guess consecutive .addLong will be on proximate longs: we remember the bitmap attached to + // this bucket in order + // to skip the indirection + private transient Map.Entry latestAddedHigh = null; + + /** + * By default, we consider longs are unsigned longs: normal longs: 0 is the lowest possible long. + * Long.MAX_VALUE is followed by Long.MIN_VALUE. 
-1L is the highest possible value + */ + public Roaring64Map() { + this(DEFAULT_ORDER_IS_SIGNED); + } + + /** + * By default, use RoaringBitmap as underlyings {@link BitmapDataProvider} + * + * @param signedLongs true if longs has to be ordered as plain java longs. False to handle them as + * unsigned 64bits long (as RoaringBitmap with unsigned integers) + */ + public Roaring64Map(boolean signedLongs) { + this(signedLongs, DEFAULT_CARDINALITIES_ARE_CACHED); + } + + /** + * By default, use RoaringBitmap as underlyings {@link BitmapDataProvider} + * + * @param signedLongs true if longs has to be ordered as plain java longs. False to handle them as + * unsigned 64bits long (as RoaringBitmap with unsigned integers) + * @param cacheCardinalities true if cardinalities have to be cached. It will prevent many + * iteration along the NavigableMap + */ + public Roaring64Map(boolean signedLongs, boolean cacheCardinalities) { + this(signedLongs, cacheCardinalities, new RoaringBitmapSupplier()); + } + + /** + * By default, longs are managed as unsigned longs and cardinalities are cached. + * + * @param supplier provide the logic to instantiate new {@link BitmapDataProvider}, typically + * instantiated once per high. + */ + public Roaring64Map(BitmapDataProviderSupplier supplier) { + this(DEFAULT_ORDER_IS_SIGNED, DEFAULT_CARDINALITIES_ARE_CACHED, supplier); + } + + /** + * By default, we activating cardinalities caching. + * + * @param signedLongs true if longs has to be ordered as plain java longs. False to handle them as + * unsigned 64bits long (as RoaringBitmap with unsigned integers) + * @param supplier provide the logic to instantiate new {@link BitmapDataProvider}, typically + * instantiated once per high. + */ + public Roaring64Map(boolean signedLongs, BitmapDataProviderSupplier supplier) { + this(signedLongs, DEFAULT_CARDINALITIES_ARE_CACHED, supplier); + } + + /** + * @param signedLongs true if longs has to be ordered as plain java longs. False to handle them as + * unsigned 64bits long (as RoaringBitmap with unsigned integers) + * @param cacheCardinalities true if cardinalities have to be cached. It will prevent many + * iteration along the NavigableMap + * @param supplier provide the logic to instantiate new {@link BitmapDataProvider}, typically + * instantiated once per high. + */ + public Roaring64Map(boolean signedLongs, boolean cacheCardinalities, + BitmapDataProviderSupplier supplier) { + this.signedLongs = signedLongs; + this.supplier = supplier; + + if (signedLongs) { + highToBitmap = new TreeMap<>(); + } else { + highToBitmap = new TreeMap<>(unsignedComparator()); + } + + this.doCacheCardinalities = cacheCardinalities; + resetPerfHelpers(); + } + + // From Arrays.binarySearch (Comparator). Check with org.roaringbitmap.Util.unsignedBinarySearch + private static int unsignedBinarySearch(int[] a, int fromIndex, int toIndex, int key, + Comparator c) { + int low = fromIndex; + int high = toIndex - 1; + + while (low <= high) { + int mid = (low + high) >>> 1; + int midVal = a[mid]; + int cmp = c.compare(midVal, key); + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; // key found + } + } + return -(low + 1); // key not found. + } + + /** + * Generate a bitmap with the specified values set to true. The provided longs values don't have + * to be in sorted order, but it may be preferable to sort them from a performance point of view. + * + * @param dat set values + * @return a new bitmap + */ + public static Roaring64Map bitmapOf(final long... 
dat) { + final Roaring64Map ans = new Roaring64Map(); + ans.add(dat); + return ans; + } + + /** + * @param id any long, positive or negative + * @return an int holding the 32 highest order bits of information of the input long + */ + public static int high(long id) { + return (int) (id >> 32); + } + + /** + * @param id any long, positive or negative + * @return an int holding the 32 lowest order bits of information of the input long + */ + public static int low(long id) { + return (int) id; + } + + /** + * @param high an integer representing the highest order bits of the output long + * @param low an integer representing the lowest order bits of the output long + * @return a long packing together the integers as computed by + * {@link #high(long)} and {@link #low(long)} + */ + // https://stackoverflow.com/questions/12772939/java-storing-two-ints-in-a-long + public static long pack(int high, int low) { + return (((long) high) << 32) | (low & 0xffffffffL); + } + + /** + * @param signedLongs true if long put in a {@link Roaring64Map} should be considered as + * signed long. + * @return the int representing the highest value which can be set as high value in a + */ + public static int highestHigh(boolean signedLongs) { + if (signedLongs) { + return Integer.MAX_VALUE; + } else { + return -1; + } + } + + /** + * @return A comparator for unsigned longs: a negative long is a long greater than Long.MAX_VALUE + */ + public static Comparator unsignedComparator() { + return new Comparator() { + + @Override + public int compare(Integer o1, Integer o2) { + return compareUnsigned(o1, o2); + } + }; + } + + /** + * Compares two {@code int} values numerically treating the values as unsigned. + * + * @param x the first {@code int} to compare + * @param y the second {@code int} to compare + * @return the value {@code 0} if {@code x == y}; a value less than {@code 0} if {@code x < y} as + * unsigned values; and a value greater than {@code 0} if {@code x > y} as unsigned values + * @since 1.8 + */ + // Duplicated from jdk8 Integer.compareUnsigned + public static int compareUnsigned(int x, int y) { + return Integer.compare(x + Integer.MIN_VALUE, y + Integer.MIN_VALUE); + } + + /** + * JDK8 Long.toUnsignedString was too complex to backport. Go for a slow version relying on + * BigInteger + */ + // https://stackoverflow.com/questions/7031198/java-signed-long-to-unsigned-long-string + static String toUnsignedString(long l) { + BigInteger b = BigInteger.valueOf(l); + if (b.signum() < 0) { + b = b.add(TWO_64); + } + return b.toString(); + } + + private void resetPerfHelpers() { + firstHighNotValid = highestHigh(signedLongs) + 1; + allValid = false; + + sortedCumulatedCardinality = new long[0]; + sortedHighs = new int[0]; + + latestAddedHigh = null; + } + + // Package-friendly: for the sake of unit-testing + // @VisibleForTesting + NavigableMap getHighToBitmap() { + return highToBitmap; + } + + // Package-friendly: for the sake of unit-testing + // @VisibleForTesting + int getLowestInvalidHigh() { + return firstHighNotValid; + } + + // Package-friendly: for the sake of unit-testing + // @VisibleForTesting + long[] getSortedCumulatedCardinality() { + return sortedCumulatedCardinality; + } + + /** + * Add the value to the container (set the value to "true"), whether it already appears or not. + *

+ * Java lacks native unsigned longs but the x argument is considered to be unsigned. Within + * bitmaps, numbers are ordered according to {@link Long#compareUnsigned}. We order the numbers + * like 0, 1, ..., 9223372036854775807, -9223372036854775808, -9223372036854775807,..., -1. + * + * @param x long value + */ + public void addLong(long x) { + int high = high(x); + int low = low(x); + + // Copy the reference to prevent race-condition + Map.Entry local = latestAddedHigh; + + BitmapDataProvider bitmap; + if (local != null && local.getKey().intValue() == high) { + bitmap = local.getValue(); + } else { + bitmap = highToBitmap.get(high); + if (bitmap == null) { + bitmap = newRoaringBitmap(); + pushBitmapForHigh(high, bitmap); + } + latestAddedHigh = new AbstractMap.SimpleImmutableEntry<>(high, bitmap); + } + bitmap.add(low); + + invalidateAboveHigh(high); + } + + /** + * Add the integer value to the container (set the value to "true"), whether it already appears or + * not. + *

+ * Javac lacks native unsigned integers but the x argument is considered to be unsigned. Within + * bitmaps, numbers are ordered according to {@link Integer#compareUnsigned}. We order the numbers + * like 0, 1, ..., 2147483647, -2147483648, -2147483647,..., -1. + * + * @param x integer value + */ + public void addInt(int x) { + addLong(Util.toUnsignedLong(x)); + } + + private BitmapDataProvider newRoaringBitmap() { + return supplier.newEmpty(); + } + + private void invalidateAboveHigh(int high) { + // The cardinalities after this bucket may not be valid anymore + if (compare(firstHighNotValid, high) > 0) { + // High was valid up to now + firstHighNotValid = high; + + int indexNotValid = binarySearch(sortedHighs, firstHighNotValid); + + final int indexAfterWhichToReset; + if (indexNotValid >= 0) { + indexAfterWhichToReset = indexNotValid; + } else { + // We have invalidate a high not already present: added a value for a brand new high + indexAfterWhichToReset = -indexNotValid - 1; + } + + // This way, sortedHighs remains sorted, without making a new/shorter array + Arrays.fill(sortedHighs, indexAfterWhichToReset, sortedHighs.length, highestHigh()); + } + allValid = false; + } + + private int compare(int x, int y) { + if (signedLongs) { + return Integer.compare(x, y); + } else { + return compareUnsigned(x, y); + } + } + + private void pushBitmapForHigh(int high, BitmapDataProvider bitmap) { + // TODO .size is too slow + // int nbHighBefore = highToBitmap.headMap(high).size(); + + BitmapDataProvider previous = highToBitmap.put(high, bitmap); + assert previous == null : "Should push only not-existing high"; + } + + /** + * Returns the number of distinct integers added to the bitmap (e.g., number of bits set). + * + * @return the cardinality + */ + public long getLongCardinality() { + if (doCacheCardinalities) { + if (highToBitmap.isEmpty()) { + return 0L; + } + int indexOk = ensureCumulatives(highestHigh()); + + // ensureCumulatives may have removed empty bitmaps + if (highToBitmap.isEmpty()) { + return 0L; + } + + + return sortedCumulatedCardinality[indexOk - 1]; + } else { + long cardinality = 0L; + for (BitmapDataProvider bitmap : highToBitmap.values()) { + cardinality += bitmap.getLongCardinality(); + } + return cardinality; + } + } + + /** + * @return the cardinality as an int + * @throws UnsupportedOperationException if the cardinality does not fit in an int + */ + public int getIntCardinality() throws UnsupportedOperationException { + long cardinality = getLongCardinality(); + + if (cardinality > Integer.MAX_VALUE) { + // TODO: we should handle cardinality fitting in an unsigned int + throw new UnsupportedOperationException( + "Can not call .getIntCardinality as the cardinality is bigger than Integer.MAX_VALUE"); + } + + return (int) cardinality; + } + + /** + * Return the jth value stored in this bitmap. 
+ * + * @param j index of the value + * @return the value + * @throws IllegalArgumentException if j is out of the bounds of the bitmap cardinality + */ + public long select(final long j) throws IllegalArgumentException { + if (!doCacheCardinalities) { + return selectNoCache(j); + } + + // Ensure all cumulatives as we we have straightforward way to know in advance the high of the + // j-th value + int indexOk = ensureCumulatives(highestHigh()); + + if (highToBitmap.isEmpty()) { + return throwSelectInvalidIndex(j); + } + + // Use normal binarySearch as cardinality does not depends on considering longs signed or + // unsigned + // We need sortedCumulatedCardinality not to contain duplicated, else binarySearch may return + // any of the duplicates: we need to ensure it holds no high associated to an empty bitmap + int position = Arrays.binarySearch(sortedCumulatedCardinality, 0, indexOk, j); + + if (position >= 0) { + if (position == indexOk - 1) { + // .select has been called on this.getCardinality + return throwSelectInvalidIndex(j); + } + + // There is a bucket leading to this cardinality: the j-th element is the first element of + // next bucket + int high = sortedHighs[position + 1]; + BitmapDataProvider nextBitmap = highToBitmap.get(high); + return pack(high, nextBitmap.select(0)); + } else { + // There is no bucket with this cardinality + int insertionPoint = -position - 1; + + final long previousBucketCardinality; + if (insertionPoint == 0) { + previousBucketCardinality = 0L; + } else if (insertionPoint >= indexOk) { + return throwSelectInvalidIndex(j); + } else { + previousBucketCardinality = sortedCumulatedCardinality[insertionPoint - 1]; + } + + // We get a 'select' query for a single bitmap: should fit in an int + final int givenBitmapSelect = (int) (j - previousBucketCardinality); + + int high = sortedHighs[insertionPoint]; + BitmapDataProvider lowBitmap = highToBitmap.get(high); + int low = lowBitmap.select(givenBitmapSelect); + + return pack(high, low); + } + } + + // For benchmarks: compute without using cardinalities cache + // https://github.com/RoaringBitmap/CRoaring/blob/master/cpp/roaring64map.hh + private long selectNoCache(long j) { + long left = j; + + for (Map.Entry entry : highToBitmap.entrySet()) { + long lowCardinality = entry.getValue().getCardinality(); + + if (left >= lowCardinality) { + left -= lowCardinality; + } else { + // It is legit for left to be negative + int leftAsUnsignedInt = (int) left; + return pack(entry.getKey(), entry.getValue().select(leftAsUnsignedInt)); + } + } + + return throwSelectInvalidIndex(j); + } + + private long throwSelectInvalidIndex(long j) { + // see org.roaringbitmap.buffer.ImmutableRoaringBitmap.select(int) + throw new IllegalArgumentException( + "select " + j + " when the cardinality is " + this.getLongCardinality()); + } + + /** + * For better performance, consider the Use the {@link #forEach forEach} method. + * + * @return a custom iterator over set bits, the bits are traversed in ascending sorted order + */ + public Iterator iterator() { + final LongIterator it = getLongIterator(); + + return new Iterator() { + + @Override + public boolean hasNext() { + return it.hasNext(); + } + + @Override + public Long next() { + return it.next(); + } + + @Override + public void remove() { + // TODO? 
+ throw new UnsupportedOperationException(); + } + }; + } + + public void forEach(final LongConsumer lc) { + for (final Map.Entry highEntry : highToBitmap.entrySet()) { + highEntry.getValue().forEach(new IntConsumer() { + + @Override + public void accept(int low) { + lc.accept(pack(highEntry.getKey(), low)); + } + }); + } + } + + public long rankLong(long id) { + int high = high(id); + int low = low(id); + + if (!doCacheCardinalities) { + return rankLongNoCache(high, low); + } + + int indexOk = ensureCumulatives(high); + + int highPosition = binarySearch(sortedHighs, 0, indexOk, high); + + if (highPosition >= 0) { + // There is a bucket holding this item + + final long previousBucketCardinality; + if (highPosition == 0) { + previousBucketCardinality = 0; + } else { + previousBucketCardinality = sortedCumulatedCardinality[highPosition - 1]; + } + + BitmapDataProvider lowBitmap = highToBitmap.get(sortedHighs[highPosition]); + + // Rank is previous cardinality plus rank in current bitmap + return previousBucketCardinality + lowBitmap.rankLong(low); + } else { + // There is no bucket holding this item: insertionPoint is previous bitmap + int insertionPoint = -highPosition - 1; + + if (insertionPoint == 0) { + // this key is before all inserted keys + return 0; + } else { + // The rank is the cardinality of this previous bitmap + return sortedCumulatedCardinality[insertionPoint - 1]; + } + } + } + + // https://github.com/RoaringBitmap/CRoaring/blob/master/cpp/roaring64map.hh + private long rankLongNoCache(int high, int low) { + long result = 0L; + + BitmapDataProvider lastBitmap = highToBitmap.get(high); + if (lastBitmap == null) { + // There is no value with same high: the rank is a sum of cardinalities + for (Map.Entry bitmap : highToBitmap.entrySet()) { + if (bitmap.getKey().intValue() > high) { + break; + } else { + result += bitmap.getValue().getLongCardinality(); + } + } + } else { + for (BitmapDataProvider bitmap : highToBitmap.values()) { + if (bitmap == lastBitmap) { + result += bitmap.rankLong(low); + break; + } else { + result += bitmap.getLongCardinality(); + } + } + } + + return result; + } + + /** + * @param high for which high bucket should we compute the cardinality + * @return the highest validatedIndex + */ + protected int ensureCumulatives(int high) { + if (allValid) { + // the whole array is valid (up-to its actual length, not its capacity) + return highToBitmap.size(); + } else if (compare(high, firstHighNotValid) < 0) { + // The high is strictly below the first not valid: it is valid + + // sortedHighs may have only a subset of valid values on the right. 
However, these invalid + // values have been set to maxValue, and we are here as high < firstHighNotValid ==> high < + // maxHigh() + int position = binarySearch(sortedHighs, high); + + if (position >= 0) { + // This high has a bitmap: +1 as this index will be used as right (excluded) bound in a + // binary-search + return position + 1; + } else { + // This high has no bitmap: it could be between 2 highs with bitmaps + int insertionPosition = -position - 1; + return insertionPosition; + } + } else { + + // For each deprecated buckets + SortedMap tailMap = + highToBitmap.tailMap(firstHighNotValid, true); + + // TODO .size on tailMap make an iterator: arg + int indexOk = highToBitmap.size() - tailMap.size(); + + // TODO: It should be possible to compute indexOk based on sortedHighs array + // assert indexOk == binarySearch(sortedHighs, firstHighNotValid); + + Iterator> it = tailMap.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry e = it.next(); + int currentHigh = e.getKey(); + + if (compare(currentHigh, high) > 0) { + // No need to compute more than needed + break; + } else if (e.getValue().isEmpty()) { + // highToBitmap can not be modified as we iterate over it + if (latestAddedHigh != null && latestAddedHigh.getKey().intValue() == currentHigh) { + // Dismiss the cached bitmap as it is removed from the NavigableMap + latestAddedHigh = null; + } + it.remove(); + } else { + ensureOne(e, currentHigh, indexOk); + + // We have added one valid cardinality + indexOk++; + } + + } + + if (highToBitmap.isEmpty() || indexOk == highToBitmap.size()) { + // We have compute all cardinalities + allValid = true; + } + + return indexOk; + } + } + + private int binarySearch(int[] array, int key) { + if (signedLongs) { + return Arrays.binarySearch(array, key); + } else { + return unsignedBinarySearch(array, 0, array.length, key, + unsignedComparator()); + } + } + + private int binarySearch(int[] array, int from, int to, int key) { + if (signedLongs) { + return Arrays.binarySearch(array, from, to, key); + } else { + return unsignedBinarySearch(array, from, to, key, unsignedComparator()); + } + } + + private void ensureOne(Map.Entry e, int currentHigh, int indexOk) { + // sortedHighs are valid only up to some index + assert indexOk <= sortedHighs.length : indexOk + " is bigger than " + sortedHighs.length; + + final int index; + if (indexOk == 0) { + if (sortedHighs.length == 0) { + index = -1; + // } else if (sortedHighs[0] == currentHigh) { + // index = 0; + } else { + index = -1; + } + } else if (indexOk < sortedHighs.length) { + index = -indexOk - 1; + } else { + index = -sortedHighs.length - 1; + } + assert index == binarySearch(sortedHighs, 0, indexOk, currentHigh) : "Computed " + index + + " differs from dummy binary-search index: " + + binarySearch(sortedHighs, 0, indexOk, currentHigh); + + if (index >= 0) { + // This would mean calling .ensureOne is useless: should never got here at the first time + throw new IllegalStateException("Unexpectedly found " + currentHigh + " in " + + Arrays.toString(sortedHighs) + " strictly before index" + indexOk); + } else { + int insertionPosition = -index - 1; + + // This is a new key + if (insertionPosition >= sortedHighs.length) { + int previousSize = sortedHighs.length; + + // TODO softer growing factor + int newSize = Math.min(Integer.MAX_VALUE, sortedHighs.length * 2 + 1); + + // Insertion at the end + sortedHighs = Arrays.copyOf(sortedHighs, newSize); + sortedCumulatedCardinality = Arrays.copyOf(sortedCumulatedCardinality, newSize); + + // Not actually 
needed. But simplify the reading of array content + Arrays.fill(sortedHighs, previousSize, sortedHighs.length, highestHigh()); + Arrays.fill(sortedCumulatedCardinality, previousSize, sortedHighs.length, Long.MAX_VALUE); + } + sortedHighs[insertionPosition] = currentHigh; + + final long previousCardinality; + if (insertionPosition >= 1) { + previousCardinality = sortedCumulatedCardinality[insertionPosition - 1]; + } else { + previousCardinality = 0; + } + + sortedCumulatedCardinality[insertionPosition] = + previousCardinality + e.getValue().getLongCardinality(); + + if (currentHigh == highestHigh()) { + // We are already on the highest high. Do not set allValid as it is set anyway out of the + // loop + firstHighNotValid = currentHigh; + } else { + // The first not valid is the next high + // TODO: The entry comes from a NavigableMap: it may be quite cheap to know the next high + firstHighNotValid = currentHigh + 1; + } + } + } + + private int highestHigh() { + return highestHigh(signedLongs); + } + + /** + * In-place bitwise OR (union) operation. The current bitmap is modified. + * + * @param x2 other bitmap + */ + public void or(final Roaring64Map x2) { + boolean firstBucket = true; + + for (Map.Entry e2 : x2.highToBitmap.entrySet()) { + // Keep object to prevent auto-boxing + Integer high = e2.getKey(); + + BitmapDataProvider lowBitmap1 = this.highToBitmap.get(high); + + BitmapDataProvider lowBitmap2 = e2.getValue(); + + // TODO Reviewers: is it a good idea to rely on BitmapDataProvider except in methods + // expecting an actual MutableRoaringBitmap? + // TODO This code may lead to closing a buffer Bitmap in current Navigable even if current is + // not on buffer + if ((lowBitmap1 == null || lowBitmap1 instanceof RoaringBitmap) + && lowBitmap2 instanceof RoaringBitmap) { + if (lowBitmap1 == null) { + // Clone to prevent future modification of this modifying the input Bitmap + RoaringBitmap lowBitmap2Clone = ((RoaringBitmap) lowBitmap2).clone(); + + pushBitmapForHigh(high, lowBitmap2Clone); + } else { + ((RoaringBitmap) lowBitmap1).or((RoaringBitmap) lowBitmap2); + } + } else if ((lowBitmap1 == null || lowBitmap1 instanceof MutableRoaringBitmap) + && lowBitmap2 instanceof MutableRoaringBitmap) { + if (lowBitmap1 == null) { + // Clone to prevent future modification of this modifying the input Bitmap + BitmapDataProvider lowBitmap2Clone = ((MutableRoaringBitmap) lowBitmap2).clone(); + pushBitmapForHigh(high, lowBitmap2Clone); + } else { + ((MutableRoaringBitmap) lowBitmap1).or((MutableRoaringBitmap) lowBitmap2); + } + } else { + throw new UnsupportedOperationException( + ".or is not between " + this.getClass() + " and " + lowBitmap2.getClass()); + } + + if (firstBucket) { + firstBucket = false; + + // Invalidate the lowest high as lowest not valid + firstHighNotValid = Math.min(firstHighNotValid, high); + allValid = false; + } + } + } + + /** + * In-place bitwise XOR (symmetric difference) operation. The current bitmap is modified. + * + * @param x2 other bitmap + */ + public void xor(final Roaring64Map x2) { + boolean firstBucket = true; + + for (Map.Entry e2 : x2.highToBitmap.entrySet()) { + // Keep object to prevent auto-boxing + Integer high = e2.getKey(); + + BitmapDataProvider lowBitmap1 = this.highToBitmap.get(high); + + BitmapDataProvider lowBitmap2 = e2.getValue(); + + // TODO Reviewers: is it a good idea to rely on BitmapDataProvider except in methods + // expecting an actual MutableRoaringBitmap? 
+ // TODO This code may lead to closing a buffer Bitmap in current Navigable even if current is + // not on buffer + if ((lowBitmap1 == null || lowBitmap1 instanceof RoaringBitmap) + && lowBitmap2 instanceof RoaringBitmap) { + if (lowBitmap1 == null) { + // Clone to prevent future modification of this modifying the input Bitmap + RoaringBitmap lowBitmap2Clone = ((RoaringBitmap) lowBitmap2).clone(); + + pushBitmapForHigh(high, lowBitmap2Clone); + } else { + ((RoaringBitmap) lowBitmap1).xor((RoaringBitmap) lowBitmap2); + } + } else if ((lowBitmap1 == null || lowBitmap1 instanceof MutableRoaringBitmap) + && lowBitmap2 instanceof MutableRoaringBitmap) { + if (lowBitmap1 == null) { + // Clone to prevent future modification of this modifying the input Bitmap + BitmapDataProvider lowBitmap2Clone = ((MutableRoaringBitmap) lowBitmap2).clone(); + + pushBitmapForHigh(high, lowBitmap2Clone); + } else { + ((MutableRoaringBitmap) lowBitmap1).xor((MutableRoaringBitmap) lowBitmap2); + } + } else { + throw new UnsupportedOperationException( + ".or is not between " + this.getClass() + " and " + lowBitmap2.getClass()); + } + + if (firstBucket) { + firstBucket = false; + + // Invalidate the lowest high as lowest not valid + firstHighNotValid = Math.min(firstHighNotValid, high); + allValid = false; + } + } + } + + /** + * In-place bitwise AND (intersection) operation. The current bitmap is modified. + * + * @param x2 other bitmap + */ + public void and(final Roaring64Map x2) { + boolean firstBucket = true; + + Iterator> thisIterator = highToBitmap.entrySet().iterator(); + while (thisIterator.hasNext()) { + Map.Entry e1 = thisIterator.next(); + + // Keep object to prevent auto-boxing + Integer high = e1.getKey(); + + BitmapDataProvider lowBitmap2 = x2.highToBitmap.get(high); + + if (lowBitmap2 == null) { + // None of given high values are present in x2 + thisIterator.remove(); + } else { + BitmapDataProvider lowBitmap1 = e1.getValue(); + + if (lowBitmap2 instanceof RoaringBitmap && lowBitmap1 instanceof RoaringBitmap) { + ((RoaringBitmap) lowBitmap1).and((RoaringBitmap) lowBitmap2); + } else if (lowBitmap2 instanceof MutableRoaringBitmap + && lowBitmap1 instanceof MutableRoaringBitmap) { + ((MutableRoaringBitmap) lowBitmap1).and((MutableRoaringBitmap) lowBitmap2); + } else { + throw new UnsupportedOperationException( + ".and is not between " + this.getClass() + " and " + lowBitmap1.getClass()); + } + } + + if (firstBucket) { + firstBucket = false; + + // Invalidate the lowest high as lowest not valid + firstHighNotValid = Math.min(firstHighNotValid, high); + allValid = false; + } + } + } + + /** + * In-place bitwise ANDNOT (difference) operation. The current bitmap is modified. 
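A small sketch of how the in-place set operations above compose (illustration only; both operands must be backed by the same low-level bitmap type, RoaringBitmap or MutableRoaringBitmap, otherwise an UnsupportedOperationException is thrown):

Roaring64Map a = new Roaring64Map();        // assumed constructor
a.add(1L, 2L, 3L);                          // varargs add(long...)
Roaring64Map b = new Roaring64Map();
b.addLong(3L);
b.addLong(4L);

a.or(b);        // a = {1, 2, 3, 4}
a.andNot(b);    // a = {1, 2}
a.xor(b);       // a = {1, 2, 3, 4}
a.and(b);       // a = {3, 4}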
+ * + * @param x2 other bitmap + */ + public void andNot(final Roaring64Map x2) { + boolean firstBucket = true; + + Iterator> thisIterator = highToBitmap.entrySet().iterator(); + while (thisIterator.hasNext()) { + Map.Entry e1 = thisIterator.next(); + + // Keep object to prevent auto-boxing + Integer high = e1.getKey(); + + BitmapDataProvider lowBitmap2 = x2.highToBitmap.get(high); + + if (lowBitmap2 != null) { + BitmapDataProvider lowBitmap1 = e1.getValue(); + + if (lowBitmap2 instanceof RoaringBitmap && lowBitmap1 instanceof RoaringBitmap) { + ((RoaringBitmap) lowBitmap1).andNot((RoaringBitmap) lowBitmap2); + } else if (lowBitmap2 instanceof MutableRoaringBitmap + && lowBitmap1 instanceof MutableRoaringBitmap) { + ((MutableRoaringBitmap) lowBitmap1).andNot((MutableRoaringBitmap) lowBitmap2); + } else { + throw new UnsupportedOperationException( + ".and is not between " + this.getClass() + " and " + lowBitmap1.getClass()); + } + } + + if (firstBucket) { + firstBucket = false; + + // Invalidate the lowest high as lowest not valid + firstHighNotValid = Math.min(firstHighNotValid, high); + allValid = false; + } + } + } + + /** + * A string describing the bitmap. + * + * @return the string + */ + @Override + public String toString() { + final StringBuilder answer = new StringBuilder(); + final LongIterator i = this.getLongIterator(); + answer.append("{"); + if (i.hasNext()) { + if (signedLongs) { + answer.append(i.next()); + } else { + answer.append(toUnsignedString(i.next())); + } + } + while (i.hasNext()) { + answer.append(","); + // to avoid using too much memory, we limit the size + if (answer.length() > 0x80000) { + answer.append("..."); + break; + } + if (signedLongs) { + answer.append(i.next()); + } else { + answer.append(toUnsignedString(i.next())); + } + + } + answer.append("}"); + return answer.toString(); + } + + /** + * For better performance, consider the Use the {@link #forEach forEach} method. + * + * @return a custom iterator over set bits, the bits are traversed in ascending sorted order + */ + public LongIterator getLongIterator() { + final Iterator> it = highToBitmap.entrySet().iterator(); + + return toIterator(it, false); + } + + protected LongIterator toIterator(final Iterator> it, + final boolean reversed) { + return new LongIterator() { + + protected int currentKey; + protected IntIterator currentIt; + + @Override + public boolean hasNext() { + if (currentIt == null) { + // Were initially empty + if (!moveToNextEntry(it)) { + return false; + } + } + + while (true) { + if (currentIt.hasNext()) { + return true; + } else { + if (!moveToNextEntry(it)) { + return false; + } + } + } + } + + /** + * + * @param it the underlying iterator which has to be moved to next long + * @return true if we MAY have more entries. 
false if there is definitely nothing more + */ + private boolean moveToNextEntry(Iterator> it) { + if (it.hasNext()) { + Map.Entry next = it.next(); + currentKey = next.getKey(); + if (reversed) { + currentIt = next.getValue().getReverseIntIterator(); + } else { + currentIt = next.getValue().getIntIterator(); + } + + // We may have more long + return true; + } else { + // We know there is nothing more + return false; + } + } + + @Override + public long next() { + if (hasNext()) { + return pack(currentKey, currentIt.next()); + } else { + throw new IllegalStateException("empty"); + } + } + + @Override + public LongIterator clone() { + throw new UnsupportedOperationException("TODO"); + } + }; + } + + public boolean contains(long x) { + int high = high(x); + BitmapDataProvider lowBitmap = highToBitmap.get(high); + if (lowBitmap == null) { + return false; + } + + int low = low(x); + return lowBitmap.contains(low); + } + + public int getSizeInBytes() { + return (int) getLongSizeInBytes(); + } + + public long getLongSizeInBytes() { + long size = 8; + + // Size of containers + size += highToBitmap.values().stream().mapToLong(p -> p.getLongSizeInBytes()).sum(); + + // Size of Map data-structure: we consider each TreeMap entry costs 40 bytes + // http://java-performance.info/memory-consumption-of-java-data-types-2/ + size += 8 + 40 * highToBitmap.size(); + + // Size of (boxed) Integers used as keys + size += 16 * highToBitmap.size(); + + // The cache impacts the size in heap + size += 8 * sortedCumulatedCardinality.length; + size += 4 * sortedHighs.length; + + return size; + } + + public boolean isEmpty() { + return getLongCardinality() == 0L; + } + + public ImmutableLongBitmapDataProvider limit(long x) { + throw new UnsupportedOperationException("TODO"); + } + + /** + * Use a run-length encoding where it is estimated as more space efficient + * + * @return whether a change was applied + */ + public boolean runOptimize() { + boolean hasChanged = false; + for (BitmapDataProvider lowBitmap : highToBitmap.values()) { + if (lowBitmap instanceof RoaringBitmap) { + hasChanged |= ((RoaringBitmap) lowBitmap).runOptimize(); + } else if (lowBitmap instanceof MutableRoaringBitmap) { + hasChanged |= ((MutableRoaringBitmap) lowBitmap).runOptimize(); + } + } + return hasChanged; + } + + public long serializedSizeInBytes() { + long nbBytes = 0L; + + // .writeBoolean for signedLongs boolean + nbBytes += 1; + + // .writeInt for number of different high values + nbBytes += 4; + + for (Map.Entry entry : highToBitmap.entrySet()) { + // .writeInt for high + nbBytes += 4; + + // The low bitmap size in bytes + nbBytes += entry.getValue().serializedSizeInBytes(); + } + + return nbBytes; + } + + /** + * reset to an empty bitmap; result occupies as much space a newly created bitmap. + */ + public void clear() { + this.highToBitmap.clear(); + resetPerfHelpers(); + } + + /** + * Return the set values as an array, if the cardinality is smaller than 2147483648. The long + * values are in sorted order. + * + * @return array representing the set values. 
+ */ + public long[] toArray() { + long cardinality = this.getLongCardinality(); + if (cardinality > Integer.MAX_VALUE) { + throw new IllegalStateException("The cardinality does not fit in an array"); + } + + final long[] array = new long[(int) cardinality]; + + int pos = 0; + LongIterator it = getLongIterator(); + + while (it.hasNext()) { + array[pos++] = it.next(); + } + return array; + } + + /* ------------------ method below from Roaring64NavigableMap and being overwritten ----------------------------- */ + + /** + * Set all the specified values to true. This can be expected to be slightly faster than calling + * "add" repeatedly. The provided integers values don't have to be in sorted order, but it may be + * preferable to sort them from a performance point of view. + * + * @param dat set values + */ + public void add(long... dat) { + for (long oneLong : dat) { + addLong(oneLong); + } + } + + /** + * Add to the current bitmap all longs in [rangeStart,rangeEnd). + * + * @param rangeStart inclusive beginning of range + * @param rangeEnd exclusive ending of range + */ + public void add(final long rangeStart, final long rangeEnd) { + int startHigh = high(rangeStart); + int startLow = low(rangeStart); + + int endHigh = high(rangeEnd); + int endLow = low(rangeEnd); + + for (int high = startHigh; high <= endHigh; high++) { + final int currentStartLow; + if (startHigh == high) { + // The whole range starts in this bucket + currentStartLow = startLow; + } else { + // Add the bucket from the beginning + currentStartLow = 0; + } + + long startLowAsLong = Util.toUnsignedLong(currentStartLow); + + final long endLowAsLong; + if (endHigh == high) { + // The whole range ends in this bucket + endLowAsLong = Util.toUnsignedLong(endLow); + } else { + // Add the bucket until the end: we have a +1 as, in RoaringBitmap.add(long,long), the end + // is excluded + endLowAsLong = Util.toUnsignedLong(-1) + 1; + } + + if (endLowAsLong > startLowAsLong) { + // Initialize the bitmap only if there is access data to write + BitmapDataProvider bitmap = highToBitmap.get(high); + if (bitmap == null) { + bitmap = new MutableRoaringBitmap(); + pushBitmapForHigh(high, bitmap); + } + + if (bitmap instanceof RoaringBitmap) { + ((RoaringBitmap) bitmap).add(startLowAsLong, endLowAsLong); + } else if (bitmap instanceof MutableRoaringBitmap) { + ((MutableRoaringBitmap) bitmap).add(startLowAsLong, endLowAsLong); + } else { + throw new UnsupportedOperationException("TODO. 
Not for " + bitmap.getClass()); + } + } + } + + invalidateAboveHigh(startHigh); + } + + + + /*---------------------------- method below is new written for doris's own bitmap --------------------------------*/ + + public LongIterator getReverseLongIterator() { + return toIterator(highToBitmap.descendingMap().entrySet().iterator(), true); + } + + /*--------------- method below fetched from org.roaringbitmap.longlong RoaringIntPacking -----------------------*/ + + public void removeLong(long x) { + int high = high(x); + + BitmapDataProvider bitmap = highToBitmap.get(high); + + if (bitmap != null) { + int low = low(x); + bitmap.remove(low); + + // Invalidate only if actually modified + invalidateAboveHigh(high); + } + + } + + public void trim() { + for (BitmapDataProvider bitmap : highToBitmap.values()) { + bitmap.trim(); + } + } + + @Override + public int hashCode() { + return highToBitmap.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + Roaring64Map other = (Roaring64Map) obj; + return Objects.equals(highToBitmap, other.highToBitmap); + } + + /** + * Add the value if it is not already present, otherwise remove it. + * + * @param x long value + */ + public void flip(final long x) { + int high = high(x); + BitmapDataProvider lowBitmap = highToBitmap.get(high); + if (lowBitmap == null) { + // The value is not added: add it without any flip specific code + addLong(x); + } else { + int low = low(x); + + // .flip is not in BitmapDataProvider contract + // TODO Is it relevant to calling .flip with a cast? + if (lowBitmap instanceof RoaringBitmap) { + ((RoaringBitmap) lowBitmap).flip(low); + } else if (lowBitmap instanceof MutableRoaringBitmap) { + ((MutableRoaringBitmap) lowBitmap).flip(low); + } else { + // Fallback to a manual flip + if (lowBitmap.contains(low)) { + lowBitmap.remove(low); + } else { + lowBitmap.add(low); + } + } + } + + invalidateAboveHigh(high); + } + + /** + * Serialize this bitmap. + *
+ * Unlike RoaringBitmap, there is no specification for now: it may change from one Java version + * to another, and from one RoaringBitmap version to another. + *
+ * Consider calling {@link #runOptimize} before serialization to improve compression. + *
+ * The current bitmap is not modified. + * + * @param out the DataOutput stream + * @throws IOException Signals that an I/O exception has occurred. + */ + public void serialize(DataOutput out) throws IOException { + if (highToBitmap.size() == 0) { + return; + } + if (is32BitsEnough()) { + out.write(BitmapValue.BITMAP32); + highToBitmap.get(0).serialize(out); + return; + } + + out.write(BitmapValue.BITMAP64); + Codec.encodeVarint64(highToBitmap.size(), out); + + for (Map.Entry entry : highToBitmap.entrySet()) { + // serialized in little end for BE cpp read in case of bugs when the value is larger than 32bits + out.writeInt(Integer.reverseBytes(entry.getKey().intValue())); + entry.getValue().serialize(out); + } + } + + /** + * Deserialize (retrieve) this bitmap. + *
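A round-trip sketch of the format written by serialize (illustration only, using plain java.io streams): the method emits a type byte followed by the payload, and deserialize expects the caller to have consumed that type byte already. An empty bitmap writes nothing, so the sketch assumes bitmap is non-empty.

java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream();
bitmap.serialize(new java.io.DataOutputStream(bos));

java.io.DataInputStream in = new java.io.DataInputStream(new java.io.ByteArrayInputStream(bos.toByteArray()));
int bitmapType = in.readUnsignedByte();     // BitmapValue.BITMAP32 or BitmapValue.BITMAP64
Roaring64Map copy = new Roaring64Map();     // assumed constructor
copy.deserialize(in, bitmapType);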
+ * Unlike RoaringBitmap, there is no specification for now: it may change from one Java version to + * another, and from one RoaringBitmap version to another. + *
+ * The current bitmap is overwritten. + * + * @param in the DataInput stream + * @throws IOException Signals that an I/O exception has occurred. + */ + public void deserialize(DataInput in, int bitmapType) throws IOException { + this.clear(); + highToBitmap = new TreeMap<>(); + + if (bitmapType == BitmapValue.BITMAP32) { + RoaringBitmap provider = new RoaringBitmap(); + provider.deserialize(in); + highToBitmap.put(0, provider); + return; + } + + if (bitmapType != BitmapValue.BITMAP64) { + throw new InvalidRoaringFormat("invalid bitmap type"); + } + + long nbHighs = Codec.decodeVarint64(in); + for (int i = 0; i < nbHighs; i++) { + // keep the same behavior with little-end serialize + int high = Integer.reverseBytes(in.readInt()); + RoaringBitmap provider = new RoaringBitmap(); + provider.deserialize(in); + highToBitmap.put(high, provider); + } + + resetPerfHelpers(); + } + + public boolean is32BitsEnough() { + return highToBitmap.size() == 1 && highToBitmap.get(0) != null; + } + +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/AutoType.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/AutoType.java new file mode 100644 index 00000000..f65a9fdf --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/AutoType.java @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). + */ + +package org.apache.doris.common.jmockit; + +import java.util.HashMap; +import java.util.Map; + +/** + * Helper class to convert type between Java's wrapper type and primitive type + * There are 8 wrapper/primitive types in Java: + * |Wrapped Type |Primitive Type + * -------------------------------------- + * |Boolean |boolean + * |Character |char + * |Byte |byte + * |Short |short + * |Integer |int + * |Float |float + * |Long |longFieldReflection + * |Double |double + *
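A few illustrative expectations for the accessors defined below (not part of the patch):

AutoType.isWrapperOfPrimitiveType(Integer.class);   // true
AutoType.getPrimitiveType(Integer.class);           // int.class
AutoType.getWrapperType(long.class);                // Long.class
AutoType.getPrimitiveType(String.class);            // null, String is not a wrapper type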
+ * Copied from Apache Doris + */ +public class AutoType { + private static final Map, Class> PRIMITIVE_TO_WRAPPER = new HashMap(); + private static final Map, Class> WRAPPER_TO_PRIMITIVE = new HashMap(); + + static { + WRAPPER_TO_PRIMITIVE.put(Boolean.class, Boolean.TYPE); + WRAPPER_TO_PRIMITIVE.put(Character.class, Character.TYPE); + WRAPPER_TO_PRIMITIVE.put(Byte.class, Byte.TYPE); + WRAPPER_TO_PRIMITIVE.put(Short.class, Short.TYPE); + WRAPPER_TO_PRIMITIVE.put(Integer.class, Integer.TYPE); + WRAPPER_TO_PRIMITIVE.put(Float.class, Float.TYPE); + WRAPPER_TO_PRIMITIVE.put(Long.class, Long.TYPE); + WRAPPER_TO_PRIMITIVE.put(Double.class, Double.TYPE); + + PRIMITIVE_TO_WRAPPER.put(Boolean.TYPE, Boolean.class); + PRIMITIVE_TO_WRAPPER.put(Character.TYPE, Character.class); + PRIMITIVE_TO_WRAPPER.put(Byte.TYPE, Byte.class); + PRIMITIVE_TO_WRAPPER.put(Short.TYPE, Short.class); + PRIMITIVE_TO_WRAPPER.put(Integer.TYPE, Integer.class); + PRIMITIVE_TO_WRAPPER.put(Float.TYPE, Float.class); + PRIMITIVE_TO_WRAPPER.put(Long.TYPE, Long.class); + PRIMITIVE_TO_WRAPPER.put(Double.TYPE, Double.class); + } + + public static boolean isWrapperOfPrimitiveType(Class type) { + return WRAPPER_TO_PRIMITIVE.containsKey(type); + } + + public static Class getPrimitiveType(Class wrapperType) { + return WRAPPER_TO_PRIMITIVE.get(wrapperType); + } + + public static Class getWrapperType(Class primitiveType) { + return PRIMITIVE_TO_WRAPPER.get(primitiveType); + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ConstructorReflection.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ConstructorReflection.java new file mode 100644 index 00000000..4b437ce4 --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ConstructorReflection.java @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). + */ + +package org.apache.doris.common.jmockit; + +import java.lang.reflect.AccessibleObject; +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; + +/** + * Modify from mockit.internal.util.ConstructorReflection JMockit v1.13 + * Util class to invoke constructor of specified class. + *
+ * Copied from Apache Doris + */ +public final class ConstructorReflection { + + private ConstructorReflection() { + } + + /** + * invoke the {@constructor} with parameters {@initArgs}. + */ + public static T invoke(Constructor constructor, Object... initArgs) { + if (constructor == null || initArgs == null) { + throw new IllegalArgumentException(); + } + makeAccessible(constructor); + + try { + return constructor.newInstance(initArgs); + } catch (InstantiationException e) { + throw new RuntimeException(e); + } catch (IllegalAccessException e) { + throw new RuntimeException(e); + } catch (InvocationTargetException e) { + Throwable cause = e.getCause(); + if (cause instanceof Error) { + throw (Error) cause; + } else if (cause instanceof RuntimeException) { + throw (RuntimeException) cause; + } else { + throw new IllegalStateException("Should never get here", cause); + } + } + } + + /** + * invoke the constructor with parameters {@nonNullArgs Object...}. + */ + public static T newInstance(Class aClass, Object... nonNullArgs) { + if (aClass == null || nonNullArgs == null) { + throw new IllegalArgumentException(); + } else { + Class[] argTypes = ParameterReflection.getArgumentTypesFromArgumentValues(nonNullArgs); + Constructor constructor = findCompatibleConstructor(aClass, argTypes); + return invoke(constructor, nonNullArgs); + } + } + + /** + * invoke the constructor with no parameters of {@aClass Class}. + */ + private static T newInstance(Class aClass) { + return (T) newInstance((Class) aClass, ParameterReflection.NO_PARAMETERS); + } + + /** + * invoke the default constructor of {@aClass Class}. + * if the default constructor is not available, try to invoke the one constructor with no parameters. + */ + public static T newInstanceUsingDefaultConstructor(Class aClass) { + if (aClass == null) { + throw new IllegalArgumentException(); + } + try { + return aClass.newInstance(); + } catch (InstantiationException e) { + throw new RuntimeException(e); + } catch (IllegalAccessException e) { + return newInstance(aClass); + } + } + + /** + * invoke the default constructor of {@aClass Class}. + */ + public static T newInstanceUsingDefaultConstructorIfAvailable(Class aClass) { + if (aClass == null) { + throw new IllegalArgumentException(); + } + try { + return aClass.newInstance(); + } catch (InstantiationException e) { + return null; + } catch (IllegalAccessException e) { + return null; + } + } + + /** + * invoke inner-class constructor with outer-class instance {@outerInstance} and parameters {@nonNullArgs}. + */ + public static T newInnerInstance(Class innerClass, Object outerInstance, Object... nonNullArgs) { + if (innerClass == null || outerInstance == null || nonNullArgs == null) { + throw new IllegalArgumentException(); + } else { + Object[] initArgs = ParameterReflection.argumentsWithExtraFirstValue(nonNullArgs, outerInstance); + return newInstance(innerClass, initArgs); + } + } + + /** + * Get non-inner-class constructor with {@argTypes Class[]}. + * if more than one constructor was found, choose the more specific one. (i.e. constructor with parameters that have more concrete types is more specific) + * if no constructor was found, will check if {@theClass} is a inner class. Then a IllegalArgumentException exception will be thrown. 
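A self-contained sketch of the constructor-resolution behaviour described above; the Widget class is hypothetical and exists only for illustration:

import org.apache.doris.common.jmockit.ConstructorReflection;

public class ConstructorReflectionSketch {
    static class Widget {
        Widget() {}
        Widget(Object o) {}
        Widget(String s) {}   // more specific when a String argument is supplied
    }

    public static void main(String[] args) {
        // Both Widget(Object) and Widget(String) accept a String argument;
        // the more specific Widget(String) is chosen.
        Widget byArgs = ConstructorReflection.newInstance(Widget.class, "name");
        // Falls back to the accessible no-arg constructor.
        Widget byDefault = ConstructorReflection.newInstanceUsingDefaultConstructor(Widget.class);
    }
}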
+ */ + private static Constructor findCompatibleConstructor(Class theClass, Class[] argTypes) { + if (theClass == null || argTypes == null) { + throw new IllegalArgumentException(); + } + Constructor found = null; + Class[] foundParameters = null; + Constructor[] declaredConstructors = theClass.getDeclaredConstructors(); + Constructor[] declaredConstructorsArray = declaredConstructors; + + for (Constructor declaredConstructor : declaredConstructorsArray) { + Class[] declaredParamTypes = declaredConstructor.getParameterTypes(); + int gap = declaredParamTypes.length - argTypes.length; + if (gap == 0 && (ParameterReflection.matchesParameterTypes(declaredParamTypes, argTypes) + || ParameterReflection.acceptsArgumentTypes(declaredParamTypes, argTypes)) + && + (found == null || ParameterReflection.hasMoreSpecificTypes(declaredParamTypes, foundParameters))) { + found = (Constructor) declaredConstructor; + foundParameters = declaredParamTypes; + } + } + + if (found != null) { + return found; + } else { + Class declaringClass = theClass.getDeclaringClass(); + Class[] paramTypes = declaredConstructors[0].getParameterTypes(); + // check if this constructor is belong to a inner class + // the parameter[0] of inner class's constructor is a instance of outer class + if (paramTypes[0] == declaringClass && paramTypes.length > argTypes.length) { + throw new IllegalArgumentException( + "Invalid instantiation of inner class; use newInnerInstance instead"); + } else { + String argTypesDesc = ParameterReflection.getParameterTypesDescription(argTypes); + throw new IllegalArgumentException( + "No compatible constructor found: " + theClass.getSimpleName() + argTypesDesc); + } + } + } + + // ensure that field is accessible + public static void makeAccessible(AccessibleObject classMember) { + if (!classMember.isAccessible()) { + classMember.setAccessible(true); + } + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/Deencapsulation.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/Deencapsulation.java new file mode 100644 index 00000000..74362e0c --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/Deencapsulation.java @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). + */ + +package org.apache.doris.common.jmockit; + +/** + * Modify from mockit.internal.util.Deencapsulation JMockit ver1.13 + *
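Typical test-side usage of this facade looks roughly like the sketch below; the Counter class is hypothetical and only illustrates the accessors defined in this file:

import org.apache.doris.common.jmockit.Deencapsulation;

public class DeencapsulationSketch {
    static class Counter {
        private int count = 1;
        private int bump(int by) { return count += by; }
    }

    public static void main(String[] args) {
        Counter c = new Counter();
        int before = Deencapsulation.getField(c, "count");   // reads the private field: 1
        Deencapsulation.setField(c, "count", 41);            // writes the private field
        int after = Deencapsulation.invoke(c, "bump", 1);    // calls the private method: 42
    }
}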
+ * Copied from Apache Doris + */ +public final class Deencapsulation { + private Deencapsulation() { + } + + public static T getField(Object objectWithField, String fieldName) { + return FieldReflection.getField(objectWithField.getClass(), fieldName, objectWithField); + } + + public static T getField(Object objectWithField, Class fieldType) { + return FieldReflection.getField(objectWithField.getClass(), fieldType, objectWithField); + } + + public static T getField(Class classWithStaticField, String fieldName) { + return FieldReflection.getField(classWithStaticField, fieldName, null); + } + + public static T getField(Class classWithStaticField, Class fieldType) { + return FieldReflection.getField(classWithStaticField, fieldType, null); + } + + public static void setField(Object objectWithField, String fieldName, Object fieldValue) { + FieldReflection.setField(objectWithField.getClass(), objectWithField, fieldName, fieldValue); + } + + public static void setField(Object objectWithField, Object fieldValue) { + FieldReflection.setField(objectWithField.getClass(), objectWithField, null, fieldValue); + } + + public static void setField(Class classWithStaticField, String fieldName, Object fieldValue) { + FieldReflection.setField(classWithStaticField, null, fieldName, fieldValue); + } + + public static void setField(Class classWithStaticField, Object fieldValue) { + FieldReflection.setField(classWithStaticField, null, null, fieldValue); + } + + public static T invoke(Object objectWithMethod, String methodName, Object... nonNullArgs) { + Class theClass = objectWithMethod.getClass(); + return MethodReflection.invoke(theClass, objectWithMethod, methodName, nonNullArgs); + } + + public static T invoke(Class classWithStaticMethod, String methodName, Object... nonNullArgs) { + return MethodReflection.invoke(classWithStaticMethod, null, methodName, nonNullArgs); + } + + public static T newInstance(Class classToInstantiate, Object... nonNullArgs) { + return ConstructorReflection.newInstance(classToInstantiate, nonNullArgs); + } + + public static T newInnerInstance(Class innerClassToInstantiate, Object outerClassInstance, Object... nonNullArgs) { + return ConstructorReflection.newInnerInstance(innerClassToInstantiate, outerClassInstance, nonNullArgs); + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/FieldReflection.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/FieldReflection.java new file mode 100644 index 00000000..04c6d9cd --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/FieldReflection.java @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). + */ + +package org.apache.doris.common.jmockit; + +import java.lang.reflect.AccessibleObject; +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; +import java.lang.reflect.ParameterizedType; +import java.lang.reflect.Type; +import java.lang.reflect.TypeVariable; + +/** + * Modify from mockit.internal.util.FieldReflection JMockit v1.13 + * Util class to set and get the value of specified field. + *
+ * Copied from Apache Doris + */ +public final class FieldReflection { + private FieldReflection() { + } + + /** + * Get field's value with field's name. + */ + public static T getField(Class theClass, String fieldName, Object targetObject) { + if (theClass == null || fieldName == null || targetObject == null) { + throw new IllegalStateException(); + } + Field field = getDeclaredField(theClass, fieldName, targetObject != null); + return getFieldValue(field, targetObject); + } + + /** + * Get field's value with field's type. + */ + public static T getField(Class theClass, Class fieldType, Object targetObject) { + if (theClass == null || fieldType == null) { + throw new IllegalStateException(); + } + Field field = getDeclaredField(theClass, fieldType, targetObject != null, false); + return getFieldValue(field, targetObject); + } + + /** + * Get field's value with field's type. + */ + public static T getField(Class theClass, Type fieldType, Object targetObject) { + if (theClass == null || fieldType == null) { + throw new IllegalStateException(); + } + Field field = getDeclaredField(theClass, fieldType, targetObject != null, false); + return getFieldValue(field, targetObject); + } + + /** + * Modify field's value in targetObject. + * If {@fieldName String} is null, will try to set field with field's type. + */ + public static Field setField(Class theClass, Object targetObject, String fieldName, Object fieldValue) { + if (theClass == null) { + throw new IllegalArgumentException(); + } + boolean instanceField = targetObject != null; + Field field; + if (fieldName != null) { + field = getDeclaredField(theClass, fieldName, instanceField); + } else { + if (fieldValue == null) { + throw new IllegalArgumentException("Missing field value when setting field by type"); + } + + field = getDeclaredField(theClass, fieldValue.getClass(), instanceField, true); + } + + setFieldValue(field, targetObject, fieldValue); + return field; + } + + /** + * Get field by field's name. + * If no field is found in this class, it will continue to look up its super class. + * If {@instanceField boolean} is true, will only search for the non-static field. + */ + private static Field getDeclaredField(Class theClass, String fieldName, boolean instanceField) { + if (theClass == null || fieldName == null) { + throw new IllegalStateException(); + } + try { + return theClass.getDeclaredField(fieldName); + } catch (NoSuchFieldException e) { + Class superClass = theClass.getSuperclass(); + if (superClass != null && superClass != Object.class) { + return getDeclaredField(superClass, fieldName, instanceField); + } else { + String kind = instanceField ? "instance" : "static"; + throw new IllegalArgumentException("No " + kind + " field of name \"" + fieldName + "\" found in " + theClass); + } + } + } + + /** + * Get field by field's type. + * If no field is found in this class, it will continue to look up its super class. + * If {@instanceField boolean} is true, will only search for the non-static field. + * If {@forAssignment boolean} is true, will compare its super type with desiredType. 
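A sketch of the two lookup modes described above, by name and by type, using hypothetical classes:

import org.apache.doris.common.jmockit.FieldReflection;

public class FieldReflectionSketch {
    static class Base { private String name = "base"; }
    static class Derived extends Base { private int size = 3; }

    public static void main(String[] args) {
        Derived d = new Derived();
        // By name: searches Derived first, then walks up to Base.
        String name = FieldReflection.getField(Derived.class, "name", d);
        // By type: finds the single int field declared in Derived
        // (more than one matching field in the same class is rejected).
        Integer size = FieldReflection.getField(Derived.class, int.class, d);
        // Writes go through the same lookup.
        FieldReflection.setField(Derived.class, d, "size", 42);
    }
}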
+ */ + private static Field getDeclaredField(Class theClass, Type desiredType, boolean instanceField, boolean forAssignment) { + if (theClass == null || desiredType == null) { + throw new IllegalStateException(); + } + Field found = getDeclaredFieldInSingleClass(theClass, desiredType, instanceField, forAssignment); + if (found == null) { + Class superClass = theClass.getSuperclass(); + if (superClass != null && superClass != Object.class) { + return getDeclaredField(superClass, desiredType, instanceField, forAssignment); + } else { + StringBuilder errorMsg = new StringBuilder(instanceField ? "Instance" : "Static"); + String typeName = getTypeName(desiredType); + errorMsg.append(" field of type ").append(typeName).append(" not found in ").append(theClass); + throw new IllegalArgumentException(errorMsg.toString()); + } + } else { + return found; + } + } + + /** + * Get field by field's type. + * There is only one field is expected to be found in a single class. + * If {@instanceField boolean} is true, will only search for the non-static field. + * If {@forAssignment boolean} is true, will compare its super type with desiredType. + * If more than one field are found, a IllegalArgumentException will be thrown. + */ + private static Field getDeclaredFieldInSingleClass(Class theClass, Type desiredType, boolean instanceField, boolean forAssignment) { + if (theClass == null || desiredType == null) { + throw new IllegalStateException(); + } + Field found = null; + Field[] fields = theClass.getDeclaredFields(); + + for (Field field : fields) { + if (!field.isSynthetic()) { + Type fieldType = field.getGenericType(); + if (instanceField != Modifier.isStatic(field.getModifiers()) && isCompatibleFieldType(fieldType, desiredType, forAssignment)) { + if (found != null) { + String message = errorMessageForMoreThanOneFieldFound(desiredType, instanceField, forAssignment, found, field); + throw new IllegalArgumentException(message); + } + + found = field; + } + } + } + + return found; + } + + /** + * return true if the {@fieldType} is compatible with {@desiredType}. + * If {@forAssignment} is true, will compare its super type with desiredType. + * If {@forAssignment} is false, will also compare it with desiredType's super type. + */ + private static boolean isCompatibleFieldType(Type fieldType, Type desiredType, boolean forAssignment) { + if (fieldType == null || desiredType == null) { + throw new IllegalStateException(); + } + Class fieldClass = getClassType(fieldType); + Class desiredClass = getClassType(desiredType); + if (isSameType(desiredClass, fieldClass)) { + return true; + } else if (forAssignment) { + return fieldClass.isAssignableFrom(desiredClass); + } else { + return desiredClass.isAssignableFrom(fieldClass) || fieldClass.isAssignableFrom(desiredClass); + } + } + + private static String errorMessageForMoreThanOneFieldFound(Type desiredFieldType, boolean instanceField, boolean forAssignment, Field firstField, Field secondField) { + return "More than one " + (instanceField ? "instance" : "static") + " field " + (forAssignment ? "to" : "from") + + " which a value of type " + + getTypeName(desiredFieldType) + (forAssignment ? 
" can be assigned" : " can be read") + " exists in " + + secondField.getDeclaringClass() + ": " + firstField.getName() + ", " + secondField.getName(); + } + + private static String getTypeName(Type type) { + if (type == null) { + throw new IllegalStateException(); + } + Class classType = getClassType(type); + Class primitiveType = AutoType.getPrimitiveType(classType); + if (primitiveType != null) { + return primitiveType + " or " + classType.getSimpleName(); + } else { + String name = classType.getName(); + return name.startsWith("java.lang.") ? name.substring(10) : name; + } + } + + /** + * Get field in {@targetObject Object}. + */ + private static T getFieldValue(Field field, Object targetObject) { + if (field == null) { + throw new IllegalStateException(); + } + makeAccessible(field); + + try { + return (T) field.get(targetObject); + } catch (IllegalAccessException e) { + throw new RuntimeException(e); + } + } + + /** + * Modify field with value in {@targetObject Object}. + */ + public static void setFieldValue(Field field, Object targetObject, Object value) { + if (field == null) { + throw new IllegalStateException(); + } + try { + if (Modifier.isStatic(field.getModifiers()) && Modifier.isFinal(field.getModifiers())) { + throw new IllegalArgumentException("Do not allow to set static final field"); + } else { + makeAccessible(field); + field.set(targetObject, value); + } + + } catch (IllegalAccessException e) { + throw new RuntimeException(e); + } + } + + /* + private static void setStaticFinalField(Field field, Object value) throws IllegalAccessException { + if (field == null) { + throw new IllegalStateException(); + } + Field modifiersField; + try { + modifiersField = Field.class.getDeclaredField("modifiers"); + } catch (NoSuchFieldException e) { + throw new RuntimeException(e); + } + + modifiersField.setAccessible(true); + int nonFinalModifiers = modifiersField.getInt(field) - 16; + modifiersField.setInt(field, nonFinalModifiers); + FieldAccessor accessor = ReflectionFactory.getReflectionFactory().newFieldAccessor(field, false); + accessor.set((Object)null, value); + } + */ + + public static Class getClassType(Type declaredType) { + while (!(declaredType instanceof Class)) { + if (declaredType instanceof ParameterizedType) { + return (Class) ((ParameterizedType) declaredType).getRawType(); + } + + if (!(declaredType instanceof TypeVariable)) { + throw new IllegalArgumentException("Type of unexpected kind: " + declaredType); + } + + declaredType = ((TypeVariable) declaredType).getBounds()[0]; + } + + return (Class) declaredType; + } + + // ensure that field is accessible + public static void makeAccessible(AccessibleObject classMember) { + if (!classMember.isAccessible()) { + classMember.setAccessible(true); + } + } + + // return true if the two types are same type. 
+ private static boolean isSameType(Class firstType, Class secondType) { + return firstType == secondType + || firstType.isPrimitive() && firstType == AutoType.getPrimitiveType(secondType) + || secondType.isPrimitive() && secondType == AutoType.getPrimitiveType(firstType); + } + +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/GeneratedClasses.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/GeneratedClasses.java new file mode 100644 index 00000000..1281f4ed --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/GeneratedClasses.java @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). + */ + +package org.apache.doris.common.jmockit; + +import java.lang.reflect.Proxy; + +/** + * Modify from mockit.internal.util.GeneratedClasses JMockit v1.13 + * Helper class to return type of mocked-object + *
+ * Copied from Apache Doris + */ +public final class GeneratedClasses { + private static final String IMPLCLASS_PREFIX = "$Impl_"; + private static final String SUBCLASS_PREFIX = "$Subclass_"; + + private GeneratedClasses() { + } + + static boolean isGeneratedImplementationClass(Class mockedType) { + return isGeneratedImplementationClass(mockedType.getName()); + } + + static boolean isGeneratedImplementationClass(String className) { + return className.contains(IMPLCLASS_PREFIX); + } + + static boolean isGeneratedSubclass(String className) { + return className.contains(SUBCLASS_PREFIX); + } + + static boolean isGeneratedClass(String className) { + return isGeneratedSubclass(className) || isGeneratedImplementationClass(className); + } + + static Class getMockedClassOrInterfaceType(Class aClass) { + if (!Proxy.isProxyClass(aClass) && !isGeneratedImplementationClass(aClass)) { + return isGeneratedSubclass(aClass.getName()) ? aClass.getSuperclass() : aClass; + } else { + return aClass.getInterfaces()[0]; + } + } + + static Class getMockedClass(Object mock) { + return getMockedClassOrInterfaceType(mock.getClass()); + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/MethodReflection.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/MethodReflection.java new file mode 100644 index 00000000..293e9816 --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/MethodReflection.java @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). + */ + +package org.apache.doris.common.jmockit; + +import java.lang.reflect.AccessibleObject; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.lang.reflect.Modifier; + +/** + * Modify from mockit.internal.util.MethodReflection JMockit v1.13 + * Util class to get and invoke method from specified class. + *
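A sketch of private-method invocation with this helper; overloads are resolved toward the most specific parameter types, as findCompatibleMethodInClass below describes (the Greeter class is hypothetical):

import org.apache.doris.common.jmockit.MethodReflection;

public class MethodReflectionSketch {
    static class Greeter {
        private String greet(Object who) { return "hello " + who; }
        private String greet(String who) { return "hi " + who; }   // more specific for a String argument
    }

    public static void main(String[] args) {
        // Private methods are made accessible; greet(String) wins over greet(Object)
        // because its parameter types are more specific for the supplied argument.
        String s = MethodReflection.invoke(Greeter.class, new Greeter(), "greet", "doris");
        // s == "hi doris"
    }
}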
+ * Copied from Apache Doris + */ +public final class MethodReflection { + private MethodReflection() { + } + + public static T invoke(Class theClass, Object targetInstance, String methodName, Object... methodArgs) { + if (theClass == null || methodName == null) { + throw new IllegalArgumentException(); + } + boolean staticMethod = targetInstance == null; + Class[] argTypes = ParameterReflection.getArgumentTypesFromArgumentValues(methodArgs); + Method method = staticMethod ? findCompatibleStaticMethod(theClass, methodName, argTypes) : + findCompatibleMethod(theClass, methodName, argTypes); + if (staticMethod && !Modifier.isStatic(method.getModifiers())) { + throw new IllegalArgumentException( + "Attempted to invoke non-static method without an instance to invoke it on"); + } else { + T result = invoke(targetInstance, method, methodArgs); + return result; + } + } + + public static T invoke(Object targetInstance, Method method, Object... methodArgs) { + if (method == null || methodArgs == null) { + throw new IllegalArgumentException(); + } + makeAccessible(method); + + try { + return (T) method.invoke(targetInstance, methodArgs); + } catch (IllegalAccessException e) { + throw new RuntimeException(e); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("Failure to invoke method: " + method, e); + } catch (InvocationTargetException e) { + Throwable cause = e.getCause(); + if (cause instanceof Error) { + throw (Error) cause; + } else if (cause instanceof RuntimeException) { + throw (RuntimeException) cause; + } else { + ThrowOfCheckedException.doThrow((Exception) cause); + return null; + } + } + } + + /** + * Get a static method with {@methodName String} and {@argTypes Class[]}. + * If no method was found, a IllegalArgumentException will be thrown. + */ + private static Method findCompatibleStaticMethod(Class theClass, String methodName, Class[] argTypes) { + if (theClass == null || methodName == null || argTypes == null) { + throw new IllegalArgumentException(); + } + Method methodFound = findCompatibleMethodInClass(theClass, methodName, argTypes); + if (methodFound != null) { + return methodFound; + } else { + String argTypesDesc = ParameterReflection.getParameterTypesDescription(argTypes); + throw new IllegalArgumentException("No compatible static method found: " + methodName + argTypesDesc); + } + } + + /** + * Get a non-static method with {@methodName String} and {@argTypes Class[]}. + */ + public static Method findCompatibleMethod(Class theClass, String methodName, Class[] argTypes) { + if (theClass == null || methodName == null || argTypes == null) { + throw new IllegalArgumentException(); + } + Method methodFound = findCompatibleMethodIfAvailable(theClass, methodName, argTypes); + if (methodFound != null) { + return methodFound; + } else { + String argTypesDesc = ParameterReflection.getParameterTypesDescription(argTypes); + throw new IllegalArgumentException("No compatible method found: " + methodName + argTypesDesc); + } + } + + /** + * Get method with {@methodName String} and {@argTypes Class[]} from {@theClass Class}. + * If more than one method is found, choose the more specific one. (i.e. 
method with parameters that have more concrete types is more specific) + */ + private static Method findCompatibleMethodInClass(Class theClass, String methodName, Class[] argTypes) { + if (theClass == null || methodName == null || argTypes == null) { + throw new IllegalArgumentException(); + } + Method found = null; + Class[] foundParamTypes = null; + Method[] methods = theClass.getDeclaredMethods(); + + for (Method declaredMethod : methods) { + if (declaredMethod.getName().equals(methodName)) { + Class[] declaredParamTypes = declaredMethod.getParameterTypes(); + int gap = declaredParamTypes.length - argTypes.length; + if (gap == 0 && (ParameterReflection.matchesParameterTypes(declaredParamTypes, argTypes) + || ParameterReflection.acceptsArgumentTypes(declaredParamTypes, argTypes)) + && (foundParamTypes == null + || ParameterReflection.hasMoreSpecificTypes(declaredParamTypes, foundParamTypes))) { + found = declaredMethod; + foundParamTypes = declaredParamTypes; + } + } + } + + return found; + } + + /** + * Get method with {@methodName String} and {@argTypes Class[]} from {@theClass Class} as well as its super class. + * If more than one method is found, choose the more specify one. (i.e. choose the method with parameters that have more concrete types) + */ + private static Method findCompatibleMethodIfAvailable(Class theClass, String methodName, Class[] argTypes) { + if (theClass == null || methodName == null || argTypes == null) { + throw new IllegalArgumentException(); + } + Method methodFound = null; + + while (true) { + Method compatibleMethod = findCompatibleMethodInClass(theClass, methodName, argTypes); + if (compatibleMethod != null && (methodFound == null || + ParameterReflection.hasMoreSpecificTypes(compatibleMethod.getParameterTypes(), + methodFound.getParameterTypes()))) { + methodFound = compatibleMethod; + } + + Class superClass = theClass.getSuperclass(); + if (superClass == null || superClass == Object.class) { + return methodFound; + } + + theClass = superClass; + } + } + + + // ensure that field is accessible + public static void makeAccessible(AccessibleObject classMember) { + if (!classMember.isAccessible()) { + classMember.setAccessible(true); + } + } + + // return true if the two types are same type. + private static boolean isSameType(Class firstType, Class secondType) { + return firstType == secondType + || firstType.isPrimitive() && firstType == AutoType.getPrimitiveType(secondType) + || secondType.isPrimitive() && secondType == AutoType.getPrimitiveType(firstType); + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ParameterReflection.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ParameterReflection.java new file mode 100644 index 00000000..6a6efc11 --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ParameterReflection.java @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). + */ + +package org.apache.doris.common.jmockit; + +import java.util.regex.Pattern; + +/** + * Modify from mockit.internal.util.ParameterReflection JMockit v1.13 + * Util class to verify parameter of methods. + *
+ * Copied from Apache Doris + */ +public final class ParameterReflection { + public static final Class[] NO_PARAMETERS = new Class[0]; + + public static final Pattern JAVA_LANG = Pattern.compile("java.lang.", 16); + + private ParameterReflection() { + } + + /** + * check if every member in {@declaredTypes} is completely equal to the corresponding member {@specifiedTypes}. + */ + static boolean matchesParameterTypes(Class[] declaredTypes, Class[] specifiedTypes) { + if (declaredTypes == null || specifiedTypes == null) { + throw new IllegalArgumentException(); + } + for (int i = 0; i < declaredTypes.length; ++i) { + Class declaredType = declaredTypes[i]; + Class specifiedType = specifiedTypes[i]; + if (!isSameType(declaredType, specifiedType)) { + return false; + } + } + + return true; + } + + /** + * check if every member in {@paramTypes} is acceptable to the corresponding member in {@argTypes}. + */ + static boolean acceptsArgumentTypes(Class[] paramTypes, Class[] argTypes) { + if (paramTypes == null || argTypes == null) { + throw new IllegalArgumentException(); + } + for (int i = 0; i < paramTypes.length; ++i) { + Class parType = paramTypes[i]; + Class argType = argTypes[i]; + if (!isSameType(parType, argType) && !parType.isAssignableFrom(argType)) { + return false; + } + } + + return true; + } + + /** + * Get all types from objects {@args}. + */ + static Class[] getArgumentTypesFromArgumentValues(Object... args) { + if (args == null) { + throw new IllegalArgumentException(); + } + if (args.length == 0) { + return NO_PARAMETERS; + } else { + Class[] argTypes = new Class[args.length]; + + for (int i = 0; i < args.length; ++i) { + argTypes[i] = getArgumentTypeFromArgumentValue(i, args); + } + + return argTypes; + } + } + + /** + * Get type from {@args} by index. + */ + static Class getArgumentTypeFromArgumentValue(int i, Object[] args) { + Object arg = args[i]; + if (arg == null) { + throw new IllegalArgumentException("Invalid null value passed as argument " + i); + } else { + Class argType; + if (arg instanceof Class) { + argType = (Class) arg; + args[i] = null; + } else { + argType = GeneratedClasses.getMockedClass(arg); + } + + return argType; + } + } + + /** + * return true if {@currentTypes} is more specific than {@previousTypes}. + */ + static boolean hasMoreSpecificTypes(Class[] currentTypes, Class[] previousTypes) { + if (currentTypes == null || previousTypes == null) { + throw new IllegalArgumentException(); + } + for (int i = 0; i < currentTypes.length; ++i) { + Class current = wrappedIfPrimitive(currentTypes[i]); + Class previous = wrappedIfPrimitive(previousTypes[i]); + if (current != previous && previous.isAssignableFrom(current)) { + return true; + } + } + + return false; + } + + /** + * return the type names of {@paramTypes} wrapped in brackets. + */ + static String getParameterTypesDescription(Class[] paramTypes) { + if (paramTypes == null) { + throw new IllegalArgumentException(); + } + StringBuilder paramTypesDesc = new StringBuilder(200); + paramTypesDesc.append('('); + String sep = ""; + + for (Class paramType : paramTypes) { + String typeName = JAVA_LANG.matcher(paramType.getCanonicalName()).replaceAll(""); + paramTypesDesc.append(sep).append(typeName); + sep = ", "; + } + + paramTypesDesc.append(')'); + return paramTypesDesc.toString(); + } + + /** + * return real parameters array of inner-class belong to the outer-class instance {@firstValue Object}. + * the parameter[0] of a inner-class constructor is always the instance of its outer-class. 
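This helper backs ConstructorReflection.newInnerInstance; a hypothetical sketch of that path:

import org.apache.doris.common.jmockit.ConstructorReflection;

public class InnerInstanceSketch {
    static class Outer {
        class Inner {                         // non-static: the compiled constructor is Inner(Outer, String)
            private final String tag;
            Inner(String tag) { this.tag = tag; }
        }
    }

    public static void main(String[] args) {
        Outer outer = new Outer();
        // argumentsWithExtraFirstValue prepends `outer`, matching Inner's (Outer, String) constructor.
        Outer.Inner inner = ConstructorReflection.newInnerInstance(Outer.Inner.class, outer, "t1");
    }
}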
+ */ + static Object[] argumentsWithExtraFirstValue(Object[] args, Object firstValue) { + Object[] args2 = new Object[1 + args.length]; + args2[0] = firstValue; + System.arraycopy(args, 0, args2, 1, args.length); + return args2; + } + + // return wrapped type if its type is primitive. + private static Class wrappedIfPrimitive(Class parameterType) { + if (parameterType.isPrimitive()) { + Class wrapperType = AutoType.getWrapperType(parameterType); + + assert wrapperType != null; + + return wrapperType; + } else { + return parameterType; + } + } + + // return true if the two types are same type. + private static boolean isSameType(Class firstType, Class secondType) { + return firstType == secondType + || firstType.isPrimitive() && firstType == AutoType.getPrimitiveType(secondType) + || secondType.isPrimitive() && secondType == AutoType.getPrimitiveType(firstType); + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ThrowOfCheckedException.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ThrowOfCheckedException.java new file mode 100644 index 00000000..4dfc44ae --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/common/jmockit/ThrowOfCheckedException.java @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2006 JMockit developers + * This file is subject to the terms of the MIT license (see LICENSE.txt). + */ + +package org.apache.doris.common.jmockit; + +/** + * Modify from mockit.internal.reflection.ThrowOfCheckedException JMockit v1.13 + */ +public final class ThrowOfCheckedException { + private static Exception exceptionToThrow; + + ThrowOfCheckedException() throws Exception { + throw exceptionToThrow; + } + + public static synchronized void doThrow(Exception checkedException) { + exceptionToThrow = checkedException; + ConstructorReflection.newInstanceUsingDefaultConstructor(ThrowOfCheckedException.class); + } +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java new file mode 100644 index 00000000..9cca8650 --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/config/EtlJobConfig.java @@ -0,0 +1,513 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
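The EtlJobConfig class that follows defines the ETL output layout; as a quick reference, a sketch of how its path helpers combine (all literal values below are hypothetical):

import org.apache.doris.config.EtlJobConfig;

public class EtlPathSketch {
    public static void main(String[] args) throws Exception {
        String outputPath = EtlJobConfig.getOutputPath("hdfs://ns1/doris_etl", 10001L, "label_20240601", 12345L);
        // -> hdfs://ns1/doris_etl/jobs/10001/label_20240601/12345

        String filePattern = EtlJobConfig.getOutputFilePattern("label_20240601", EtlJobConfig.FilePatternVersion.V1);
        // -> V1.label_20240601.%d.%d.%d.%d.%d.parquet (tableId.partitionId.indexId.bucket.schemaHash)

        String dppResult = EtlJobConfig.getDppResultFilePath(outputPath);
        // -> hdfs://ns1/doris_etl/jobs/10001/label_20240601/12345/dpp_result.json

        String tabletMeta = EtlJobConfig.getTabletMetaStr(
                outputPath + "/V1.label_20240601.10002.10003.10004.0.12345.parquet");
        // -> 10002.10003.10004.0.12345
    }
}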
+ +package org.apache.doris.config; + + +import com.google.common.base.Strings; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import com.google.gson.ExclusionStrategy; +import com.google.gson.FieldAttributes; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.google.gson.annotations.SerializedName; + +import java.io.Serializable; +import java.util.Comparator; +import java.util.List; +import java.util.Map; + +/** + * Copied from Apache Doris org.apache.doris.sparkdpp.EtlJobConfig + */ +public class EtlJobConfig implements Serializable { + // global dict + public static final String GLOBAL_DICT_TABLE_NAME = "doris_global_dict_table_%d"; + public static final String DISTINCT_KEY_TABLE_NAME = "doris_distinct_key_table_%d_%s"; + public static final String DORIS_INTERMEDIATE_HIVE_TABLE_NAME = "doris_intermediate_hive_table_%d_%s"; + // tableId.partitionId.indexId.bucket.schemaHash + public static final String TABLET_META_FORMAT = "%d.%d.%d.%d.%d"; + public static final String ETL_OUTPUT_FILE_FORMAT = "parquet"; + // dpp result + public static final String DPP_RESULT_NAME = "dpp_result.json"; + // hdfsEtlPath/jobs/dbId/loadLabel/PendingTaskSignature + private static final String ETL_OUTPUT_PATH_FORMAT = "%s/jobs/%d/%s/%d"; + private static final String ETL_OUTPUT_FILE_NAME_DESC_V1 = + "version.label.tableId.partitionId.indexId.bucket.schemaHash.parquet"; + @SerializedName(value = "tables") + public Map tables; + @SerializedName(value = "outputPath") + public String outputPath; + @SerializedName(value = "outputFilePattern") + public String outputFilePattern; + @SerializedName(value = "label") + public String label; + @SerializedName(value = "properties") + public EtlJobProperty properties; + @SerializedName(value = "configVersion") + public ConfigVersion configVersion; + + /** + * for json deserialize + */ + public EtlJobConfig() { + } + + public EtlJobConfig(Map tables, String outputFilePattern, String label, EtlJobProperty properties) { + this.tables = tables; + // set outputPath when submit etl job + this.outputPath = null; + this.outputFilePattern = outputFilePattern; + this.label = label; + this.properties = properties; + this.configVersion = ConfigVersion.V1; + } + + public static String getOutputPath(String hdfsEtlPath, long dbId, String loadLabel, long taskSignature) { + return String.format(ETL_OUTPUT_PATH_FORMAT, hdfsEtlPath, dbId, loadLabel, taskSignature); + } + + public static String getOutputFilePattern(String loadLabel, FilePatternVersion filePatternVersion) { + return String.format("%s.%s.%s.%s", filePatternVersion.name(), loadLabel, TABLET_META_FORMAT, + ETL_OUTPUT_FILE_FORMAT); + } + + public static String getDppResultFilePath(String outputPath) { + return outputPath + "/" + DPP_RESULT_NAME; + } + + public static String getTabletMetaStr(String filePath) throws Exception { + String fileName = filePath.substring(filePath.lastIndexOf("/") + 1); + String[] fileNameArr = fileName.split("\\."); + // check file version + switch (FilePatternVersion.valueOf(fileNameArr[0])) { + case V1: + // version.label.tableId.partitionId.indexId.bucket.schemaHash.parquet + if (fileNameArr.length != ETL_OUTPUT_FILE_NAME_DESC_V1.split("\\.").length) { + throw new Exception( + "etl output file name error, format: " + ETL_OUTPUT_FILE_NAME_DESC_V1 + ", name: " + + fileName); + } + long tableId = Long.parseLong(fileNameArr[2]); + long partitionId = Long.parseLong(fileNameArr[3]); + long indexId = Long.parseLong(fileNameArr[4]); + int 
bucket = Integer.parseInt(fileNameArr[5]); + int schemaHash = Integer.parseInt(fileNameArr[6]); + // tableId.partitionId.indexId.bucket.schemaHash + return String.format(TABLET_META_FORMAT, tableId, partitionId, indexId, bucket, schemaHash); + default: + throw new Exception("etl output file version error. version: " + fileNameArr[0]); + } + } + + public static EtlJobConfig configFromJson(String jsonConfig) { + return new Gson().fromJson(jsonConfig, EtlJobConfig.class); + } + + public String configToJson() { + Gson gson = + new GsonBuilder().addDeserializationExclusionStrategy(new HiddenAnnotationExclusionStrategy()).create(); + return gson.toJson(this); + } + + @Override + public String toString() { + return "EtlJobConfig{" + "tables=" + tables + ", outputPath='" + outputPath + '\'' + ", outputFilePattern='" + + outputFilePattern + '\'' + ", label='" + label + '\'' + ", properties=" + properties + ", version=" + + configVersion + '}'; + } + + public String getOutputPath() { + return outputPath; + } + + public enum ConfigVersion { + V1 + } + + public enum FilePatternVersion { + V1 + } + + public enum SourceType { + FILE, HIVE + } + + public static class EtlJobProperty implements Serializable { + @SerializedName(value = "strictMode") + public boolean strictMode; + @SerializedName(value = "timezone") + public String timezone; + + @Override + public String toString() { + return "EtlJobProperty{" + "strictMode=" + strictMode + ", timezone='" + timezone + '\'' + '}'; + } + } + + public static class EtlTable implements Serializable { + @SerializedName(value = "indexes") + public List indexes; + @SerializedName(value = "partitionInfo") + public EtlPartitionInfo partitionInfo; + @SerializedName(value = "fileGroups") + public List fileGroups; + + /** + * for json deserialize + */ + public EtlTable() { + } + + public EtlTable(List etlIndexes, EtlPartitionInfo etlPartitionInfo) { + this.indexes = etlIndexes; + this.partitionInfo = etlPartitionInfo; + this.fileGroups = Lists.newArrayList(); + } + + public void addFileGroup(EtlFileGroup etlFileGroup) { + fileGroups.add(etlFileGroup); + } + + @Override + public String toString() { + return "EtlTable{" + "indexes=" + indexes + ", partitionInfo=" + partitionInfo + ", fileGroups=" + + fileGroups + '}'; + } + } + + public static class EtlColumn implements Serializable { + @SerializedName(value = "columnName") + public String columnName; + @SerializedName(value = "columnType") + public String columnType; + @SerializedName(value = "isAllowNull") + public boolean isAllowNull; + @SerializedName(value = "isKey") + public boolean isKey; + @SerializedName(value = "aggregationType") + public String aggregationType; + @SerializedName(value = "defaultValue") + public String defaultValue; + @SerializedName(value = "stringLength") + public int stringLength; + @SerializedName(value = "precision") + public int precision; + @SerializedName(value = "scale") + public int scale; + @SerializedName(value = "defineExpr") + public String defineExpr; + + // for unit test + public EtlColumn() { + } + + public EtlColumn(String columnName, String columnType, boolean isAllowNull, boolean isKey, + String aggregationType, String defaultValue, int stringLength, int precision, int scale) { + this.columnName = columnName; + this.columnType = columnType; + this.isAllowNull = isAllowNull; + this.isKey = isKey; + this.aggregationType = aggregationType; + this.defaultValue = defaultValue; + this.stringLength = stringLength; + this.precision = precision; + this.scale = scale; + this.defineExpr 
= null; + } + + @Override + public String toString() { + return "EtlColumn{" + "columnName='" + columnName + '\'' + ", columnType='" + columnType + '\'' + + ", isAllowNull=" + isAllowNull + ", isKey=" + isKey + ", aggregationType='" + aggregationType + + '\'' + ", defaultValue='" + defaultValue + '\'' + ", stringLength=" + stringLength + + ", precision=" + precision + ", scale=" + scale + ", defineExpr='" + defineExpr + '\'' + '}'; + } + } + + public static class EtlIndexComparator implements Comparator { + @Override + public int compare(EtlIndex a, EtlIndex b) { + int diff = a.columns.size() - b.columns.size(); + if (diff == 0) { + return 0; + } else if (diff > 0) { + return 1; + } else { + return -1; + } + } + } + + public static class EtlIndex implements Serializable { + @SerializedName(value = "indexId") + public long indexId; + @SerializedName(value = "columns") + public List columns; + @SerializedName(value = "schemaHash") + public int schemaHash; + @SerializedName(value = "indexType") + public String indexType; + @SerializedName(value = "isBaseIndex") + public boolean isBaseIndex; + @SerializedName(value = "schemaVersion") + public int schemaVersion; + + /** + * for json deserialize + */ + public EtlIndex() { + } + + public EtlIndex(long indexId, List etlColumns, int schemaHash, String indexType, boolean isBaseIndex, + int schemaVersion) { + this.indexId = indexId; + this.columns = etlColumns; + this.schemaHash = schemaHash; + this.indexType = indexType; + this.isBaseIndex = isBaseIndex; + this.schemaVersion = schemaVersion; + } + + public EtlColumn getColumn(String name) { + for (EtlColumn column : columns) { + if (column.columnName.equals(name)) { + return column; + } + } + return null; + } + + @Override + public String toString() { + return "EtlIndex{" + "indexId=" + indexId + ", columns=" + columns + ", schemaHash=" + schemaHash + + ", indexType='" + indexType + '\'' + ", isBaseIndex=" + isBaseIndex + ", schemaVersion=" + + schemaVersion + '}'; + } + } + + public static class EtlPartitionInfo implements Serializable { + @SerializedName(value = "partitionType") + public String partitionType; + @SerializedName(value = "partitionColumnRefs") + public List partitionColumnRefs; + @SerializedName(value = "distributionColumnRefs") + public List distributionColumnRefs; + @SerializedName(value = "partitions") + public List partitions; + + /** + * for json deserialize + */ + public EtlPartitionInfo() { + } + + public EtlPartitionInfo(String partitionType, List partitionColumnRefs, + List distributionColumnRefs, List etlPartitions) { + this.partitionType = partitionType; + this.partitionColumnRefs = partitionColumnRefs; + this.distributionColumnRefs = distributionColumnRefs; + this.partitions = etlPartitions; + } + + @Override + public String toString() { + return "EtlPartitionInfo{" + "partitionType='" + partitionType + '\'' + ", partitionColumnRefs=" + + partitionColumnRefs + ", distributionColumnRefs=" + distributionColumnRefs + ", partitions=" + + partitions + '}'; + } + } + + public static class EtlPartition implements Serializable { + @SerializedName(value = "partitionId") + public long partitionId; + @SerializedName(value = "startKeys") + public List startKeys; + @SerializedName(value = "endKeys") + public List endKeys; + @SerializedName(value = "isMaxPartition") + public boolean isMaxPartition; + @SerializedName(value = "bucketNum") + public int bucketNum; + + /** + * for json deserialize + */ + public EtlPartition() { + } + + public EtlPartition(long partitionId, List startKeys, 
List endKeys, boolean isMaxPartition, + int bucketNum) { + this.partitionId = partitionId; + this.startKeys = startKeys; + this.endKeys = endKeys; + this.isMaxPartition = isMaxPartition; + this.bucketNum = bucketNum; + } + + @Override + public String toString() { + return "EtlPartition{" + "partitionId=" + partitionId + ", startKeys=" + startKeys + ", endKeys=" + + endKeys + ", isMaxPartition=" + isMaxPartition + ", bucketNum=" + bucketNum + '}'; + } + } + + public static class EtlFileGroup implements Serializable { + @SerializedName(value = "sourceType") + public SourceType sourceType = SourceType.FILE; + @SerializedName(value = "filePaths") + public List filePaths; + @SerializedName(value = "fileFieldNames") + public List fileFieldNames; + @SerializedName(value = "columnsFromPath") + public List columnsFromPath; + @SerializedName(value = "columnSeparator") + public String columnSeparator; + @SerializedName(value = "lineDelimiter") + public String lineDelimiter; + @SerializedName(value = "isNegative") + public boolean isNegative; + @SerializedName(value = "fileFormat") + public String fileFormat; + @SerializedName(value = "columnMappings") + public Map columnMappings; + @SerializedName(value = "where") + public String where; + @SerializedName(value = "partitions") + public List partitions; + @SerializedName(value = "hiveDbTableName") + public String hiveDbTableName; + @SerializedName(value = "hiveTableProperties") + public Map hiveTableProperties; + + // hive db table used in dpp, not serialized + // set with hiveDbTableName (no bitmap column) or IntermediateHiveTable (created by global dict builder) + // in spark etl job + public String dppHiveDbTableName; + + // for data infile path + public EtlFileGroup(SourceType sourceType, List filePaths, List fileFieldNames, + List columnsFromPath, String columnSeparator, String lineDelimiter, + boolean isNegative, String fileFormat, Map columnMappings, + String where, List partitions) { + this.sourceType = sourceType; + this.filePaths = filePaths; + this.fileFieldNames = fileFieldNames; + this.columnsFromPath = columnsFromPath; + this.columnSeparator = Strings.isNullOrEmpty(columnSeparator) ? 
"\t" : columnSeparator; + this.lineDelimiter = lineDelimiter; + this.isNegative = isNegative; + this.fileFormat = fileFormat; + this.columnMappings = columnMappings; + this.where = where; + this.partitions = partitions; + } + + // for data from table + public EtlFileGroup(SourceType sourceType, String hiveDbTableName, Map hiveTableProperties, + boolean isNegative, Map columnMappings, String where, + List partitions) { + this.sourceType = sourceType; + this.hiveDbTableName = hiveDbTableName; + this.hiveTableProperties = hiveTableProperties; + this.isNegative = isNegative; + this.columnMappings = columnMappings; + this.where = where; + this.partitions = partitions; + } + + @Override + public String toString() { + return "EtlFileGroup{" + "sourceType=" + sourceType + ", filePaths=" + filePaths + ", fileFieldNames=" + + fileFieldNames + ", columnsFromPath=" + columnsFromPath + ", columnSeparator='" + columnSeparator + + '\'' + ", lineDelimiter='" + lineDelimiter + '\'' + ", isNegative=" + isNegative + + ", fileFormat='" + fileFormat + '\'' + ", columnMappings=" + columnMappings + ", where='" + where + + '\'' + ", partitions=" + partitions + ", hiveDbTableName='" + hiveDbTableName + '\'' + + ", hiveTableProperties=" + hiveTableProperties + '}'; + } + } + + /** + * FunctionCallExpr = functionName(args) + * For compatibility with old designed functions used in Hadoop MapReduce etl + *

+ * expr is more general, like k1 + 1, not just FunctionCall + */ + public static class EtlColumnMapping implements Serializable { + + private static Map functionMap = + new ImmutableMap.Builder().put("md5sum", "md5").build(); + + @SerializedName(value = "functionName") + public String functionName; + @SerializedName(value = "args") + public List args; + @SerializedName(value = "expr") + public String expr; + + public EtlColumnMapping(String functionName, List args) { + this.functionName = functionName; + this.args = args; + } + + public EtlColumnMapping(String expr) { + this.expr = expr; + } + + public String toDescription() { + StringBuilder sb = new StringBuilder(); + if (functionName == null) { + sb.append(expr); + } else { + if (functionMap.containsKey(functionName)) { + sb.append(functionMap.get(functionName)); + } else { + sb.append(functionName); + } + sb.append("("); + if (args != null) { + for (String arg : args) { + sb.append(arg); + sb.append(","); + } + } + sb.deleteCharAt(sb.length() - 1); + sb.append(")"); + } + return sb.toString(); + } + + @Override + public String toString() { + return "EtlColumnMapping{" + "functionName='" + functionName + '\'' + ", args=" + args + ", expr=" + expr + + '}'; + } + } + + public static class HiddenAnnotationExclusionStrategy implements ExclusionStrategy { + public boolean shouldSkipField(FieldAttributes f) { + return f.getAnnotation(SerializedName.class) == null; + } + + @Override + public boolean shouldSkipClass(Class clazz) { + return false; + } + } + +} diff --git a/spark-load/spark-load-common/src/main/java/org/apache/doris/util/JsonUtils.java b/spark-load/spark-load-common/src/main/java/org/apache/doris/util/JsonUtils.java new file mode 100644 index 00000000..3d33e85b --- /dev/null +++ b/spark-load/spark-load-common/src/main/java/org/apache/doris/util/JsonUtils.java @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
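To make the V1 naming convention above concrete, a small sketch (the ids and the path prefix are made-up values, not taken from the patch) showing how getOutputFilePattern() and getTabletMetaStr() mirror each other:

package org.apache.doris.config;

public class EtlOutputNamingExample {

    public static void main(String[] args) throws Exception {
        // "V1.label1.%d.%d.%d.%d.%d.parquet"
        String pattern = EtlJobConfig.getOutputFilePattern("label1", EtlJobConfig.FilePatternVersion.V1);

        // Fill in tableId.partitionId.indexId.bucket.schemaHash to get a concrete file name.
        String fileName = String.format(pattern, 10001, 10002, 10003, 0, 123);

        // getTabletMetaStr() recovers tableId.partitionId.indexId.bucket.schemaHash from the name.
        String tabletMeta = EtlJobConfig.getTabletMetaStr("/spark-load/jobs/1/label1/9/" + fileName);
        System.out.println(tabletMeta); // 10001.10002.10003.0.123
    }
}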
+ +package org.apache.doris.util; + +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.MapperFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.json.JsonMapper; + +import java.io.File; +import java.io.IOException; + +/** + * json utilities + */ +public class JsonUtils { + + private static final ObjectMapper MAPPER = + JsonMapper.builder().enable(MapperFeature.ACCEPT_CASE_INSENSITIVE_ENUMS).build(); + + public static T readValue(String s, Class clazz) throws JsonProcessingException { + return MAPPER.readValue(s, clazz); + } + + public static T readValue(String s, TypeReference ref) throws JsonProcessingException { + return MAPPER.readValue(s, ref); + } + + public static T readValue(File file, Class clazz) throws IOException { + return MAPPER.readValue(file, clazz); + } + + public static T readValue(JsonParser parser, Class clazz) throws IOException { + return MAPPER.readValue(parser, clazz); + } + + public static T readValue(JsonParser parser, TypeReference ref) throws IOException { + return MAPPER.readValue(parser, ref); + } + + public static String writeValueAsString(Object o) throws JsonProcessingException { + return MAPPER.writeValueAsString(o); + } + + public static byte[] writeValueAsBytes(Object o) throws JsonProcessingException { + return MAPPER.writeValueAsBytes(o); + } + +} diff --git a/spark-load/spark-load-common/src/test/java/org/apache/doris/config/EtlJobConfigTest.java b/spark-load/spark-load-common/src/test/java/org/apache/doris/config/EtlJobConfigTest.java new file mode 100644 index 00000000..7d82e65f --- /dev/null +++ b/spark-load/spark-load-common/src/test/java/org/apache/doris/config/EtlJobConfigTest.java @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
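The JsonUtils wrapper above is the (de)serialization entry point for the rest of spark-load-core, e.g. when building FE request bodies. A short sketch with illustrative map contents; the generic type parameters are spelled out here as an assumption about the intended signatures.

package org.apache.doris.util;

import com.fasterxml.jackson.core.type.TypeReference;

import java.util.HashMap;
import java.util.Map;

public class JsonUtilsExample {

    public static void main(String[] args) throws Exception {
        Map<String, Object> params = new HashMap<>();
        params.put("label", "test_label");
        params.put("loadId", 10001L);

        // Object -> JSON string.
        String json = JsonUtils.writeValueAsString(params);

        // JSON string -> typed object; the shared mapper also accepts case-insensitive enum values.
        Map<String, Object> back = JsonUtils.readValue(json, new TypeReference<Map<String, Object>>() {});
        System.out.println(back.get("label"));
    }
}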
+ +package org.apache.doris.config; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +class EtlJobConfigTest { + + @Test + void getOutputPath() { + String outputPath = EtlJobConfig.getOutputPath("hdfs://127.0.0.1/spark-load", 10001L, "test", 123L); + Assertions.assertEquals("hdfs://127.0.0.1/spark-load/jobs/10001/test/123", outputPath); + } + + @Test + void getOutputFilePattern() { + String outputFilePattern = EtlJobConfig.getOutputFilePattern("test", EtlJobConfig.FilePatternVersion.V1); + Assertions.assertEquals("V1.test.%d.%d.%d.%d.%d.parquet", outputFilePattern); + } + + @Test + void configFromJson() { + List etlIndexes = new ArrayList<>(); + List etlColumns = new ArrayList<>(); + EtlJobConfig.EtlColumn etlColumn0 = new EtlJobConfig.EtlColumn("c0", "INT", false, true, "NONE", "0", 0, 0, 0); + EtlJobConfig.EtlColumn etlColumn1 = + new EtlJobConfig.EtlColumn("c1", "VARCHAR", true, false, "NONE", "\\N", 10, 0, 0); + etlColumns.add(etlColumn0); + etlColumns.add(etlColumn1); + EtlJobConfig.EtlIndex etlIndex = new EtlJobConfig.EtlIndex(1L, etlColumns, 123, "DUPLICATE", true, 0); + etlIndexes.add(etlIndex); + EtlJobConfig.EtlPartitionInfo etlPartitionInfo = + new EtlJobConfig.EtlPartitionInfo("UNPARTITIONED", Collections.emptyList(), + Collections.singletonList("c0"), Collections.singletonList( + new EtlJobConfig.EtlPartition(0, Collections.emptyList(), Collections.emptyList(), true, 0))); + EtlJobConfig.EtlTable table = new EtlJobConfig.EtlTable(etlIndexes, etlPartitionInfo); + Map tables = new HashMap<>(); + tables.put(123L, table); + String outputFilePattern = EtlJobConfig.getOutputFilePattern("test", EtlJobConfig.FilePatternVersion.V1); + EtlJobConfig.EtlJobProperty properties = new EtlJobConfig.EtlJobProperty(); + EtlJobConfig jobConfig = new EtlJobConfig(tables, outputFilePattern, "test", properties); + Assertions.assertEquals(jobConfig.configToJson(), + EtlJobConfig.configFromJson("{\"tables\":{\"123\":{\"indexes\":[{\"indexId\":1,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":false,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0}," + + "{\"columnName\":\"c1\",\"columnType\":\"VARCHAR\",\"isAllowNull\":true,\"isKey\":false," + + "\"aggregationType\":\"NONE\",\"defaultValue\":\"\\\\N\",\"stringLength\":10,\"precision\":0," + + "\"scale\":0}],\"schemaHash\":123,\"indexType\":\"DUPLICATE\",\"isBaseIndex\":true," + + "\"schemaVersion\":0}],\"partitionInfo\":{\"partitionType\":\"UNPARTITIONED\"," + + "\"partitionColumnRefs\":[],\"distributionColumnRefs\":[\"c0\"],\"partitions\":" + + "[{\"partitionId\":0,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true,\"bucketNum\":0}]}," + + "\"fileGroups\":[]}},\"outputFilePattern\":\"V1.test.%d.%d.%d.%d.%d.parquet\"," + + "\"label\":\"test\",\"properties\":{\"strictMode\":false},\"configVersion\":\"V1\"}").configToJson()); + } + + @Test + void configToJson() { + List etlIndexes = new ArrayList<>(); + List etlColumns = new ArrayList<>(); + EtlJobConfig.EtlColumn etlColumn0 = new EtlJobConfig.EtlColumn("c0", "INT", false, true, "NONE", "0", 0, 0, 0); + EtlJobConfig.EtlColumn etlColumn1 = + new EtlJobConfig.EtlColumn("c1", "VARCHAR", true, false, "NONE", "\\N", 10, 0, 0); + etlColumns.add(etlColumn0); + etlColumns.add(etlColumn1); + EtlJobConfig.EtlIndex etlIndex = new 
EtlJobConfig.EtlIndex(1L, etlColumns, 123, "DUPLICATE", true, 0); + etlIndexes.add(etlIndex); + EtlJobConfig.EtlPartitionInfo etlPartitionInfo = + new EtlJobConfig.EtlPartitionInfo("UNPARTITIONED", Collections.emptyList(), + Collections.singletonList("c0"), Collections.singletonList( + new EtlJobConfig.EtlPartition(0, Collections.emptyList(), Collections.emptyList(), true, 0))); + EtlJobConfig.EtlTable table = new EtlJobConfig.EtlTable(etlIndexes, etlPartitionInfo); + Map tables = new HashMap<>(); + tables.put(123L, table); + String outputFilePattern = EtlJobConfig.getOutputFilePattern("test", EtlJobConfig.FilePatternVersion.V1); + EtlJobConfig.EtlJobProperty properties = new EtlJobConfig.EtlJobProperty(); + EtlJobConfig jobConfig = new EtlJobConfig(tables, outputFilePattern, "test", properties); + Assertions.assertEquals( + "{\"tables\":{\"123\":{\"indexes\":[{\"indexId\":1,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":false,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0}," + + "{\"columnName\":\"c1\",\"columnType\":\"VARCHAR\",\"isAllowNull\":true,\"isKey\":false," + + "\"aggregationType\":\"NONE\",\"defaultValue\":\"\\\\N\",\"stringLength\":10,\"precision\":0," + + "\"scale\":0}],\"schemaHash\":123,\"indexType\":\"DUPLICATE\",\"isBaseIndex\":true," + + "\"schemaVersion\":0}],\"partitionInfo\":{\"partitionType\":\"UNPARTITIONED\"," + + "\"partitionColumnRefs\":[],\"distributionColumnRefs\":[\"c0\"],\"partitions\":" + + "[{\"partitionId\":0,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true,\"bucketNum\":0}]}," + + "\"fileGroups\":[]}},\"outputFilePattern\":\"V1.test.%d.%d.%d.%d.%d.parquet\"," + + "\"label\":\"test\",\"properties\":{\"strictMode\":false},\"configVersion\":\"V1\"}", + jobConfig.configToJson()); + } +} \ No newline at end of file diff --git a/spark-load/spark-load-core/pom.xml b/spark-load/spark-load-core/pom.xml new file mode 100644 index 00000000..fbe3edaf --- /dev/null +++ b/spark-load/spark-load-core/pom.xml @@ -0,0 +1,187 @@ + + + + 4.0.0 + + org.apache.doris + spark-load + ${revision} + + + spark-load-core + + + 8 + 8 + UTF-8 + 1 + -Xmx512m + + + + + org.apache.doris + spark-load-common + ${project.version} + + + com.fasterxml.jackson.core + jackson-databind + + + org.projectlombok + lombok + provided + + + commons-cli + commons-cli + + + com.google.guava + guava + + + org.apache.spark + spark-launcher_${scala.major.version} + + + org.apache.spark + spark-core_${scala.major.version} + + + org.apache.hadoop + hadoop-client + + + + + org.apache.spark + spark-catalyst_${scala.major.version} + + + org.apache.hadoop + hadoop-common + + + org.apache.hadoop + hadoop-client + + + org.apache.logging.log4j + log4j-core + + + + org.apache.logging.log4j + log4j-api + + + + org.apache.logging.log4j + log4j-slf4j-impl + + + + org.slf4j + slf4j-api + + + org.junit.jupiter + junit-jupiter-engine + test + + + org.jmockit + jmockit + test + + + + + + + + org.apache.httpcomponents + httpclient + + + org.apache.hadoop + hadoop-aws + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + servlet-api + javax.servlet + + + + com.amazonaws + aws-java-sdk-s3 + + + com.amazonaws + aws-java-sdk-bundle + + + + + com.amazonaws + aws-java-sdk-s3 + + + com.amazonaws + aws-java-sdk-glue + + + com.amazonaws + aws-java-sdk-dynamodb + + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + set larger, eg, 3, to reduce the time or running FE unit tests<--> + ${fe_ut_parallel} + 
not reuse forked jvm, so that each unit test will run in separate jvm. to avoid singleton confict<--> + false + + -javaagent:${settings.localRepository}/org/jmockit/jmockit/${jmockit.version}/jmockit-${jmockit.version}.jar @{argLine} + + + + + + + \ No newline at end of file diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java new file mode 100644 index 00000000..f7920879 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/SparkLoadRunner.java @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris; + +import org.apache.commons.lang3.exception.ExceptionUtils; +import org.apache.doris.common.CommandLineOptions; +import org.apache.doris.config.JobConfig; +import org.apache.doris.load.LoaderFactory; +import org.apache.doris.load.job.Loader; +import org.apache.doris.load.job.Recoverable; +import org.apache.doris.util.JsonUtils; + +import com.google.common.base.Preconditions; +import com.google.common.base.Strings; +import io.netty.util.internal.logging.InternalLoggerFactory; +import io.netty.util.internal.logging.Log4JLoggerFactory; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.lang3.StringUtils; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.File; +import java.io.IOException; + +public class SparkLoadRunner { + + private static final Logger LOG = LogManager.getLogger(SparkLoadRunner.class); + + public static final String SPARK_LOAD_HOME = System.getenv("SPARK_LOAD_HOME"); + + static { + InternalLoggerFactory.setDefaultFactory(Log4JLoggerFactory.INSTANCE); + } + + public static void main(String[] args) { + + if (StringUtils.isBlank(SPARK_LOAD_HOME)) { + System.err.println("env SPARK_LOAD_HOME is not set."); + System.exit(-1); + } + + CommandLineOptions cmdOptions = parseArgs(args); + if (Strings.isNullOrEmpty(cmdOptions.getConfigPath())) { + System.err.println("config path is empty"); + System.exit(-1); + } + + JobConfig jobConfig = readConfig(cmdOptions.getConfigPath()); + try { + checkConfig(jobConfig); + } catch (IllegalArgumentException e) { + System.err.println("check config failed, msg: " + ExceptionUtils.getStackTrace(e)); + System.exit(-1); + } + + Loader loader = LoaderFactory.createLoader(jobConfig, cmdOptions.getRecovery()); + Runtime.getRuntime().addShutdownHook(new Thread(() -> { + LOG.info("Shutting down..."); + loader.cancel(); + })); + try { + + loader.prepare(); + do { + if (loader instanceof 
Recoverable) { + if (((Recoverable) loader).canBeRecovered()) { + LOG.info("recovery check passed, start prepare recovery."); + ((Recoverable) loader).prepareRecover(); + break; + } + } + loader.execute(); + } while (false); + + loader.afterFinished(); + + } catch (Exception e) { + loader.afterFailed(e); + LOG.error("start load failed", e); + System.err.println("start load failed, exit."); + System.exit(-1); + } + + } + + private static CommandLineOptions parseArgs(String[] args) { + CommandLineParser parser = new DefaultParser(); + Options options = new Options(); + options.addOption("c", "config", true, "Spark load config file"); + options.addOption("r", "recovery", false, "Recovery mode"); + CommandLine cmd = null; + try { + cmd = parser.parse(options, args); + } catch (ParseException e) { + System.err.println("failed to parse argument, exit."); + System.exit(-1); + } + + String configPath = cmd.getOptionValue("config"); + boolean recovery = cmd.hasOption('r') || cmd.hasOption("recovery"); + return new CommandLineOptions(configPath, recovery); + + } + + private static JobConfig readConfig(String path) { + JobConfig jobConfig = null; + try { + jobConfig = JsonUtils.readValue(new File(path), JobConfig.class); + } catch (IOException e) { + LOG.error("failed to read config file", e); + System.err.println("failed to read config file, exit."); + System.exit(-1); + } + return jobConfig; + } + + private static void checkConfig(JobConfig jobConfig) { + jobConfig.checkFeAddress(); + Preconditions.checkArgument(StringUtils.isNoneBlank(jobConfig.getLabel()), "label is empty"); + Preconditions.checkArgument(StringUtils.isNoneBlank(jobConfig.getUser()), "user is empty"); + Preconditions.checkArgument(jobConfig.getPassword() != null, "password cannot be null"); + Preconditions.checkArgument(StringUtils.isNoneBlank(jobConfig.getDatabase()), "database is empty"); + Preconditions.checkArgument(StringUtils.isNoneBlank(jobConfig.getWorkingDir()), + "spark config item workingDir is empty"); + jobConfig.checkTaskInfo(); + jobConfig.checkSparkInfo(); + jobConfig.checkHadoopProperties(); + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java new file mode 100644 index 00000000..124fd0fe --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/client/DorisClient.java @@ -0,0 +1,248 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
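SparkLoadRunner above loads its job description via JsonUtils.readValue(file, JobConfig.class). A sketch of a minimal config expressed inline for readability: the field names follow the JobConfig/TaskInfo classes later in this patch, while the addresses, paths and the empty "spark" block are placeholders (a real job would still have to satisfy checkConfig()).

package org.apache.doris;

import org.apache.doris.config.JobConfig;
import org.apache.doris.util.JsonUtils;

public class JobConfigExample {

    public static void main(String[] args) throws Exception {
        String json = "{"
                + "\"feAddresses\":\"127.0.0.1:8030\","
                + "\"label\":\"test_label\","
                + "\"user\":\"root\","
                + "\"password\":\"\","
                + "\"database\":\"test_db\","
                + "\"workingDir\":\"hdfs://127.0.0.1/spark-load\","
                + "\"loadTasks\":{\"tbl1\":{\"type\":\"FILE\","
                + "\"paths\":[\"hdfs://127.0.0.1/data/part-*\"],"
                + "\"format\":\"csv\",\"columns\":\"c0,c1\"}},"
                + "\"spark\":{}"
                + "}";

        // Same deserialization path as SparkLoadRunner.readConfig(), only from a string
        // instead of a file.
        JobConfig jobConfig = JsonUtils.readValue(json, JobConfig.class);
        System.out.println(jobConfig.getLabel() + " -> " + jobConfig.getDatabase());
    }
}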
+ +package org.apache.doris.client; + +import org.apache.doris.common.Constants; +import org.apache.doris.common.LoadInfo; +import org.apache.doris.common.ResponseEntity; +import org.apache.doris.common.meta.LoadInfoResponse; +import org.apache.doris.common.meta.LoadMeta; +import org.apache.doris.exception.SparkLoadException; +import org.apache.doris.util.HttpUtils; +import org.apache.doris.util.JsonUtils; + +import org.apache.commons.lang3.StringUtils; +import org.apache.http.HttpHeaders; +import org.apache.http.HttpStatus; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.client.methods.HttpRequestBase; +import org.apache.http.client.utils.URIBuilder; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.client.CloseableHttpClient; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Base64; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class DorisClient { + + private static volatile FeClient FE; + private static BeClient BE; + + public static FeClient getFeClient(String feAddresses, String user, String password) { + if (FE == null) { + synchronized (FeClient.class) { + if (FE == null) { + FE = new FeClient(feAddresses, user, password); + } + } + } + return FE; + } + + public static class FeClient { + + public static final String BASE_URL = "http://%s%s"; + + public static final String INGESTION_LOAD_URL_PATTERN = "/api/ingestion_load/%s/%s/%s"; + + public static final String CREATE_ACTION = "_create"; + + public static final String UPDATE_ACTION = "_update"; + + public static final String GET_LOAD_INFO = "/api/%s/_load_info"; + + public static final String GET_DDL = "/api/_get_ddl"; + + private final List feNodes; + + private final String auth; + + public FeClient(String feAddresses, String user, String password) { + this.feNodes = parseFeNodes(feAddresses); + this.auth = parseAuth(user, password); + } + + private List parseFeNodes(String feAddresses) { + if (StringUtils.isBlank(feAddresses)) { + throw new IllegalArgumentException("feAddresses is empty"); + } + String[] feArr = feAddresses.split(","); + if (Arrays.stream(feArr).map(x -> x.split(":")) + .anyMatch(x -> x.length != 2 || x[0].isEmpty() || x[1].isEmpty())) { + throw new IllegalArgumentException("feAddresses contains invalid format, " + feAddresses); + } + return Arrays.stream(feArr).collect(Collectors.toList()); + } + + private String parseAuth(String user, String password) { + return Base64.getEncoder().encodeToString((user + ":" + password).getBytes(StandardCharsets.UTF_8)); + } + + public LoadMeta createIngestionLoad(String db, Map> tableToPartition, String label, + Map properties) throws SparkLoadException { + try { + String path = String.format(INGESTION_LOAD_URL_PATTERN, Constants.DEFAULT_CATALOG, db, CREATE_ACTION); + HttpPost httpPost = new HttpPost(); + addCommonHeaders(httpPost); + Map params = new HashMap<>(); + params.put("label", label); + params.put("tableToPartition", tableToPartition); + params.put("properties", properties); + httpPost.setEntity(new StringEntity(JsonUtils.writeValueAsString(params))); + String content = executeRequest(httpPost, path, null); + if (StringUtils.isBlank(content)) { + throw new SparkLoadException(String.format("request create load 
failed, path: %s", path)); + } + ResponseEntity res = JsonUtils.readValue(content, ResponseEntity.class); + if (res.getCode() != 0) { + throw new SparkLoadException(String.format("create load failed, code: %d, msg: %s, reason: %s", + res.getCode(), res.getMsg(), res.getData().isNull() ? null : res.getData().asText())); + } + return JsonUtils.readValue(res.getData().traverse(), LoadMeta.class); + } catch (IOException | URISyntaxException e) { + throw new SparkLoadException("create spark load failed", e); + } + } + + private void addCommonHeaders(HttpRequestBase req) { + req.setHeader(HttpHeaders.AUTHORIZATION, "Basic " + auth); + } + + private String executeRequest(HttpRequestBase req, String apiPath, Map params) + throws IOException, URISyntaxException { + IOException ex = null; + try (CloseableHttpClient client = HttpUtils.getClient()) { + for (String feNode : feNodes) { + String url = String.format(BASE_URL, feNode, apiPath); + URIBuilder uriBuilder = new URIBuilder(URI.create(url)); + if (params != null && !params.isEmpty()) { + params.forEach(uriBuilder::addParameter); + } + req.setURI(uriBuilder.build()); + addCommonHeaders(req); + CloseableHttpResponse res; + try { + res = client.execute(req); + } catch (IOException e) { + ex = e; + continue; + } + if (res.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { + continue; + } + return HttpUtils.getEntityContent(res.getEntity()); + } + } + if (ex != null) { + throw ex; + } + return null; + } + + public void updateIngestionLoad(String db, Long loadId, Map statusInfo) + throws SparkLoadException { + + String path = String.format(INGESTION_LOAD_URL_PATTERN, Constants.DEFAULT_CATALOG, db, UPDATE_ACTION); + HttpPost httpPost = new HttpPost(); + addCommonHeaders(httpPost); + Map params = new HashMap<>(); + params.put("loadId", loadId); + params.put("statusInfo", statusInfo); + try { + httpPost.setEntity(new StringEntity(JsonUtils.writeValueAsString(params))); + String content = executeRequest(httpPost, path, null); + if (StringUtils.isBlank(content)) { + throw new SparkLoadException(String.format("request update load failed, path: %s", path)); + } + ResponseEntity res = JsonUtils.readValue(content, ResponseEntity.class); + if (res.getCode() != 0) { + throw new SparkLoadException(String.format("update load failed, code: %d, msg: %s, reason: %s", + res.getCode(), res.getMsg(), res.getData().isNull() ? 
null : res.getData().asText())); + } + } catch (IOException | URISyntaxException e) { + throw new SparkLoadException("update load failed", e); + } + + } + + public LoadInfo getLoadInfo(String db, String label) throws SparkLoadException { + + String path = String.format(GET_LOAD_INFO, db); + HttpGet httpGet = new HttpGet(); + addCommonHeaders(httpGet); + try { + Map params = new HashMap<>(); + params.put("label", label); + String content = executeRequest(httpGet, path, params); + if (StringUtils.isBlank(content)) { + throw new SparkLoadException(String.format("request get load info failed, path: %s", path)); + } + LoadInfoResponse res = JsonUtils.readValue(content, LoadInfoResponse.class); + if (!"ok".equalsIgnoreCase(res.getStatus())) { + throw new SparkLoadException(String.format("get load info failed, status: %s, msg: %s, jobInfo: %s", + res.getStatus(), res.getMsg(), JsonUtils.writeValueAsString(res.getJobInfo()))); + } + return res.getJobInfo(); + } catch (IOException | URISyntaxException e) { + throw new SparkLoadException("get load info failed", e); + } + + } + + public String getDDL(String db, String table) throws SparkLoadException { + + HttpGet httpGet = new HttpGet(); + addCommonHeaders(httpGet); + try { + Map params = new HashMap<>(); + params.put("db", db); + params.put("table", table); + String content = executeRequest(httpGet, GET_DDL, params); + if (StringUtils.isBlank(content)) { + throw new SparkLoadException(String.format("request get ddl failed, path: %s", GET_DDL)); + } + ResponseEntity res = JsonUtils.readValue(content, ResponseEntity.class); + if (res.getCode() != 0 || !res.getData().has("create_table") + || res.getData().get("create_table").isEmpty()) { + throw new SparkLoadException(String.format("get ddl failed, status: %s, msg: %s, data: %s", + res.getCode(), res.getMsg(), JsonUtils.writeValueAsString(res.getData()))); + } + return res.getData().get("create_table").get(0).asText(); + } catch (IOException | URISyntaxException e) { + throw new SparkLoadException("get ddl failed", e); + } + + } + + } + + private static class BeClient { + + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/CommandLineOptions.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/CommandLineOptions.java new file mode 100644 index 00000000..8c66abcb --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/CommandLineOptions.java @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
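A usage sketch for the FeClient above. Host, credentials and table names are placeholders, the generic map types are assumptions about the intended signatures, and the call performs real HTTP against the FE; LoadMeta is defined later in this patch.

package org.apache.doris.client;

import org.apache.doris.common.meta.LoadMeta;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class FeClientExample {

    public static void main(String[] args) throws Exception {
        // FE addresses are comma-separated "host:httpPort" pairs, as enforced by parseFeNodes().
        DorisClient.FeClient fe = DorisClient.getFeClient("127.0.0.1:8030", "root", "");

        // Ask the FE to create an ingestion (spark) load covering one table; an empty
        // partition list means all partitions.
        Map<String, List<String>> tableToPartition = new HashMap<>();
        tableToPartition.put("tbl1", Collections.emptyList());

        LoadMeta meta = fe.createIngestionLoad("test_db", tableToPartition, "test_label",
                new HashMap<String, String>());
        System.out.println("loadId=" + meta.getLoadId() + ", txnId=" + meta.getTxnId());
    }
}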
+ +package org.apache.doris.common; + +import lombok.Getter; + +@Getter +public class CommandLineOptions { + + private final String configPath; + + private final Boolean recovery; + + public CommandLineOptions(String configPath, Boolean recovery) { + this.configPath = configPath; + this.recovery = recovery; + } +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java new file mode 100644 index 00000000..a3e4803e --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/Constants.java @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.common; + +public interface Constants { + + String HIVE_METASTORE_URIS = "hive.metastore.uris"; + String SPARK_STANDALONE_SCHEME = "spark"; + String HADOOP_AUTH_KERBEROS = "kerberos"; + String HADOOP_SECURITY_AUTHENTICATION = "hadoop.security.authentication"; + String HADOOP_KERBEROS_PRINCIPAL = "hadoop.kerberos.principal"; + String HADOOP_KERBEROS_KEYTAB = "hadoop.kerberos.keytab"; + + String DEFAULT_CATALOG = "internal"; + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadInfo.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadInfo.java new file mode 100644 index 00000000..1c7e904c --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/LoadInfo.java @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
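The Kerberos-related keys in Constants above are what a secured job would put into JobConfig.hadoopProperties. A small sketch: the principal and keytab values are placeholders, and the assumption that checkHadoopProperties() validates exactly these keys comes from the constant names, since that method is outside this excerpt.

package org.apache.doris.common;

import java.util.HashMap;
import java.util.Map;

public class KerberosPropertiesExample {

    public static void main(String[] args) {
        Map<String, String> hadoopProperties = new HashMap<>();
        hadoopProperties.put(Constants.HADOOP_SECURITY_AUTHENTICATION, Constants.HADOOP_AUTH_KERBEROS);
        hadoopProperties.put(Constants.HADOOP_KERBEROS_PRINCIPAL, "doris/_HOST@EXAMPLE.COM");
        hadoopProperties.put(Constants.HADOOP_KERBEROS_KEYTAB, "/etc/doris/doris.keytab");

        hadoopProperties.forEach((k, v) -> System.out.println(k + "=" + v));
    }
}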
+ +package org.apache.doris.common; + +import lombok.Data; + +import java.util.List; + +@Data +public class LoadInfo { + + private String dbName; + private List tblNames; + private String label; + private String clusterName; + private String state; + private String failMsg; + private String trackingUrl; + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/ResponseEntity.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/ResponseEntity.java new file mode 100644 index 00000000..a5a3f149 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/ResponseEntity.java @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.common; + +import com.fasterxml.jackson.databind.JsonNode; +import lombok.Data; + +@Data +public class ResponseEntity { + + private Integer code; + private String msg; + private JsonNode data; + private Integer count; + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/JobStatus.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/JobStatus.java new file mode 100644 index 00000000..6493b36b --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/JobStatus.java @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.common.enums; + +public enum JobStatus { + + RUNNING, + FAILED, + SUCCESS + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/LoadMode.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/LoadMode.java new file mode 100644 index 00000000..d86b0738 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/LoadMode.java @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.common.enums; + +public enum LoadMode { + PUSH, PULL; +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/TaskType.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/TaskType.java new file mode 100644 index 00000000..e6ebf9e0 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/enums/TaskType.java @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.common.enums; + +public enum TaskType { + + HIVE, + FILE + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadInfoResponse.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadInfoResponse.java new file mode 100644 index 00000000..60f28e9f --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadInfoResponse.java @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.common.meta; + +import org.apache.doris.common.LoadInfo; + +import lombok.Data; + +@Data +public class LoadInfoResponse { + + private String status; + private String msg; + private LoadInfo jobInfo; + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java new file mode 100644 index 00000000..6009f092 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/LoadMeta.java @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.common.meta; + +import org.apache.doris.common.Constants; +import org.apache.doris.config.EtlJobConfig; +import org.apache.doris.config.JobConfig; +import org.apache.doris.exception.SparkLoadException; + +import com.google.common.annotations.VisibleForTesting; +import lombok.Data; +import org.apache.commons.lang3.StringUtils; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +@Data +public class LoadMeta { + + private Long loadId; + private Long txnId; + private Long dbId; + private Long signature; + private Map tableMeta; + + public EtlJobConfig getEtlJobConfig(JobConfig jobConfig) throws SparkLoadException { + Map tables = new HashMap<>(); + for (Map.Entry entry : getTableMeta().entrySet()) { + String name = entry.getKey(); + TableMeta meta = entry.getValue(); + EtlJobConfig.EtlTable etlTable = new EtlJobConfig.EtlTable(meta.getIndexes().stream().map( + TableMeta.EtlIndex::toEtlIndex).collect(Collectors.toList()), + meta.getPartitionInfo().toEtlPartitionInfo()); + JobConfig.TaskInfo taskInfo = jobConfig.getLoadTasks().get(name); + EtlJobConfig.EtlFileGroup fileGroup; + Map columnMappingMap = taskInfo.toEtlColumnMappingMap(); + checkMapping(etlTable, columnMappingMap); + List partitionIds = meta.getPartitionInfo().partitions.stream() + .map(p -> p.partitionId).collect(Collectors.toList()); + switch (taskInfo.getType()) { + case HIVE: + Map properties = new HashMap<>(jobConfig.getHadoopProperties()); + properties.put(Constants.HIVE_METASTORE_URIS, taskInfo.getHiveMetastoreUris()); + fileGroup = + new EtlJobConfig.EtlFileGroup(EtlJobConfig.SourceType.HIVE, taskInfo.getHiveFullTableName(), + properties, false, columnMappingMap, taskInfo.getWhere(), + partitionIds); + break; + case FILE: + List columnList = Collections.emptyList(); + if (StringUtils.isNoneBlank(taskInfo.getColumns())) { + columnList = Arrays.stream(taskInfo.getColumns().split(",")).collect(Collectors.toList()); + } + 
List columnFromPathList = Collections.emptyList(); + if (StringUtils.isNoneBlank(taskInfo.getColumnFromPath())) { + columnFromPathList = + Arrays.stream(taskInfo.getColumnFromPath().split(",")).collect(Collectors.toList()); + } + fileGroup = + new EtlJobConfig.EtlFileGroup(EtlJobConfig.SourceType.FILE, taskInfo.getPaths(), columnList, + columnFromPathList, taskInfo.getFieldSep(), taskInfo.getLineDelim(), false, + taskInfo.getFormat(), columnMappingMap, taskInfo.getWhere(), partitionIds); + break; + default: + throw new IllegalArgumentException("Unsupported task type: " + taskInfo.getType()); + } + etlTable.addFileGroup(fileGroup); + tables.put(meta.getId(), etlTable); + } + String outputFilePattern = EtlJobConfig.getOutputFilePattern(jobConfig.getLabel(), + EtlJobConfig.FilePatternVersion.V1); + String label = jobConfig.getLabel(); + EtlJobConfig.EtlJobProperty properties = new EtlJobConfig.EtlJobProperty(); + EtlJobConfig etlJobConfig = new EtlJobConfig(tables, outputFilePattern, label, properties); + etlJobConfig.outputPath = + EtlJobConfig.getOutputPath(jobConfig.getWorkingDir(), getDbId(), label, + getSignature()); + return etlJobConfig; + } + + @VisibleForTesting + public void checkMapping(EtlJobConfig.EtlTable etlTable, + Map columnMappingMap) throws SparkLoadException { + Optional baseIdx = etlTable.indexes.stream().filter(idx -> idx.isBaseIndex).findFirst(); + if (baseIdx.isPresent()) { + EtlJobConfig.EtlIndex etlIndex = baseIdx.get(); + for (EtlJobConfig.EtlColumn column : etlIndex.columns) { + if ("HLL".equalsIgnoreCase(column.columnType)) { + EtlJobConfig.EtlColumnMapping mapping = columnMappingMap.get(column.columnName); + checkHllMapping(column.columnName, mapping); + } + if ("BITMAP".equalsIgnoreCase(column.columnType)) { + EtlJobConfig.EtlColumnMapping mapping = columnMappingMap.get(column.columnName); + checkBitmapMapping(column.columnName, mapping); + } + } + } + } + + private void checkHllMapping(String columnName, EtlJobConfig.EtlColumnMapping mapping) throws SparkLoadException { + if (mapping == null) { + throw new SparkLoadException(""); + } + Pattern pattern = Pattern.compile("(\\w+)\\(.*\\)"); + Matcher matcher = pattern.matcher(mapping.expr); + if (matcher.find()) { + if ("hll_hash".equalsIgnoreCase(matcher.group(1)) + || "hll_empty".equalsIgnoreCase(matcher.group(1))) { + return; + } + throw new SparkLoadException("HLL column must use hll function, like " + columnName + "=hll_hash(xxx) or " + + columnName + "=hll_empty()"); + } + } + + private void checkBitmapMapping(String columnName, EtlJobConfig.EtlColumnMapping mapping) + throws SparkLoadException { + if (mapping == null) { + throw new SparkLoadException(""); + } + Pattern pattern = Pattern.compile("(\\w+)\\(.*\\)"); + Matcher matcher = pattern.matcher(mapping.expr); + if (matcher.find()) { + if ("to_bitmap".equalsIgnoreCase(matcher.group(1)) || "bitmap_hash".equalsIgnoreCase(matcher.group(1)) + || "bitmap_dict".equalsIgnoreCase(matcher.group(1)) + || "binary_bitmap".equalsIgnoreCase(matcher.group(1))) { + return; + } + throw new SparkLoadException( + "BITMAP column must use bitmap function, like " + columnName + "=to_bitmap(xxx) or " + + columnName + "=bitmap_hash() or " + columnName + "=bitmap_dict() or " + + columnName + "=binary_bitmap()"); + } + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java new file mode 100644 index 00000000..3e97b97a --- /dev/null +++ 
b/spark-load/spark-load-core/src/main/java/org/apache/doris/common/meta/TableMeta.java @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.common.meta; + + +import org.apache.doris.config.EtlJobConfig; + +import lombok.Data; + +import java.io.Serializable; +import java.util.List; +import java.util.stream.Collectors; + +@Data +public class TableMeta { + + private Long id; + private List<EtlIndex> indexes; + private EtlPartitionInfo partitionInfo; + + public static class EtlIndex implements Serializable { + public long indexId; + public List<EtlJobConfig.EtlColumn> columns; + public int schemaHash; + public String indexType; + public boolean isBaseIndex; + public int schemaVersion; + + public EtlIndex() { + + } + + public EtlJobConfig.EtlIndex toEtlIndex() { + return new EtlJobConfig.EtlIndex(indexId, columns, schemaHash, indexType, isBaseIndex, schemaVersion); + } + + } + + public static class EtlPartitionInfo implements Serializable { + public String partitionType; + public List<String> partitionColumnRefs; + public List<String> distributionColumnRefs; + public List<EtlPartition> partitions; + + public EtlPartitionInfo() { + } + + public EtlJobConfig.EtlPartitionInfo toEtlPartitionInfo() { + return new EtlJobConfig.EtlPartitionInfo(partitionType, partitionColumnRefs, distributionColumnRefs, + partitions.stream().map(EtlPartition::toEtlPartition).collect(Collectors.toList())); + } + + } + + public static class EtlPartition implements Serializable { + public long partitionId; + public List<Object> startKeys; + public List<Object> endKeys; + public boolean isMaxPartition; + public int bucketNum; + + public EtlPartition() { + } + + public EtlJobConfig.EtlPartition toEtlPartition() { + return new EtlJobConfig.EtlPartition(partitionId, startKeys, endKeys, isMaxPartition, bucketNum); + } + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java new file mode 100644 index 00000000..fb2f5ccb --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/config/JobConfig.java @@ -0,0 +1,267 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.config; + +import org.apache.doris.SparkLoadRunner; +import org.apache.doris.client.DorisClient; +import org.apache.doris.common.Constants; +import org.apache.doris.common.enums.LoadMode; +import org.apache.doris.common.enums.TaskType; +import org.apache.doris.exception.SparkLoadException; + +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.Preconditions; +import lombok.Data; +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.StringUtils; + +import java.io.File; +import java.net.URI; +import java.sql.DriverManager; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +@Data +public class JobConfig { + + @JsonProperty(required = true) + private String feAddresses; + + @JsonProperty(required = true) + private String label; + + @JsonProperty(required = true) + private String user; + + @JsonProperty(required = true) + private String password; + + @JsonProperty(required = true) + private String database; + + @JsonProperty(required = true) + private String workingDir; + + @JsonProperty(required = true) + private Map<String, TaskInfo> loadTasks; + + @JsonProperty(required = true) + private SparkInfo spark; + + private LoadMode loadMode = LoadMode.PULL; + + private Map<String, String> hadoopProperties = Collections.emptyMap(); + + private Map<String, String> jobProperties = Collections.emptyMap(); + + private Map<String, String> env = Collections.emptyMap(); + + @Data + public static class TaskInfo { + + private TaskType type; + + private String hiveMetastoreUris; + + private String hiveDatabase; + + private String hiveTable; + + private List<String> paths; + + private String format; + + private String columns; + + private String columnFromPath; + + private String fieldSep = "\t"; + + private String lineDelim = "\n"; + + private List<String> columnMappings = Collections.emptyList(); + + private String where; + + private List<String> targetPartitions = Collections.emptyList(); + + public String getHiveFullTableName() { + return hiveDatabase + "." 
+ hiveTable; + } + + public Map<String, EtlJobConfig.EtlColumnMapping> toEtlColumnMappingMap() { + Map<String, EtlJobConfig.EtlColumnMapping> map = new HashMap<>(); + for (String columnMapping : columnMappings) { + String[] arr = columnMapping.split("="); + map.put(arr[0], new EtlJobConfig.EtlColumnMapping(arr[1])); + } + return map; + } + + } + + @Data + public static class SparkInfo { + + private static final String DEFAULT_DEPLOY_MODE = "client"; + + private static final String DEFAULT_DPP_JAR_PATH = + SparkLoadRunner.SPARK_LOAD_HOME + "/app/spark-load-dpp-1.0-SNAPSHOT.jar"; + + private String sparkHome; + + private String master; + + private String deployMode = DEFAULT_DEPLOY_MODE; + + private Integer numExecutors; + + private Integer executorCores; + + private String executorMemory; + + private String driverMemory; + + private String dppJarPath = DEFAULT_DPP_JAR_PATH; + + private Map<String, String> properties = Collections.emptyMap(); + + } + + public void checkFeAddress() { + Preconditions.checkArgument(StringUtils.isNoneBlank(getFeAddresses()), "feAddresses is empty"); + String[] feAddressArr = getFeAddresses().split(","); + if (feAddressArr.length == 0) { + throw new IllegalArgumentException("feAddresses format is incorrect"); + } + for (String feAddress : feAddressArr) { + String[] arr = feAddress.split(":"); + if (arr.length != 2) { + throw new IllegalArgumentException("feAddresses format is incorrect"); + } + } + } + + public void checkTaskInfo() { + Map<String, TaskInfo> tasks = getLoadTasks(); + Preconditions.checkArgument(!tasks.isEmpty(), "loadTasks is empty"); + for (Map.Entry<String, TaskInfo> entry : tasks.entrySet()) { + String table = entry.getKey(); + try { + DorisClient.FeClient feClient = DorisClient.getFeClient(feAddresses, user, password); + String ddl = feClient.getDDL(database, table); + if (StringUtils.isNoneBlank(ddl) && ddl.contains("\"enable_unique_key_merge_on_write\" = \"true\"")) { + throw new IllegalArgumentException("Merge On Write is not supported"); + } + } catch (SparkLoadException e) { + throw new IllegalArgumentException("check table failed", e); + } + TaskInfo taskInfo = entry.getValue(); + switch (taskInfo.getType()) { + case HIVE: + Preconditions.checkArgument(StringUtils.isNoneBlank(taskInfo.getHiveDatabase()), + "hive database is empty"); + Preconditions.checkArgument(StringUtils.isNoneBlank(taskInfo.getHiveTable()), + "hive table is empty"); + break; + case FILE: + Preconditions.checkArgument(taskInfo.getPaths() != null && !taskInfo.getPaths().isEmpty(), + "file path is empty"); + Preconditions.checkArgument( + StringUtils.equalsAnyIgnoreCase(taskInfo.getFormat(), "parquet", "orc", "csv"), + "format only supports parquet, orc or csv"); + if ("csv".equalsIgnoreCase(taskInfo.getFormat())) { + Preconditions.checkArgument(StringUtils.isNoneEmpty(taskInfo.getFieldSep()), + "field separator is empty"); + } + break; + default: + throw new IllegalArgumentException("task type only supports hive or file"); + } + } + } + + public void checkSparkInfo() { + SparkInfo sparkInfo = getSpark(); + Preconditions.checkArgument(StringUtils.isNoneBlank(sparkInfo.getSparkHome()), + "spark config item sparkHome is empty"); + Preconditions.checkArgument(checkSparkMaster(sparkInfo.getMaster()), + "spark master only supports yarn or standalone or local"); + Preconditions.checkArgument( + StringUtils.equalsAnyIgnoreCase(sparkInfo.getDeployMode(), "cluster", "client"), + "spark deployMode only supports cluster or client"); + if (!"yarn".equalsIgnoreCase(sparkInfo.getMaster())) { + Preconditions.checkArgument("client".equalsIgnoreCase(sparkInfo.getDeployMode()), + "standalone and local master only 
supports client mode"); + } + if (LoadMode.PULL == getLoadMode()) { + if (StringUtils.isBlank(sparkInfo.getDppJarPath())) { + throw new IllegalArgumentException("dpp jar file path is empty"); + } + if (!new File(sparkInfo.getDppJarPath()).exists()) { + throw new IllegalArgumentException("dpp jar file is not exists, path: " + getSpark().getDppJarPath()); + } + } + } + + private boolean checkSparkMaster(String master) { + if (StringUtils.isBlank(master)) { + return false; + } + if ("yarn".equalsIgnoreCase(master) || master.startsWith("local")) { + return true; + } + URI uri = URI.create(master); + return Constants.SPARK_STANDALONE_SCHEME.equalsIgnoreCase(uri.getScheme()) + && StringUtils.isNoneBlank(uri.getHost()) && uri.getPort() != -1; + } + + public void checkHadoopProperties() { + if (hadoopProperties == null || hadoopProperties.isEmpty()) { + return; + } + if (!hadoopProperties.containsKey("fs.defaultFS")) { + throw new IllegalArgumentException("fs.defaultFS is empty"); + } + // check auth + if (hadoopProperties.containsKey("hadoop.security.authentication") + && StringUtils.equalsIgnoreCase(hadoopProperties.get("hadoop.security.authentication"), "kerberos")) { + if (hadoopProperties.containsKey("hadoop.kerberos.principal")) { + if (StringUtils.isBlank(hadoopProperties.get("hadoop.kerberos.principal"))) { + throw new IllegalArgumentException("hadoop kerberos principal is empty"); + } + if (hadoopProperties.containsKey("hadoop.kerberos.keytab")) { + if (!FileUtils.getFile(hadoopProperties.get("hadoop.kerberos.keytab")).exists()) { + throw new IllegalArgumentException("hadoop kerberos keytab file is not exists, path: " + + hadoopProperties.get("hadoop.kerberos.keytab")); + } + return; + } + throw new IllegalArgumentException("hadoop.kerberos.keytab is not set"); + } + throw new IllegalArgumentException("hadoop.kerberos.principal is not set"); + } else { + if (!hadoopProperties.containsKey("hadoop.username")) { + throw new IllegalArgumentException("hadoop username is empty"); + } + } + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/exception/SparkLoadException.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/exception/SparkLoadException.java new file mode 100644 index 00000000..d25aca87 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/exception/SparkLoadException.java @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.exception; + +public class SparkLoadException extends Exception { + + public SparkLoadException(String message) { + super(message); + } + + public SparkLoadException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoaderFactory.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoaderFactory.java new file mode 100644 index 00000000..0b0fc786 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/LoaderFactory.java @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load; + +import org.apache.doris.config.JobConfig; +import org.apache.doris.load.job.Loader; +import org.apache.doris.load.job.PullLoader; + +public class LoaderFactory { + + public static Loader createLoader(JobConfig jobConfig, Boolean isRecoveryMode) { + switch (jobConfig.getLoadMode()) { + case PULL: + return new PullLoader(jobConfig, isRecoveryMode); + case PUSH: + default: + throw new UnsupportedOperationException(); + } + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java new file mode 100644 index 00000000..d80caab0 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Loader.java @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
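+// Loader is the base class for load jobs: it builds a SparkLauncher from the job config, submits the DPP application, polls the Spark application state until it is final, and maps that state to a JobStatus.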
+ +package org.apache.doris.load.job; + +import org.apache.doris.common.enums.JobStatus; +import org.apache.doris.config.JobConfig; +import org.apache.doris.exception.SparkLoadException; + +import lombok.Getter; +import org.apache.spark.launcher.SparkAppHandle; +import org.apache.spark.launcher.SparkLauncher; + +import java.io.File; +import java.io.IOException; +import java.time.Duration; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.locks.LockSupport; + +public abstract class Loader { + + private static final String SPARK_HADOOP_PREFIX = "spark.hadoop."; + + protected JobConfig jobConfig; + + protected boolean isRecoveryMode = false; + + @Getter + protected SparkAppHandle appHandle; + + @Getter + protected JobStatus jobStatus = JobStatus.RUNNING; + + protected final Map statusInfo = new HashMap<>(); + + public abstract void prepare() throws SparkLoadException; + + public void execute() throws SparkLoadException { + try { + appHandle = submitSparkJob(getMainClass(), getAppArgs(), getLogPath()); + } catch (IOException e) { + throw new SparkLoadException("submit spark job failed", e); + } + do { + if (appHandle.getState().isFinal()) { + if (SparkAppHandle.State.FAILED == appHandle.getState() + || SparkAppHandle.State.KILLED == appHandle.getState()) { + statusInfo.put("msg", + String.format("spark job run failed, appId: %s, state: %s", appHandle.getAppId(), + appHandle.getState())); + jobStatus = JobStatus.FAILED; + } else { + jobStatus = JobStatus.SUCCESS; + } + break; + } + statusInfo.put("appId", appHandle.getAppId()); + LockSupport.parkNanos(Duration.ofSeconds(5).toNanos()); + } while (true); + } + + private SparkAppHandle submitSparkJob(String mainClass, String[] appArgs, String logPath) throws IOException { + File logFile = new File(logPath); + if (!logFile.getParentFile().exists()) { + logFile.getParentFile().mkdir(); + } + JobConfig.SparkInfo sparkInfo = jobConfig.getSpark(); + SparkLauncher launcher = new SparkLauncher(jobConfig.getEnv()) + .setMaster(sparkInfo.getMaster()) + .setDeployMode(sparkInfo.getDeployMode()) + .setAppName("spark-load-" + jobConfig.getLabel()) + .setAppResource(sparkInfo.getDppJarPath()) + .setSparkHome(sparkInfo.getSparkHome()) + .setMainClass(mainClass) + .addAppArgs(appArgs) + .redirectError(logFile); + sparkInfo.getProperties().forEach(launcher::setConf); + jobConfig.getHadoopProperties().forEach((k, v) -> launcher.setConf(SPARK_HADOOP_PREFIX + k, v)); + return launcher.startApplication(); + } + + public void cancel() { + if (jobStatus == JobStatus.RUNNING) { + if (appHandle != null) { + try { + appHandle.stop(); + } catch (Exception e) { + appHandle.kill(); + } + } + } + jobStatus = JobStatus.FAILED; + afterFailed(new SparkLoadException("load client cancelled.")); + } + + protected abstract String getMainClass(); + + protected abstract String[] getAppArgs(); + + protected abstract String getLogPath(); + + public abstract void afterFinished() throws SparkLoadException; + + public abstract void afterFailed(Exception e); + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java new file mode 100644 index 00000000..80491bf3 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/PullLoader.java @@ -0,0 +1,370 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.job; + +import org.apache.doris.SparkLoadRunner; +import org.apache.doris.client.DorisClient; +import org.apache.doris.common.Constants; +import org.apache.doris.common.DppResult; +import org.apache.doris.common.LoadInfo; +import org.apache.doris.common.enums.JobStatus; +import org.apache.doris.common.meta.LoadMeta; +import org.apache.doris.common.meta.TableMeta; +import org.apache.doris.config.EtlJobConfig; +import org.apache.doris.config.JobConfig; +import org.apache.doris.exception.SparkLoadException; +import org.apache.doris.util.DateUtils; +import org.apache.doris.util.FileSystemUtils; +import org.apache.doris.util.JsonUtils; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.fs.FileStatus; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.File; +import java.io.IOException; +import java.time.Duration; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.locks.LockSupport; +import java.util.stream.Collectors; + +public class PullLoader extends Loader implements Recoverable { + + private static final Logger LOG = LogManager.getLogger(PullLoader.class); + + private static final String LOAD_META_JSON = "load_meta.json"; + + private static final String DPP_RESULT_JSON = "dpp_result.json"; + + private static final String SPARK_ETL_JOB_CLASS = "org.apache.doris.load.loadv2.etl.SparkEtlJob"; + + private LoadMeta loadMeta; + + private EtlJobConfig etlJobConfig; + + public PullLoader(JobConfig jobConfig, Boolean isRecoveryMode) { + this.jobConfig = jobConfig; + this.isRecoveryMode = isRecoveryMode; + } + + @Override + public void prepare() throws SparkLoadException { + DorisClient.FeClient feClient = DorisClient.getFeClient(jobConfig.getFeAddresses(), jobConfig.getUser(), + jobConfig.getPassword()); + Map<String, List<String>> tableToPartition = jobConfig.getLoadTasks().entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().getTargetPartitions())); + loadMeta = feClient.createIngestionLoad(jobConfig.getDatabase(), tableToPartition, jobConfig.getLabel(), + jobConfig.getJobProperties()); + etlJobConfig = loadMeta.getEtlJobConfig(jobConfig); + if (Constants.HADOOP_AUTH_KERBEROS.equalsIgnoreCase( + jobConfig.getHadoopProperties().get(Constants.HADOOP_SECURITY_AUTHENTICATION))) { + try { + FileSystemUtils.kerberosLogin(jobConfig); + } catch (IOException e) { + throw new SparkLoadException("login with kerberos auth failed", e); + } + } + } + + @Override + public void execute() throws SparkLoadException { + + try { + cleanOutputPath(); + } catch (IOException e) { + throw new SparkLoadException("clean output path failed", e); + } + uploadMetaInfo(loadMeta, etlJobConfig.getOutputPath()); + + String 
etlJobConfPath = etlJobConfig.outputPath + "/configs/jobconfig.json"; + try { + FileSystemUtils.createFile(jobConfig, etlJobConfig.configToJson(), etlJobConfPath, true); + } catch (IOException e) { + throw new SparkLoadException("create job config file failed", e); + } + + JobConfig.SparkInfo spark = jobConfig.getSpark(); + + LOG.info("submit spark job on master: " + spark.getMaster() + ", deployMode: " + spark.getDeployMode()); + + super.execute(); + + if (jobStatus == JobStatus.FAILED) { + throw new SparkLoadException("spark job run failed, msg: " + statusInfo.get("msg")); + } + LOG.info("spark job run finished."); + + } + + @Override + public void afterFinished() throws SparkLoadException { + DorisClient.FeClient feClient = DorisClient.getFeClient(jobConfig.getFeAddresses(), jobConfig.getUser(), + jobConfig.getPassword()); + statusInfo.put("status", jobStatus.name()); + statusInfo.put("msg", ""); + statusInfo.put("appId", appHandle == null ? null : appHandle.getAppId()); + try { + String dppResultStr = null; + int checkCnt = 0; + while (checkCnt < 3) { + try { + dppResultStr = getDppResultString(); + } catch (UnsupportedOperationException e) { + LOG.warn("retry get dpp result", e); + checkCnt++; + LockSupport.parkNanos(Duration.ofMillis(500).toNanos()); + } + if (dppResultStr != null) { + break; + } + } + if (dppResultStr == null) { + throw new SparkLoadException("get dpp result str failed"); + } + statusInfo.put("dppResult", dppResultStr); + statusInfo.put("filePathToSize", JsonUtils.writeValueAsString(getFilePathToSize())); + statusInfo.put("hadoopProperties", JsonUtils.writeValueAsString(jobConfig.getHadoopProperties())); + } catch (IOException e) { + throw new SparkLoadException("update job status failed", e); + } + feClient.updateIngestionLoad(jobConfig.getDatabase(), loadMeta.getLoadId(), statusInfo); + do { + LoadInfo loadInfo = feClient.getLoadInfo(jobConfig.getDatabase(), jobConfig.getLabel()); + switch (loadInfo.getState().toUpperCase(Locale.ROOT)) { + case "FINISHED": + LOG.info("load job finished."); + try { + cleanOutputPath(); + } catch (IOException e) { + LOG.warn("clean output path failed", e); + } + return; + case "CANCELLED": + throw new SparkLoadException("load job failed, " + loadInfo.getFailMsg()); + default: + LOG.info("load job unfinished, state: " + loadInfo.getState()); + break; + } + LockSupport.parkNanos(Duration.ofSeconds(15).toNanos()); + } while (true); + } + + @Override + public void afterFailed(Exception e) { + if (loadMeta == null) { + LOG.info("load job not start, skip update."); + return; + } + DorisClient.FeClient feClient = DorisClient.getFeClient(jobConfig.getFeAddresses(), jobConfig.getUser(), + jobConfig.getPassword()); + statusInfo.put("status", jobStatus.name()); + statusInfo.put("msg", e.getMessage()); + statusInfo.put("appId", appHandle == null ? 
null : appHandle.getAppId()); + try { + feClient.updateIngestionLoad(jobConfig.getDatabase(), loadMeta.getLoadId(), statusInfo); + } catch (SparkLoadException ex) { + LOG.warn("update load failed status failed", ex); + } + } + + @Override + public boolean canBeRecovered() throws SparkLoadException { + if (isRecoveryMode) { + String outputPath = etlJobConfig.getOutputPath(); + String parentOutputPath = outputPath.substring(0, StringUtils.lastIndexOf(outputPath, "/")); + try { + if (FileSystemUtils.exists(jobConfig, parentOutputPath)) { + FileStatus[] fileStatuses = FileSystemUtils.list(jobConfig, parentOutputPath); + if (fileStatuses.length != 1) { + return false; + } + fileStatuses = FileSystemUtils.list(jobConfig, fileStatuses[0].getPath().toString()); + boolean hasDppResult = false; + for (FileStatus fileStatus : fileStatuses) { + String fileName = fileStatus.getPath().getName(); + if (DPP_RESULT_JSON.equalsIgnoreCase(fileName)) { + hasDppResult = true; + String content = FileSystemUtils.readFile(jobConfig, fileStatus.getPath().toString()); + if (StringUtils.isBlank(content)) { + return false; + } + DppResult dppResult = JsonUtils.readValue(content, DppResult.class); + if (!checkDppResult(dppResult)) { + LOG.info("previous etl job is failed, cannot be recovered"); + return false; + } + } + // check meta consist + if (LOAD_META_JSON.equalsIgnoreCase(fileName)) { + String content = FileSystemUtils.readFile(jobConfig, fileStatus.getPath().toString()); + if (StringUtils.isBlank(content)) { + return false; + } + LoadMeta oldLoadMeta = JsonUtils.readValue(content, LoadMeta.class); + for (Map.Entry entry : loadMeta.getTableMeta().entrySet()) { + TableMeta tableMeta = entry.getValue(); + TableMeta oldTableMeta = oldLoadMeta.getTableMeta().get(entry.getKey()); + // index count is not consistent + if (oldTableMeta == null + || oldTableMeta.getIndexes().size() != tableMeta.getIndexes().size()) { + LOG.info("index size mismatch, cannot be recovered"); + return false; + } + Map indexMap = tableMeta.getIndexes().stream() + .collect(Collectors.toMap(etlIndex -> etlIndex.indexId, + TableMeta.EtlIndex::toEtlIndex)); + Map oldIndexMap = oldTableMeta.getIndexes().stream() + .collect(Collectors.toMap(etlIndex -> etlIndex.indexId, + TableMeta.EtlIndex::toEtlIndex)); + for (Map.Entry indexEntry : indexMap.entrySet()) { + EtlJobConfig.EtlIndex index = indexEntry.getValue(); + EtlJobConfig.EtlIndex oldIndex = oldIndexMap.get(indexEntry.getKey()); + // index not exists + if (oldIndex == null) { + LOG.info("index " + index.indexId + " is not exists in previous meta"); + return false; + } + // index mismatch + if (oldIndex.schemaHash != index.schemaHash + || oldIndex.schemaVersion != index.schemaVersion) { + LOG.info("index " + index.indexId + " has changed, " + + "old schemaHash: " + oldIndex.schemaHash + " and schemaVersion: " + + oldIndex.schemaVersion + " current schemaHash: " + + index.schemaHash + " and schemaVersion: " + + index.schemaVersion + ", cannot be recovered"); + return false; + } + } + // check partition consistent + Set partitionSet = tableMeta.getPartitionInfo().partitions.stream().map( + p -> p.partitionId).collect(Collectors.toSet()); + Set oldPartitionSet = oldTableMeta.getPartitionInfo().partitions.stream().map( + p -> p.partitionId).collect(Collectors.toSet()); + if (oldPartitionSet.size() != partitionSet.size()) { + LOG.info("partition size mismatch, old partition size: " + oldPartitionSet.size() + + ", now partition size: " + partitionSet.size() + + ", cannot be recovered"); + return 
false; + } + for (Long partitionId : partitionSet) { + if (!oldPartitionSet.contains(partitionId)) { + LOG.info("partition id mismatch, partition id: " + partitionId + + ", cannot be recovered"); + return false; + } + } + } + } + } + return hasDppResult; + } + } catch (IOException e) { + throw new SparkLoadException("check recovery failed", e); + } + } + return false; + } + + @Override + public void prepareRecover() throws SparkLoadException { + String outputPath = etlJobConfig.getOutputPath(); + String parentOutputPath = outputPath.substring(0, StringUtils.lastIndexOf(outputPath, "/")); + try { + FileStatus[] fileStatuses = FileSystemUtils.list(jobConfig, parentOutputPath); + FileSystemUtils.move(jobConfig, fileStatuses[0].getPath().toString(), outputPath); + FileSystemUtils.delete(jobConfig, outputPath + "/load_meta.json"); + uploadMetaInfo(loadMeta, etlJobConfig.getOutputPath()); + jobStatus = JobStatus.SUCCESS; + } catch (IOException e) { + throw new SparkLoadException("prepare recovery failed", e); + } + } + + private boolean checkDppResult(DppResult dppResult) { + if (!dppResult.isSuccess) { + return false; + } + int maxFilterRatio = Integer.parseInt(jobConfig.getJobProperties().getOrDefault("max_filter_ratio", "0")); + return dppResult.abnormalRows <= (dppResult.abnormalRows + dppResult.normalRows) * maxFilterRatio; + } + + private void uploadMetaInfo(LoadMeta metaInfo, String outputPath) throws SparkLoadException { + try { + if (!FileSystemUtils.exists(jobConfig, outputPath)) { + FileSystemUtils.mkdir(jobConfig, outputPath); + } + FileSystemUtils.createFile(jobConfig, JsonUtils.writeValueAsBytes(metaInfo), + outputPath + "/load_meta.json", true); + } catch (IOException e) { + throw new SparkLoadException("upload load meta failed", e); + } + } + + @Override + protected String getMainClass() { + return SPARK_ETL_JOB_CLASS; + } + + @Override + protected String[] getAppArgs() { + return new String[] {etlJobConfig.outputPath + "/configs/jobconfig.json"}; + } + + @Override + protected String getLogPath() { + String formattedNow = DateUtils.getFormattedNow(DateUtils.NUMBER_FORMATER); + return SparkLoadRunner.SPARK_LOAD_HOME + "/logs/" + jobConfig.getLabel() + "-" + formattedNow + ".log"; + } + + public void cleanOutputPath() throws IOException { + if (FileSystemUtils.exists(jobConfig, etlJobConfig.outputPath)) { + LOG.info("clean output: " + etlJobConfig.outputPath); + FileSystemUtils.delete(jobConfig, etlJobConfig.outputPath); + } + } + + private String getDppResultString() throws SparkLoadException { + try { + return FileSystemUtils.readFile(jobConfig, etlJobConfig.outputPath + "/dpp_result.json"); + } catch (IOException e) { + throw new SparkLoadException("get dpp result failed", e); + } + } + + private Map getFilePathToSize() throws SparkLoadException { + Map filePathToSize = new HashMap<>(); + try { + FileStatus[] fileStatuses = FileSystemUtils.list(jobConfig, etlJobConfig.outputPath); + for (FileStatus fileStatus : fileStatuses) { + if (fileStatus.isDirectory()) { + continue; + } + filePathToSize.put(fileStatus.getPath().toString(), fileStatus.getLen()); + } + } catch (IOException e) { + throw new SparkLoadException("get dpp result failed", e); + } + return filePathToSize; + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java new file mode 100644 index 00000000..ccfd461a --- /dev/null +++ 
b/spark-load/spark-load-core/src/main/java/org/apache/doris/load/job/Recoverable.java @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.job; + +import org.apache.doris.exception.SparkLoadException; + +public interface Recoverable { + + boolean canBeRecovered() throws SparkLoadException; + + void prepareRecover() throws SparkLoadException; + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/DateUtils.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/DateUtils.java new file mode 100644 index 00000000..7305ef76 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/DateUtils.java @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.util; + +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; + +public class DateUtils { + + public static final DateTimeFormatter NORMAL_FORMATER = + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss").withZone( + ZoneId.systemDefault()); + + public static final DateTimeFormatter NUMBER_FORMATER = + DateTimeFormatter.ofPattern("yyyyMMddHHmmss").withZone( + ZoneId.systemDefault()); + + public static String getFormattedNow(DateTimeFormatter formatter) { + return formatter.format(LocalDateTime.now(ZoneId.systemDefault())); + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/FileSystemUtils.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/FileSystemUtils.java new file mode 100644 index 00000000..2e6b5880 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/FileSystemUtils.java @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.util; + +import org.apache.doris.common.Constants; +import org.apache.doris.config.JobConfig; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeysPublic; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; + +public class FileSystemUtils { + + private static final Logger LOG = LogManager.getLogger(FileSystemUtils.class); + + private static FileSystem getFs(JobConfig config, Path path) throws IOException { + return FileSystem.get(path.toUri(), getConf(config)); + } + + public static void createFile(JobConfig config, String content, String path, Boolean overwrite) throws IOException { + Path p = new Path(path); + try (FileSystem fs = getFs(config, p)) { + FSDataOutputStream outputStream = fs.create(p, overwrite); + outputStream.write(content.getBytes(StandardCharsets.UTF_8)); + outputStream.close(); + } + } + + public static void createFile(JobConfig config, byte[] contentBytes, String path, Boolean overwrite) + throws IOException { + Path p = new Path(path); + try (FileSystem fs = getFs(config, p)) { + FSDataOutputStream outputStream = fs.create(p, overwrite); + outputStream.write(contentBytes); + outputStream.close(); + } + } + + public static void delete(JobConfig config, String path) throws IOException { + Path p = new Path(path); + try (FileSystem fs = getFs(config, p)) { + fs.delete(p, true); + } + } + + public static boolean exists(JobConfig config, String path) throws IOException { + Path p = new Path(path); + try (FileSystem fs = getFs(config, p)) { + return fs.exists(p); + } + } + + public static FileStatus[] list(JobConfig config, String path) throws IOException { + Path p = new Path(path); + try (FileSystem fs = getFs(config, p)) { + return fs.listStatus(p); + } + } + + public static String readFile(JobConfig config, String path) throws IOException { + Path p = new Path(path); + try (FileSystem fs = getFs(config, p)) { + if (fs.exists(p) && fs.getFileStatus(p).isFile()) { + FSDataInputStream inputStream = fs.open(p); + BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); + StringBuilder sb = new StringBuilder(); + String line; + while ((line = reader.readLine()) != null) { + sb.append(line); + } + return sb.toString(); + } + throw new UnsupportedOperationException("read file is not exist or is not a file, path: " + path); + } + } + + public static void move(JobConfig config, 
String src, String dst) throws IOException { + Path srcPath = new Path(src); + Path dstpath = new Path(dst); + try (FileSystem fs = getFs(config, srcPath)) { + fs.rename(srcPath, dstpath); + } + } + + public static void mkdir(JobConfig config, String path) throws IOException { + Path p = new Path(path); + try (FileSystem fs = getFs(config, p)) { + fs.mkdirs(p, new FsPermission(644)); + } + } + + public static void kerberosLogin(JobConfig jobConfig) throws IOException { + Configuration conf = getConf(jobConfig); + conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHORIZATION, "true"); + conf.set(CommonConfigurationKeysPublic.HADOOP_KERBEROS_KEYTAB_LOGIN_AUTORENEWAL_ENABLED, "true"); + UserGroupInformation.setConfiguration(conf); + String keytab = jobConfig.getHadoopProperties().get(Constants.HADOOP_KERBEROS_KEYTAB); + String principal = jobConfig.getHadoopProperties().get(Constants.HADOOP_KERBEROS_PRINCIPAL); + try { + UserGroupInformation ugi = UserGroupInformation.getLoginUser(); + if (ugi.hasKerberosCredentials() && StringUtils.equals(ugi.getUserName(), principal)) { + ugi.checkTGTAndReloginFromKeytab(); + return; + } + } catch (IOException e) { + LOG.warn("A SecurityException occurs with kerberos, do login immediately.", e); + } + UserGroupInformation.loginUserFromKeytab(principal, keytab); + } + + private static Configuration getConf(JobConfig jobConfig) { + Configuration conf = new Configuration(); + jobConfig.getHadoopProperties().forEach(conf::set); + return conf; + } + +} diff --git a/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HttpUtils.java b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HttpUtils.java new file mode 100644 index 00000000..d1da38d3 --- /dev/null +++ b/spark-load/spark-load-core/src/main/java/org/apache/doris/util/HttpUtils.java @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
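+// HttpUtils is a small helper around Apache HttpClient: it builds clients with default connect/socket timeouts and reads an HttpEntity's content into a string.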
+ +package org.apache.doris.util; + +import org.apache.http.HttpEntity; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; + +public class HttpUtils { + + public static final int DEFAULT_CONN_TIMEOUT = 60 * 1000; + public static final int DEFAULT_SO_TIMEOUT = 60 * 1000; + + public static CloseableHttpClient getClient() { + return getClient(DEFAULT_CONN_TIMEOUT, DEFAULT_SO_TIMEOUT); + } + + public static CloseableHttpClient getClient(int connectionTimeout, int socketTimeout) { + RequestConfig requestConfig = RequestConfig.custom() + .setConnectTimeout(connectionTimeout) + .setSocketTimeout(socketTimeout) + .build(); + return HttpClients.custom().setDefaultRequestConfig(requestConfig).build(); + } + + public static String getEntityContent(HttpEntity entity) throws IOException { + StringBuilder sb = new StringBuilder(); + try (InputStream is = entity.getContent(); + BufferedReader reader = new BufferedReader(new InputStreamReader(is))) { + String line; + while ((line = reader.readLine()) != null) { + sb.append(line); + } + } + return sb.toString(); + } + +} diff --git a/spark-load/spark-load-core/src/main/resources/log4j.properties b/spark-load/spark-load-core/src/main/resources/log4j.properties new file mode 100644 index 00000000..c1e97855 --- /dev/null +++ b/spark-load/spark-load-core/src/main/resources/log4j.properties @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +log4j.rootLogger=INFO,console +log4j.additivity.org.apache=true +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.Threshold=INFO +log4j.appender.console.ImmediateFlush=true +log4j.appender.console.Target=System.out +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %p (%t|%tid) [%C{1}.%M():%L] %m%n \ No newline at end of file diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/client/DorisClientTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/client/DorisClientTest.java new file mode 100644 index 00000000..4f53a368 --- /dev/null +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/client/DorisClientTest.java @@ -0,0 +1,473 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.client; + +import org.apache.doris.common.meta.LoadMeta; +import org.apache.doris.common.meta.TableMeta; +import org.apache.doris.config.EtlJobConfig; +import org.apache.doris.exception.SparkLoadException; +import org.apache.doris.util.JsonUtils; + +import com.fasterxml.jackson.core.JsonProcessingException; +import mockit.Mock; +import mockit.MockUp; +import org.apache.http.Header; +import org.apache.http.HeaderIterator; +import org.apache.http.HttpEntity; +import org.apache.http.HttpStatus; +import org.apache.http.HttpVersion; +import org.apache.http.ProtocolVersion; +import org.apache.http.StatusLine; +import org.apache.http.client.ClientProtocolException; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpUriRequest; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.message.BasicStatusLine; +import org.apache.http.params.HttpParams; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +class DorisClientTest { + + @Test + public void getFeClient() { + IllegalArgumentException e1 = + Assertions.assertThrows(IllegalArgumentException.class, () -> DorisClient.getFeClient("", "", "")); + Assertions.assertEquals("feAddresses is empty", e1.getMessage()); + IllegalArgumentException e2 = Assertions.assertThrows(IllegalArgumentException.class, + () -> DorisClient.getFeClient("127.0.0.1", "", "")); + Assertions.assertEquals("feAddresses contains invalid format, 127.0.0.1", e2.getMessage()); + IllegalArgumentException e3 = Assertions.assertThrows(IllegalArgumentException.class, + () -> DorisClient.getFeClient("127.0.0.1:", "", "")); + Assertions.assertEquals("feAddresses contains invalid format, 127.0.0.1:", e3.getMessage()); + IllegalArgumentException e4 = + Assertions.assertThrows(IllegalArgumentException.class, () -> DorisClient.getFeClient(":8030", "", "")); + Assertions.assertEquals("feAddresses contains invalid format, :8030", e4.getMessage()); + Assertions.assertDoesNotThrow(() -> DorisClient.getFeClient("127.0.0.1:8030", "", "")); + } + + @Test + public void createIngestionLoad() throws SparkLoadException, JsonProcessingException { + + DorisClient.FeClient feClient = new DorisClient.FeClient("127.0.0.1:8030", "", ""); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_BAD_REQUEST); + return response; + } + }; + Assertions.assertThrows(SparkLoadException.class, () -> feClient.createIngestionLoad("db", new 
HashMap<>(), "test", new HashMap<>())); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"code\":1,\"msg\":\"\",\"data\":{},\"count\":0}")); + return response; + } + }; + Assertions.assertThrows(SparkLoadException.class, () -> feClient.createIngestionLoad("db", new HashMap<>(), "test", new HashMap<>())); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"code\":0,\"msg\":\"\",\"data\":{\"loadId\":1,\"txnId\":1," + + "\"dbId\":1,\"signature\":1,\"tableMeta\":{\"tbl1\":{\"id\":1," + + "\"indexes\":[{\"indexId\":0,\"columns\":[{\"columnName\":\"c0\",\"columnType\":\"INT\"," + + "\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\",\"defaultValue\":\"0\"," + + "\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}],\"schemaHash\":0," + + "\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":0}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}},\"count\":0}")); + return response; + } + }; + + LoadMeta loadMeta = new LoadMeta(); + loadMeta.setLoadId(1L); + loadMeta.setTxnId(1L); + loadMeta.setDbId(1L); + loadMeta.setSignature(1L); + Map tableMetaMap = new HashMap<>(); + TableMeta tableMeta = new TableMeta(); + tableMeta.setId(1L); + List indexList = new ArrayList<>(); + TableMeta.EtlIndex index = new TableMeta.EtlIndex(); + List columnList = new ArrayList<>(); + EtlJobConfig.EtlColumn column = new EtlJobConfig.EtlColumn(); + column.columnName = "c0"; + column.columnType = "INT"; + column.defaultValue = "0"; + column.isAllowNull = true; + column.aggregationType = "NONE"; + column.isKey = true; + columnList.add(column); + index.columns = columnList; + indexList.add(index); + tableMeta.setIndexes(indexList); + TableMeta.EtlPartitionInfo partitionInfo = new TableMeta.EtlPartitionInfo(); + TableMeta.EtlPartition partition = new TableMeta.EtlPartition(); + partition.partitionId = 1; + partition.bucketNum = 1; + partition.startKeys = Collections.emptyList(); + partition.endKeys = Collections.emptyList(); + partition.isMaxPartition = true; + partitionInfo.partitions = Collections.singletonList(partition); + partitionInfo.partitionType = "UNPARTITIONED"; + partitionInfo.partitionColumnRefs = new ArrayList<>(); + partitionInfo.distributionColumnRefs = new ArrayList<>(); + tableMeta.setPartitionInfo(partitionInfo); + tableMetaMap.put("tbl1", tableMeta); + loadMeta.setTableMeta(tableMetaMap); + Assertions.assertEquals(JsonUtils.writeValueAsString(loadMeta), + JsonUtils.writeValueAsString(feClient.createIngestionLoad("db", new HashMap<>(), "test", new HashMap<>()))); + + } + + @Test + public void updateIngestionLoad() { + + DorisClient.FeClient feClient = new DorisClient.FeClient("127.0.0.1:8030", "", ""); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) 
throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_BAD_REQUEST); + return response; + } + }; + Assertions.assertThrows(SparkLoadException.class, () -> feClient.updateIngestionLoad("db", 1L, new HashMap<>())); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"code\":1,\"msg\":\"\",\"data\":{},\"count\":0}")); + return response; + } + }; + Assertions.assertThrows(SparkLoadException.class, () -> feClient.updateIngestionLoad("db", 1L, new HashMap<>())); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"code\":0,\"msg\":\"\",\"data\":{},\"count\":0}")); + return response; + } + }; + Assertions.assertDoesNotThrow(() -> feClient.updateIngestionLoad("db", 1L, new HashMap<>())); + + } + + @Test + public void getLoadInfo() throws SparkLoadException, JsonProcessingException { + + DorisClient.FeClient feClient = new DorisClient.FeClient("127.0.0.1:8030", "", ""); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_BAD_REQUEST); + return response; + } + }; + Assertions.assertThrows(SparkLoadException.class, () -> feClient.getLoadInfo("db", "test")); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"status\":\"err\",\"msg\":\"\",\"jobInfo\":{\"dbName\":\"db\"," + + "\"tblNames\":[\"tbl1\"],\"label\":\"test\",\"clusterName\":\"default\",\"state\":\"FINISHED\"," + + "\"failMsg\":\"\",\"trackingUrl\":\"\"}}")); + return response; + } + }; + Assertions.assertThrows(SparkLoadException.class, () -> feClient.getLoadInfo("db", "test")); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"status\":\"ok\",\"msg\":\"\",\"jobInfo\":{\"dbName\":\"db\"," + + "\"tblNames\":[\"tbl1\"],\"label\":\"test\",\"clusterName\":\"default\",\"state\":\"FINISHED\"," + + "\"failMsg\":\"\",\"trackingUrl\":\"\"}}")); + return response; + } + }; + Assertions.assertEquals("{\"dbName\":\"db\",\"tblNames\":[\"tbl1\"],\"label\":\"test\"," + + "\"clusterName\":\"default\",\"state\":\"FINISHED\",\"failMsg\":\"\",\"trackingUrl\":\"\"}", + JsonUtils.writeValueAsString(feClient.getLoadInfo("db", "test"))); + + } + + @Test + public void getDDL() { + + DorisClient.FeClient feClient = new 
DorisClient.FeClient("127.0.0.1:8030", "", ""); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_BAD_REQUEST); + return response; + } + }; + SparkLoadException e1 = + Assertions.assertThrows(SparkLoadException.class, () -> feClient.getDDL("db", "test")); + Assertions.assertEquals("request get ddl failed, path: /api/_get_ddl", e1.getMessage()); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"code\":1,\"msg\":\"\",\"data\":{},\"count\":0}")); + return response; + } + }; + SparkLoadException e2 = + Assertions.assertThrows(SparkLoadException.class, () -> feClient.getDDL("db", "test")); + Assertions.assertEquals("get ddl failed, status: 1, msg: , data: {}", e2.getMessage()); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"code\":0,\"msg\":\"\",\"data\":{},\"count\":0}")); + return response; + } + }; + SparkLoadException e3 = + Assertions.assertThrows(SparkLoadException.class, () -> feClient.getDDL("db", "test")); + Assertions.assertEquals("get ddl failed, status: 0, msg: , data: {}", e3.getMessage()); + + new MockUp(CloseableHttpClient.class) { + @Mock + public CloseableHttpResponse execute( + final HttpUriRequest request) throws IOException, ClientProtocolException { + MockedCloseableHttpResponse response = new MockedCloseableHttpResponse(); + response.setStatusCode(HttpStatus.SC_OK); + response.setEntity(new StringEntity("{\"code\":0,\"msg\":\"\"," + + "\"data\":{\"create_table\": [\"CREATE TABLE `tbl1` (\\n `k1` int(11) NULL " + + "COMMENT \\\"\\\",\\n `k2` int(11) NULL COMMENT \\\"\\\"\\n) ENGINE=OLAP\\n" + + "DUPLICATE KEY(`k1`, `k2`)\\nCOMMENT \\\"OLAP\\\"\\nDISTRIBUTED BY HASH(`k1`) BUCKETS 1\\n" + + "PROPERTIES (\\n\\\"replication_num\\\" = \\\"1\\\",\\n\\\"version_info\\\" = \\\"1,0\\\",\\n" + + "\\\"in_memory\\\" = \\\"false\\\",\\n\\\"storage_format\\\" = \\\"DEFAULT\\\"\\n);\"]\n}," + + "\"count\":0}")); + return response; + } + }; + Assertions.assertDoesNotThrow(() -> feClient.getDDL("db", "test")); + + + } + + private class MockedCloseableHttpResponse implements CloseableHttpResponse { + + private StatusLine statusLine; + private HttpEntity entity; + + @Override + public void close() throws IOException { + + } + + @Override + public StatusLine getStatusLine() { + return statusLine; + } + + @Override + public void setStatusLine(StatusLine statusline) { + this.statusLine = statusline; + } + + @Override + public void setStatusLine(ProtocolVersion ver, int code) { + this.statusLine = new BasicStatusLine(ver, code, ""); + } + + @Override + public void setStatusLine(ProtocolVersion ver, int code, String reason) { + this.statusLine = new BasicStatusLine(ver, code, reason); + } + + @Override + public void setStatusCode(int code) throws IllegalStateException { + if (this.statusLine == null) { + 
this.statusLine = new BasicStatusLine(HttpVersion.HTTP_1_1, code, ""); + } else { + this.statusLine = new BasicStatusLine(statusLine.getProtocolVersion(), code, statusLine.getReasonPhrase()); + } + } + + @Override + public void setReasonPhrase(String reason) throws IllegalStateException { + if (this.statusLine == null) { + this.statusLine = new BasicStatusLine(HttpVersion.HTTP_1_1, HttpStatus.SC_OK, reason); + } else { + this.statusLine = new BasicStatusLine(statusLine.getProtocolVersion(), statusLine.getStatusCode(), reason); + } + } + + @Override + public HttpEntity getEntity() { + return entity; + } + + @Override + public void setEntity(HttpEntity entity) { + this.entity = entity; + } + + @Override + public Locale getLocale() { + return null; + } + + @Override + public void setLocale(Locale loc) { + + } + + @Override + public ProtocolVersion getProtocolVersion() { + return HttpVersion.HTTP_1_1; + } + + @Override + public boolean containsHeader(String name) { + return false; + } + + @Override + public Header[] getHeaders(String name) { + return new Header[0]; + } + + @Override + public Header getFirstHeader(String name) { + return null; + } + + @Override + public Header getLastHeader(String name) { + return null; + } + + @Override + public Header[] getAllHeaders() { + return new Header[0]; + } + + @Override + public void addHeader(Header header) { + + } + + @Override + public void addHeader(String name, String value) { + + } + + @Override + public void setHeader(Header header) { + + } + + @Override + public void setHeader(String name, String value) { + + } + + @Override + public void setHeaders(Header[] headers) { + + } + + @Override + public void removeHeader(Header header) { + + } + + @Override + public void removeHeaders(String name) { + + } + + @Override + public HeaderIterator headerIterator() { + return null; + } + + @Override + public HeaderIterator headerIterator(String name) { + return null; + } + + @Override + public HttpParams getParams() { + return null; + } + + @Override + public void setParams(HttpParams params) { + + } + } + + +} \ No newline at end of file diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java new file mode 100644 index 00000000..0c1bceaa --- /dev/null +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/common/meta/LoadMetaTest.java @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
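Every case in the FeClient tests above uses the same stubbing skeleton: a JMockit MockUp installed over CloseableHttpClient so that execute(...) returns a canned MockedCloseableHttpResponse instead of hitting a real FE. A minimal sketch of that skeleton follows; the helper name stubFeResponse is hypothetical, and it assumes it is declared inside the test class so it can reuse the private MockedCloseableHttpResponse helper and the existing imports.

    // Hypothetical helper, assumed to live inside the test class above.
    private void stubFeResponse(int statusCode, String jsonBody) {
        new MockUp<CloseableHttpClient>() {
            @Mock
            public CloseableHttpResponse execute(HttpUriRequest request) throws IOException {
                MockedCloseableHttpResponse response = new MockedCloseableHttpResponse();
                response.setStatusCode(statusCode);                 // non-OK status makes FeClient throw SparkLoadException
                if (jsonBody != null) {
                    response.setEntity(new StringEntity(jsonBody)); // JSON body that FeClient will parse
                }
                return response;
            }
        };
    }

With that in place, stubFeResponse(HttpStatus.SC_OK, "{\"code\":0,\"msg\":\"\",\"data\":{},\"count\":0}") reproduces the happy-path cases above, and stubFeResponse(HttpStatus.SC_BAD_REQUEST, null) the failure cases.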
+ +package org.apache.doris.common.meta; + + +import org.apache.doris.config.EtlJobConfig; +import org.apache.doris.config.JobConfig; +import org.apache.doris.exception.SparkLoadException; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class LoadMetaTest { + + @Test + public void checkMapping() throws SparkLoadException { + + List columns = new ArrayList<>(); + columns.add(new EtlJobConfig.EtlColumn("id", "BIGINT", false, true, "NONE", null, 0, 10, 0)); + columns.add(new EtlJobConfig.EtlColumn("c1", "HLL", true, false, "NONE", null, 0, 10, 0)); + columns.add(new EtlJobConfig.EtlColumn("c2", "BITMAP", true, false, "NONE", null, 0, 10, 0)); + + EtlJobConfig.EtlIndex etlIndex = new EtlJobConfig.EtlIndex(1, columns, 1, "DUPLICATE", true, 1); + EtlJobConfig.EtlPartition etlPartition = + new EtlJobConfig.EtlPartition(1L, Collections.singletonList(0), Collections.singletonList(1), true, 1); + EtlJobConfig.EtlPartitionInfo etlPartitionInfo = + new EtlJobConfig.EtlPartitionInfo("RANGE", Collections.singletonList("id"), + Collections.singletonList("id"), Collections.singletonList(etlPartition)); + + EtlJobConfig.EtlTable etlTable = new EtlJobConfig.EtlTable(Collections.singletonList(etlIndex), + etlPartitionInfo); + + LoadMeta loadMeta = new LoadMeta(); + + Map columnMappingMap = new HashMap<>(); + columnMappingMap.put("c2", new EtlJobConfig.EtlColumnMapping("to_bitmap(c1)")); + Assertions.assertThrows(SparkLoadException.class, () -> loadMeta.checkMapping(etlTable, columnMappingMap)); + + Map columnMappingMap1 = new HashMap<>(); + columnMappingMap1.put("c1", new EtlJobConfig.EtlColumnMapping("hll_hash(c1)")); + Assertions.assertThrows(SparkLoadException.class, () -> loadMeta.checkMapping(etlTable, columnMappingMap1)); + + Map columnMappingMap2 = new HashMap<>(); + columnMappingMap2.put("c1", new EtlJobConfig.EtlColumnMapping("hll_hash(c1)")); + columnMappingMap2.put("c2", new EtlJobConfig.EtlColumnMapping("to_bitmap(c1)")); + loadMeta.checkMapping(etlTable, columnMappingMap2); + + } + +} \ No newline at end of file diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/config/JobConfigTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/config/JobConfigTest.java new file mode 100644 index 00000000..c4e6f00f --- /dev/null +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/config/JobConfigTest.java @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
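checkMapping above encodes a rule for aggregate-only column types: an HLL target column and a BITMAP target column must each carry an explicit column mapping (the tests exercise hll_hash(...) and to_bitmap(...) respectively), and a missing mapping raises SparkLoadException. The sketch below shows the shape of a mapping that passes, assuming the etlTable/loadMeta setup from the test; the constructor argument labels are inferred from field names used elsewhere in these tests and should be treated as assumptions.

    // Sketch only. Inferred EtlColumn argument order:
    // (columnName, columnType, isAllowNull, isKey, aggregationType, defaultValue, stringLength, precision, scale)
    List<EtlJobConfig.EtlColumn> columns = new ArrayList<>();
    columns.add(new EtlJobConfig.EtlColumn("id", "BIGINT", false, true, "NONE", null, 0, 10, 0));
    columns.add(new EtlJobConfig.EtlColumn("c1", "HLL", true, false, "NONE", null, 0, 10, 0));
    columns.add(new EtlJobConfig.EtlColumn("c2", "BITMAP", true, false, "NONE", null, 0, 10, 0));

    Map<String, EtlJobConfig.EtlColumnMapping> mappings = new HashMap<>();
    mappings.put("c1", new EtlJobConfig.EtlColumnMapping("hll_hash(c1)"));  // HLL column needs a mapping
    mappings.put("c2", new EtlJobConfig.EtlColumnMapping("to_bitmap(c1)")); // BITMAP column needs a mapping
    loadMeta.checkMapping(etlTable, mappings);                              // passes; dropping either entry throws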
+ +package org.apache.doris.config; + +import org.apache.doris.client.DorisClient; +import org.apache.doris.common.enums.TaskType; +import org.apache.doris.exception.SparkLoadException; + +import mockit.Mock; +import mockit.MockUp; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +public class JobConfigTest { + + @Test + public void checkFeAddress() { + + JobConfig jobConfig = new JobConfig(); + jobConfig.setFeAddresses(""); + IllegalArgumentException e1 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkFeAddress); + Assertions.assertEquals("feAddress is empty", e1.getMessage()); + + jobConfig.setFeAddresses("127.0.0.1"); + IllegalArgumentException e2 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkFeAddress, + "feAddress format is incorrect"); + Assertions.assertEquals("feAddress format is incorrect", e2.getMessage()); + + jobConfig.setFeAddresses("127.0.0.1,127.0.0.2"); + IllegalArgumentException e3 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkFeAddress, + "feAddress format is incorrect"); + Assertions.assertEquals("feAddress format is incorrect", e3.getMessage()); + + jobConfig.setFeAddresses("127.0.0.1:8030"); + Assertions.assertDoesNotThrow(jobConfig::checkFeAddress); + + } + + @Test + public void checkTaskInfo() { + + JobConfig jobConfig = new JobConfig(); + jobConfig.setFeAddresses("127.0.0.1:8030"); + + jobConfig.setLoadTasks(new HashMap<>()); + IllegalArgumentException e1 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkTaskInfo); + Assertions.assertEquals("loadTasks is empty", e1.getMessage()); + + new MockUp(DorisClient.FeClient.class) { + @Mock + public String getDDL(String db, String table) throws SparkLoadException { + return "create table tbl1 (col1 int, col2 int, col3 int, col4 int) unique key (col1) properties (" + + "\"enable_unique_key_merge_on_write\" = \"false\")"; + } + }; + + Map loadTasks1 = new HashMap<>(); + JobConfig.TaskInfo taskInfo1 = new JobConfig.TaskInfo(); + taskInfo1.setType(TaskType.FILE); + loadTasks1.put("task1", taskInfo1); + jobConfig.setLoadTasks(loadTasks1); + IllegalArgumentException e2 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkTaskInfo); + Assertions.assertEquals("file path is empty", e2.getMessage()); + + Map loadTasks2 = new HashMap<>(); + JobConfig.TaskInfo taskInfo2 = new JobConfig.TaskInfo(); + taskInfo2.setType(TaskType.FILE); + taskInfo2.setPaths(Collections.singletonList("test")); + taskInfo2.setFormat("sequence"); + loadTasks2.put("task2", taskInfo2); + jobConfig.setLoadTasks(loadTasks2); + IllegalArgumentException e3 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkTaskInfo); + Assertions.assertEquals("format only support parquet or orc or csv", e3.getMessage()); + + taskInfo2.setFormat("csv"); + Assertions.assertDoesNotThrow(jobConfig::checkTaskInfo); + + Map loadTasks3 = new HashMap<>(); + JobConfig.TaskInfo taskInfo3 = new JobConfig.TaskInfo(); + taskInfo3.setType(TaskType.HIVE); + loadTasks3.put("task3", taskInfo3); + jobConfig.setLoadTasks(loadTasks3); + IllegalArgumentException e4 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkTaskInfo); + Assertions.assertEquals("hive database is empty", e4.getMessage()); + + 
taskInfo3.setHiveDatabase("db"); + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkTaskInfo, "hive table is empty"); + + taskInfo3.setHiveTable("tbl"); + Assertions.assertDoesNotThrow(jobConfig::checkTaskInfo); + + new MockUp(DorisClient.FeClient.class) { + @Mock + public String getDDL(String db, String table) throws SparkLoadException { + return "create table tbl1 (col1 int, col2 int, col3 int, col4 int) unique key (col1) properties (" + + "\"enable_unique_key_merge_on_write\" = \"true\")"; + } + }; + IllegalArgumentException e5 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkTaskInfo); + + } + + @Test + public void checkSparkInfo() throws IOException { + + JobConfig jobConfig = new JobConfig(); + JobConfig.SparkInfo sparkInfo = new JobConfig.SparkInfo(); + jobConfig.setSpark(sparkInfo); + IllegalArgumentException e1 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkSparkInfo); + Assertions.assertEquals("spark config item sparkHome is empty", e1.getMessage()); + + sparkInfo.setSparkHome("test"); + IllegalArgumentException e2 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkSparkInfo); + Assertions.assertEquals("spark master only supports yarn or standalone or local", e2.getMessage()); + + sparkInfo.setMaster("local"); + sparkInfo.setDeployMode("abc"); + IllegalArgumentException e3 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkSparkInfo); + Assertions.assertEquals("spark deployMode only supports cluster or client", e3.getMessage()); + + sparkInfo.setMaster("spark://127.0.0.1:7077"); + sparkInfo.setDeployMode("cluster"); + IllegalArgumentException e4 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkSparkInfo); + Assertions.assertEquals("standalone and local master only supports client mode", e4.getMessage()); + + sparkInfo.setMaster("yarn"); + sparkInfo.setDeployMode("cluster"); + IllegalArgumentException e5 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkSparkInfo); + Assertions.assertEquals("dpp jar file is not exists, path: null/app/spark-load-dpp-1.0-SNAPSHOT.jar", e5.getMessage()); + + sparkInfo.setDppJarPath(""); + IllegalArgumentException e6 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkSparkInfo); + Assertions.assertEquals("dpp jar file path is empty", e6.getMessage()); + + Path path = Files.createTempFile(null, null); + sparkInfo.setDppJarPath(path.toAbsolutePath().toString()); + Assertions.assertDoesNotThrow(jobConfig::checkSparkInfo); + + } + + @Test + public void checkHadoopProperties() throws IOException { + + JobConfig jobConfig = new JobConfig(); + Map hadoopProperties = new HashMap<>(); + jobConfig.setHadoopProperties(hadoopProperties); + + hadoopProperties.put("abc", "123"); + IllegalArgumentException e1 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkHadoopProperties); + Assertions.assertEquals("fs.defaultFS is empty", e1.getMessage()); + + hadoopProperties.put("fs.defaultFS", "test"); + IllegalArgumentException e2 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkHadoopProperties); + Assertions.assertEquals("hadoop username is empty", e2.getMessage()); + + hadoopProperties.put("hadoop.username", "hadoop"); + Assertions.assertDoesNotThrow(jobConfig::checkHadoopProperties); + + hadoopProperties.put("hadoop.security.authentication", "kerberos"); + IllegalArgumentException e3 = + 
Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkHadoopProperties); + Assertions.assertEquals("hadoop.kerberos.principal is not set", e3.getMessage()); + + hadoopProperties.put("hadoop.kerberos.principal", ""); + IllegalArgumentException e4 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkHadoopProperties); + Assertions.assertEquals("hadoop kerberos principal is empty", e4.getMessage()); + + hadoopProperties.put("hadoop.kerberos.principal", "spark@DORIS.ORG"); + IllegalArgumentException e5 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkHadoopProperties); + Assertions.assertEquals("hadoop.kerberos.keytab is not set", e5.getMessage()); + + hadoopProperties.put("hadoop.kerberos.keytab", "test"); + IllegalArgumentException e6 = + Assertions.assertThrows(IllegalArgumentException.class, jobConfig::checkHadoopProperties); + Assertions.assertEquals("hadoop kerberos keytab file is not exists, path: test", e6.getMessage()); + + Path path = Files.createTempFile("spark", ".keytab"); + hadoopProperties.put("hadoop.kerberos.keytab", path.toAbsolutePath().toString()); + Assertions.assertDoesNotThrow(jobConfig::checkHadoopProperties); + + } +} \ No newline at end of file diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/load/LoaderFactoryTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/load/LoaderFactoryTest.java new file mode 100644 index 00000000..28cb230c --- /dev/null +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/load/LoaderFactoryTest.java @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
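Read together, checkSparkInfo and checkHadoopProperties above define the minimum viable configuration: a non-empty sparkHome, a master of yarn, standalone (spark://...) or local with cluster deploy mode reserved for yarn, a dpp jar that exists on disk, fs.defaultFS plus hadoop.username, and, once hadoop.security.authentication is kerberos, a principal and an existing keytab file. A configuration sketch that satisfies both checks; all paths and addresses are placeholders.

    // Sketch of a configuration accepted by the checks above; values are placeholders.
    JobConfig jobConfig = new JobConfig();

    JobConfig.SparkInfo spark = new JobConfig.SparkInfo();
    spark.setSparkHome("/opt/spark");                 // must be non-empty
    spark.setMaster("yarn");                          // yarn, spark://host:port or local
    spark.setDeployMode("cluster");                   // cluster is only accepted with yarn
    spark.setDppJarPath("/opt/spark-load/app/spark-load-dpp-1.0-SNAPSHOT.jar"); // file must exist
    jobConfig.setSpark(spark);
    jobConfig.checkSparkInfo();

    Map<String, String> hadoopProps = new HashMap<>();
    hadoopProps.put("fs.defaultFS", "hdfs://nameservice1");
    hadoopProps.put("hadoop.username", "hadoop");
    // Only required when Kerberos authentication is enabled:
    hadoopProps.put("hadoop.security.authentication", "kerberos");
    hadoopProps.put("hadoop.kerberos.principal", "spark@DORIS.ORG");
    hadoopProps.put("hadoop.kerberos.keytab", "/etc/security/keytabs/spark.keytab"); // file must exist
    jobConfig.setHadoopProperties(hadoopProps);
    jobConfig.checkHadoopProperties();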
+ +package org.apache.doris.load; + +import org.apache.doris.common.enums.LoadMode; +import org.apache.doris.config.JobConfig; +import org.apache.doris.load.job.Loader; +import org.apache.doris.load.job.PullLoader; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +class LoaderFactoryTest { + + @Test + void createLoader() { + + JobConfig jobConfig = new JobConfig(); + jobConfig.setLoadMode(null); + Assertions.assertThrows(NullPointerException.class, () -> LoaderFactory.createLoader(jobConfig, false)); + + jobConfig.setLoadMode(LoadMode.PUSH); + Assertions.assertThrows(UnsupportedOperationException.class, () -> LoaderFactory.createLoader(jobConfig, false)); + + jobConfig.setLoadMode(LoadMode.PULL); + Assertions.assertDoesNotThrow(() -> LoaderFactory.createLoader(jobConfig, false)); + Loader loader = LoaderFactory.createLoader(jobConfig, false);; + Assertions.assertInstanceOf(PullLoader.class, loader); + + } +} \ No newline at end of file diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/load/job/PullLoaderTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/load/job/PullLoaderTest.java new file mode 100644 index 00000000..a0c56a60 --- /dev/null +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/load/job/PullLoaderTest.java @@ -0,0 +1,236 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
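LoaderFactoryTest above pins down the dispatch contract of LoaderFactory.createLoader: LoadMode.PULL yields a PullLoader, LoadMode.PUSH is rejected with UnsupportedOperationException, and a null load mode fails with NullPointerException. A minimal usage sketch; the meaning of the boolean flag is an assumption read off PullLoaderTest below, which passes true when it wants to exercise recovery.

    JobConfig jobConfig = new JobConfig();
    jobConfig.setLoadMode(LoadMode.PULL);                          // PULL is the only mode supported here
    Loader loader = LoaderFactory.createLoader(jobConfig, false);  // boolean flag: assumed recovery switch
    assert loader instanceof PullLoader;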
+ +package org.apache.doris.load.job; + +import org.apache.doris.client.DorisClient; +import org.apache.doris.common.enums.TaskType; +import org.apache.doris.common.meta.LoadMeta; +import org.apache.doris.common.meta.TableMeta; +import org.apache.doris.config.EtlJobConfig; +import org.apache.doris.config.JobConfig; +import org.apache.doris.exception.SparkLoadException; +import org.apache.doris.load.LoaderFactory; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.json.JsonMapper; +import mockit.Mock; +import mockit.MockUp; +import org.apache.commons.io.FileUtils; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import org.junit.jupiter.api.Test; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +class PullLoaderTest { + + @Test + void canBeRecovered() throws SparkLoadException, IOException { + + JobConfig jobConfig = new JobConfig(); + jobConfig.setFeAddresses("127.0.0.1:8080"); + Map loadTasks = new HashMap<>(); + JobConfig.TaskInfo taskInfo = new JobConfig.TaskInfo(); + taskInfo.setType(TaskType.FILE); + taskInfo.setPaths(Collections.singletonList("test")); + loadTasks.put("tbl1", taskInfo); + jobConfig.setLoadTasks(loadTasks); + jobConfig.setLabel("test"); + File file = new File(System.getProperty("java.io.tmpdir")); + jobConfig.setWorkingDir(file.getAbsolutePath()); + + new MockUp() { + @Mock + public LoadMeta createIngestionLoad(String db, Map> tableToPartition, String label, + Map properties) { + LoadMeta loadMeta = new LoadMeta(); + loadMeta.setLoadId(1L); + loadMeta.setTxnId(1L); + loadMeta.setDbId(1L); + loadMeta.setSignature(1L); + Map tableMetaMap = new HashMap<>(); + TableMeta tableMeta = new TableMeta(); + tableMeta.setId(1L); + List indexList = new ArrayList<>(); + TableMeta.EtlIndex index = new TableMeta.EtlIndex(); + List columnList = new ArrayList<>(); + EtlJobConfig.EtlColumn column = new EtlJobConfig.EtlColumn(); + column.columnName = "c0"; + column.columnType = "INT"; + column.defaultValue = "0"; + column.isAllowNull = true; + column.aggregationType = "NONE"; + column.isKey = true; + columnList.add(column); + index.columns = columnList; + indexList.add(index); + tableMeta.setIndexes(indexList); + TableMeta.EtlPartitionInfo partitionInfo = new TableMeta.EtlPartitionInfo(); + TableMeta.EtlPartition partition = new TableMeta.EtlPartition(); + partition.partitionId = 1; + partition.bucketNum = 1; + partition.startKeys = Collections.emptyList(); + partition.endKeys = Collections.emptyList(); + partition.isMaxPartition = true; + partitionInfo.partitions = Collections.singletonList(partition); + partitionInfo.partitionType = "UNPARTITIONED"; + partitionInfo.partitionColumnRefs = new ArrayList<>(); + partitionInfo.distributionColumnRefs = new ArrayList<>(); + tableMeta.setPartitionInfo(partitionInfo); + tableMetaMap.put("tbl1", tableMeta); + loadMeta.setTableMeta(tableMetaMap); + try { + System.out.println(JsonMapper.builder().build().writeValueAsString(loadMeta)); + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + return loadMeta; + } + }; + Loader loader = LoaderFactory.createLoader(jobConfig, true); + assertInstanceOf(Recoverable.class, 
loader); + loader.prepare(); + assertFalse(((Recoverable)loader).canBeRecovered()); + + File file1 = new File(System.getProperty("java.io.tmpdir") + "/jobs/1/test"); + try { + + file1.mkdirs(); + assertFalse(((Recoverable)loader).canBeRecovered()); + + File file2 = new File(System.getProperty("java.io.tmpdir") + "/jobs/1/test/1"); + file2.mkdirs(); + assertFalse(((Recoverable)loader).canBeRecovered()); + + File file3 = new File(System.getProperty("java.io.tmpdir") + "/jobs/1/test/1/dpp_result.json"); + Files.write(file3.toPath(), Collections.singletonList("")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file3.toPath(), Collections.singletonList("test")); + assertThrows(SparkLoadException.class, () -> ((Recoverable)loader).canBeRecovered()); + + Files.write(file3.toPath(), Collections.singletonList("{}")); + assertThrows(SparkLoadException.class, () -> ((Recoverable)loader).canBeRecovered()); + + Files.write(file3.toPath(), Collections.singletonList("{\"is_success\":false,\"failed_reason\":\"\"," + + "\"scanned_rows\":0,\"file_number\":0,\"file_size\":0,\"normal_rows\":0,\"abnormal_rows\":0," + + "\"unselect_rows\":0,\"partial_abnormal_rows\":\"\",\"scanned_bytes\":0}\n")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file3.toPath(), Collections.singletonList("{\"is_success\":true,\"failed_reason\":\"\"," + + "\"scanned_rows\":0,\"file_number\":0,\"file_size\":0,\"normal_rows\":0,\"abnormal_rows\":0," + + "\"unselect_rows\":0,\"partial_abnormal_rows\":\"\",\"scanned_bytes\":0}\n")); + + File file4 = new File(System.getProperty("java.io.tmpdir") + "/jobs/1/test/1/load_meta.json"); + Files.write(file4.toPath(), Collections.singletonList("")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl1\":{\"id\":1,\"indexes\":[],\"partitionInfo\":{\"partitionType\":" + + "\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[],\"partitions\":" + + "[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true,\"bucketNum\":1}]}" + + "}}}\n")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl2\":{\"id\":1,\"indexes\":[{\"indexId\":0,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}]," + + "\"schemaHash\":0,\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":0}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}}")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl1\":{\"id\":1,\"indexes\":[{\"indexId\":1,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}]," + + 
"\"schemaHash\":0,\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":0}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}}")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl1\":{\"id\":1,\"indexes\":[{\"indexId\":0,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}]," + + "\"schemaHash\":1,\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":0}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}}")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl1\":{\"id\":1,\"indexes\":[{\"indexId\":0,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}]," + + "\"schemaHash\":0,\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":1}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}}")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl1\":{\"id\":1,\"indexes\":[{\"indexId\":0,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}]," + + "\"schemaHash\":0,\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":0}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1},{\"partitionId\":2,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}}")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl1\":{\"id\":1,\"indexes\":[{\"indexId\":0,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}]," + + "\"schemaHash\":0,\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":0}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + 
"\"partitions\":[{\"partitionId\":2,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}}")); + assertFalse(((Recoverable)loader).canBeRecovered()); + + Files.write(file4.toPath(), Collections.singletonList("{\"loadId\":1,\"txnId\":1,\"dbId\":1,\"signature\":1," + + "\"tableMeta\":{\"tbl1\":{\"id\":1,\"indexes\":[{\"indexId\":0,\"columns\":[{\"columnName\":\"c0\"," + + "\"columnType\":\"INT\",\"isAllowNull\":true,\"isKey\":true,\"aggregationType\":\"NONE\"," + + "\"defaultValue\":\"0\",\"stringLength\":0,\"precision\":0,\"scale\":0,\"defineExpr\":null}]," + + "\"schemaHash\":0,\"indexType\":null,\"isBaseIndex\":false,\"schemaVersion\":0}],\"partitionInfo\":" + + "{\"partitionType\":\"UNPARTITIONED\",\"partitionColumnRefs\":[],\"distributionColumnRefs\":[]," + + "\"partitions\":[{\"partitionId\":1,\"startKeys\":[],\"endKeys\":[],\"isMaxPartition\":true," + + "\"bucketNum\":1}]}}}}")); + assertTrue(((Recoverable)loader).canBeRecovered()); + + } finally { + // delete ${java.io.tmpdir}/jobs on exit + FileUtils.deleteDirectory(file1.getParentFile().getParentFile()); + } + + } +} \ No newline at end of file diff --git a/spark-load/spark-load-core/src/test/java/org/apache/doris/util/DateUtilsTest.java b/spark-load/spark-load-core/src/test/java/org/apache/doris/util/DateUtilsTest.java new file mode 100644 index 00000000..d6d10ce8 --- /dev/null +++ b/spark-load/spark-load-core/src/test/java/org/apache/doris/util/DateUtilsTest.java @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.util; + +import mockit.Mock; +import mockit.MockUp; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.time.LocalDateTime; +import java.time.ZoneId; + +class DateUtilsTest { + + @Test + void getFormattedNow() { + new MockUp() { + @Mock + public LocalDateTime now(ZoneId zoneId) { + return LocalDateTime.of(2024,8,1,12,34,56); + } + }; + Assertions.assertEquals("2024-08-01 12:34:56", DateUtils.getFormattedNow(DateUtils.NORMAL_FORMATER)); + Assertions.assertEquals("20240801123456", DateUtils.getFormattedNow(DateUtils.NUMBER_FORMATER)); + } +} \ No newline at end of file diff --git a/spark-load/spark-load-dist/pom.xml b/spark-load/spark-load-dist/pom.xml new file mode 100644 index 00000000..01dcad98 --- /dev/null +++ b/spark-load/spark-load-dist/pom.xml @@ -0,0 +1,103 @@ + + + + 4.0.0 + + org.apache.doris + spark-load + ${revision} + + + pom + + spark-load-dist + + + 8 + 8 + UTF-8 + + + + + org.apache.doris + spark-load-core + ${project.version} + + + org.apache.doris + spark-load-dpp + ${project.version} + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.0.2 + + false + false + + + org.apache.doris + spark-load-dpp + ${project.version} + ${project.build.directory}/app + + + + + + + copy + + package + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + bin + + single + + package + + ${project.parent.artifactId}-${project.version} + + src/main/assembly/assembly.xml + + ${project.parent.build.directory} + + + + + + + + \ No newline at end of file diff --git a/spark-load/spark-load-dist/src/main/assembly/assembly.xml b/spark-load/spark-load-dist/src/main/assembly/assembly.xml new file mode 100644 index 00000000..71b9a3ae --- /dev/null +++ b/spark-load/spark-load-dist/src/main/assembly/assembly.xml @@ -0,0 +1,72 @@ + + + + bin + + tar.gz + + true + ${project.parent.artifactId}-${project.version}-bin + + + + false + runtime + true + lib + + org.apache.doris:spark-load-dpp + + + + + + + ./src/main/bin + bin + + spark-load.sh + + unix + 0755 + + + ${project.build.directory}/lib + lib + 0755 + + + ${project.build.directory}/app + app + 0755 + + + ${project.build.directory}/../src/main/resources + conf + unix + 0755 + + *.yml + *.properties + logback*.xml + + + + \ No newline at end of file diff --git a/spark-load/spark-load-dist/src/main/bin/spark-load.sh b/spark-load/spark-load-dist/src/main/bin/spark-load.sh new file mode 100644 index 00000000..9097dd24 --- /dev/null +++ b/spark-load/spark-load-dist/src/main/bin/spark-load.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +if [ -z ${SPARK_LOAD_HOME} ]; then + cur_dir=$(dirname "$0")/../ + SPARK_LOAD_HOME=$(readlink -f ${cur_dir}) +fi + +export SPARK_LOAD_HOME + +if [[ -z "${JAVA_HOME}" ]]; then + if ! command -v java &>/dev/null; then + JAVA="" + else + JAVA="$(command -v java)" + fi +else + JAVA="${JAVA_HOME}/bin/java" +fi + +if [[ ! -x "${JAVA}" ]]; then + echo "The JAVA_HOME environment variable is not set correctly" + echo "This environment variable is required to run this program" + echo "Note: JAVA_HOME should point to a JDK and not a JRE" + echo "You can set JAVA_HOME in the fe.conf configuration file" + exit 1 +fi + +SPARK_LOAD_CORE_JAR= +for f in "${SPARK_LOAD_HOME}/lib"/*.jar; do + if [[ $(basename "${f}") == "spark-load-core"*".jar" ]]; then + SPARK_LOAD_CORE_JAR="${f}" + continue + fi + CLASSPATH="${f}:${CLASSPATH}" +done +CLASSPATH="${SPARK_LOAD_CORE_JAR}:${CLASSPATH}" +export CLASSPATH="${SPARK_LOAD_CORE_JAR}/conf:${CLASSPATH}:${SPARK_LOAD_CORE_JAR}/lib" + +${JAVA} org.apache.doris.SparkLoadRunner "$@" \ No newline at end of file diff --git a/spark-load/spark-load-dpp/pom.xml b/spark-load/spark-load-dpp/pom.xml new file mode 100644 index 00000000..81254e04 --- /dev/null +++ b/spark-load/spark-load-dpp/pom.xml @@ -0,0 +1,340 @@ + + + + 4.0.0 + + org.apache.doris + ${revision} + spark-load + + spark-load-dpp + jar + + 1 + -Xmx512m + + + + org.apache.doris + spark-load-common + ${project.version} + + + + commons-codec + commons-codec + + + + org.apache.commons + commons-lang3 + + + + + + org.apache.spark + spark-core_${scala.major.version} + + + + io.netty + netty-all + + + + + org.apache.spark + spark-sql_${scala.major.version} + + + org.apache.hadoop + hadoop-common + + + org.apache.parquet + parquet-column + + + org.apache.parquet + parquet-hadoop + + + org.apache.parquet + parquet-common + + + commons-collections + commons-collections + + + org.scala-lang + scala-library + + + com.esotericsoftware + kryo-shaded + + + org.apache.spark + spark-catalyst_${scala.major.version} + + + com.google.guava + guava + + + org.junit.jupiter + junit-jupiter-engine + test + + + + + + + + + + org.junit.jupiter + junit-jupiter-params + test + + + org.jmockit + jmockit + test + + + + org.apache.logging.log4j + log4j-core + + + + org.apache.logging.log4j + log4j-api + + + + org.apache.logging.log4j + log4j-slf4j-impl + + + + org.slf4j + slf4j-api + + + org.apache.hadoop + hadoop-aws + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + servlet-api + javax.servlet + + + + com.amazonaws + aws-java-sdk-s3 + + + com.amazonaws + aws-java-sdk-bundle + + + + + com.amazonaws + aws-java-sdk-s3 + + + com.amazonaws + aws-java-sdk-glue + + + com.amazonaws + aws-java-sdk-dynamodb + + + com.google.code.gson + gson + + + + spark-load-dpp-${project.version} + + + + org.apache.maven.plugins + maven-surefire-plugin + + set larger, eg, 3, to reduce the time or running FE unit tests<--> + ${fe_ut_parallel} + not reuse forked jvm, so that each unit test will run in separate jvm. 
to avoid singleton confict<--> + false + + -javaagent:${settings.localRepository}/org/jmockit/jmockit/${jmockit.version}/jmockit-${jmockit.version}.jar @{argLine} + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + false + false + true + runtime + ${skip.plugin} + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + + org.apache.doris.load.loadv2.etl.SparkEtlJob + + + + jar-with-dependencies + + + + + make-assembly + + package + + + single + + + + + + org.codehaus.mojo + cobertura-maven-plugin + 2.7 + + + 1024m + + + + + + org.apache.maven.plugins + maven-clean-plugin + 3.1.0 + + + auto-clean + initialize + + clean + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + + com.google.code.findbugs:* + org.slf4j:* + + + + + org.roaringbitmap + org.apache.doris.shaded.org.roaringbitmap + com.google.guava + org.apache.doris.shaded.com.google.guava + + + + + + package + + shade + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + true + + + + org.codehaus.mojo + flatten-maven-plugin + + true + resolveCiFriendliesOnly + + + + flatten + process-resources + + flatten + + + + flatten.clean + clean + + clean + + + + + + + diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/common/SparkDppException.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/common/SparkDppException.java new file mode 100644 index 00000000..66547461 --- /dev/null +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/common/SparkDppException.java @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.common; + +import com.google.common.base.Strings; + +// Exception for Spark DPP process +public class SparkDppException extends Exception { + public SparkDppException(String msg, Throwable cause) { + super(Strings.nullToEmpty(msg), cause); + } + + public SparkDppException(Throwable cause) { + super(cause); + } + + public SparkDppException(String msg, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(Strings.nullToEmpty(msg), cause, enableSuppression, writableStackTrace); + } + + public SparkDppException(String msg) { + super(Strings.nullToEmpty(msg)); + } +} diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java new file mode 100644 index 00000000..d639b31f --- /dev/null +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java @@ -0,0 +1,296 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.common.SparkDppException; +import org.apache.doris.config.EtlJobConfig; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; + + +// Parser to validate value for different type +public abstract class ColumnParser implements Serializable { + + // thread safe formatter + public static final DateTimeFormatter DATE_FORMATTER = new DateTimeFormatterBuilder() + .appendPattern("uuuu-MM-dd") + .toFormatter(); + public static final DateTimeFormatter DATE_TIME_FORMATTER = new DateTimeFormatterBuilder() + .appendPattern("uuuu-MM-dd HH:mm:ss") + .toFormatter(); + protected static final Logger LOG = LoggerFactory.getLogger(ColumnParser.class); + + public static ColumnParser create(EtlJobConfig.EtlColumn etlColumn) throws SparkDppException { + String columnType = etlColumn.columnType; + if (columnType.equalsIgnoreCase("TINYINT")) { + return new TinyIntParser(); + } else if (columnType.equalsIgnoreCase("SMALLINT")) { + return new SmallIntParser(); + } else if (columnType.equalsIgnoreCase("INT")) { + return new IntParser(); + } else if (columnType.equalsIgnoreCase("BIGINT")) { + return new BigIntParser(); + } else if (columnType.equalsIgnoreCase("FLOAT")) { + return new FloatParser(); + } else if (columnType.equalsIgnoreCase("DOUBLE")) { + return new DoubleParser(); + } else if (columnType.equalsIgnoreCase("BOOLEAN")) { + return new BooleanParser(); + } else if (columnType.equalsIgnoreCase("DATE") + || columnType.equalsIgnoreCase("DATEV2")) { + return new DateParser(); + } else if (columnType.equalsIgnoreCase("DATETIME") + || columnType.equalsIgnoreCase("DATETIMEV2")) { + return new DatetimeParser(); + } else if (columnType.equalsIgnoreCase("STRING") + || columnType.equalsIgnoreCase("TEXT")) { + return new StringTypeParser(etlColumn); + } else if (columnType.equalsIgnoreCase("VARCHAR") + || columnType.equalsIgnoreCase("CHAR") + || columnType.equalsIgnoreCase("BITMAP") + || columnType.equalsIgnoreCase("HLL")) { + return new StringParser(etlColumn); + } else if (columnType.equalsIgnoreCase("DECIMALV2") + || columnType.equalsIgnoreCase("DECIMAL32") + || columnType.equalsIgnoreCase("DECIMAL64") + || columnType.equalsIgnoreCase("DECIMAL128")) { + return new DecimalParser(etlColumn); + } else if (columnType.equalsIgnoreCase("LARGEINT")) { + return new LargeIntParser(); + } else { + throw new SparkDppException("unsupported type:" + columnType); + } + } + + public abstract boolean parse(String value); +} + +class TinyIntParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + Byte.parseByte(value); + } catch 
(NumberFormatException e) { + return false; + } + return true; + } +} + +class SmallIntParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + Short.parseShort(value); + } catch (NumberFormatException e) { + return false; + } + return true; + } +} + +class IntParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + Integer.parseInt(value); + } catch (NumberFormatException e) { + return false; + } + return true; + } +} + +class BigIntParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + Long.parseLong(value); + } catch (NumberFormatException e) { + return false; + } + return true; + } +} + +class FloatParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + Float ret = Float.parseFloat(value); + return !ret.isNaN() && !ret.isInfinite(); + } catch (NumberFormatException e) { + return false; + } + } +} + +class DoubleParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + Double ret = Double.parseDouble(value); + return !ret.isInfinite() && !ret.isNaN(); + } catch (NumberFormatException e) { + return false; + } + } +} + +class BooleanParser extends ColumnParser { + @Override + public boolean parse(String value) { + if (value.equalsIgnoreCase("true") + || value.equalsIgnoreCase("false") + || value.equals("0") || value.equals("1")) { + return true; + } + return false; + } +} + +class DateParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + DATE_FORMATTER.parse(value); + } catch (Exception e) { + return false; + } + return true; + } +} + +class DatetimeParser extends ColumnParser { + @Override + public boolean parse(String value) { + try { + DATE_TIME_FORMATTER.parse(value); + } catch (Exception e) { + return false; + } + return true; + } +} + +class StringParser extends ColumnParser { + + private EtlJobConfig.EtlColumn etlColumn; + + public StringParser(EtlJobConfig.EtlColumn etlColumn) { + this.etlColumn = etlColumn; + } + + @Override + public boolean parse(String value) { + try { + return value.getBytes("UTF-8").length <= etlColumn.stringLength; + } catch (Exception e) { + throw new RuntimeException("string check failed ", e); + } + } +} + +class StringTypeParser extends ColumnParser { + + private EtlJobConfig.EtlColumn etlColumn; + + public StringTypeParser(EtlJobConfig.EtlColumn etlColumn) { + this.etlColumn = etlColumn; + } + + @Override + public boolean parse(String value) { + try { + return value.getBytes("UTF-8").length <= DppUtils.STRING_LENGTH_LIMIT; + } catch (Exception e) { + throw new RuntimeException("string check failed ", e); + } + } +} + + +class DecimalParser extends ColumnParser { + + public static int PRECISION = 27; + public static int SCALE = 9; + + private BigDecimal maxValue; + private BigDecimal minValue; + + public DecimalParser(EtlJobConfig.EtlColumn etlColumn) { + StringBuilder precisionStr = new StringBuilder(); + for (int i = 0; i < etlColumn.precision - etlColumn.scale; i++) { + precisionStr.append("9"); + } + StringBuilder scaleStr = new StringBuilder(); + for (int i = 0; i < etlColumn.scale; i++) { + scaleStr.append("9"); + } + maxValue = new BigDecimal(precisionStr.toString() + "." + scaleStr.toString()); + minValue = new BigDecimal("-" + precisionStr.toString() + "." 
+ scaleStr.toString()); + } + + @Override + public boolean parse(String value) { + try { + BigDecimal bigDecimal = new BigDecimal(value); + return bigDecimal.precision() - bigDecimal.scale() <= PRECISION - SCALE && bigDecimal.scale() <= SCALE; + } catch (NumberFormatException e) { + return false; + } catch (Exception e) { + throw new RuntimeException("decimal parse failed ", e); + } + } + + public BigDecimal getMaxValue() { + return maxValue; + } + + public BigDecimal getMinValue() { + return minValue; + } +} + +class LargeIntParser extends ColumnParser { + + private BigInteger maxValue = new BigInteger("170141183460469231731687303715884105727"); + private BigInteger minValue = new BigInteger("-170141183460469231731687303715884105728"); + + @Override + public boolean parse(String value) { + try { + BigInteger inputValue = new BigInteger(value); + return inputValue.compareTo(maxValue) < 0 && inputValue.compareTo(minValue) > 0; + } catch (NumberFormatException e) { + return false; + } catch (ArithmeticException e) { + LOG.warn("int value is too big even for java BigInteger,value={}" + value); + return false; + } catch (Exception e) { + throw new RuntimeException("large int parse failed:" + value, e); + } + } +} diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java new file mode 100644 index 00000000..c873f5af --- /dev/null +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.common.io.BitmapValue; +import org.apache.doris.common.io.Roaring64Map; + +import com.esotericsoftware.kryo.Kryo; +import org.apache.spark.serializer.KryoRegistrator; + +/** + * register etl classes with Kryo when using Kryo serialization. + */ +public class DorisKryoRegistrator implements KryoRegistrator { + + @Override + public void registerClasses(Kryo kryo) { + kryo.register(Roaring64Map.class); + kryo.register(BitmapValue.class); + } +} diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java new file mode 100644 index 00000000..9fd413db --- /dev/null +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitioner.java @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.config.EtlJobConfig; + +import org.apache.spark.Partitioner; + +import java.io.Serializable; +import java.util.List; + +public class DorisRangePartitioner extends Partitioner { + private static final String UNPARTITIONED_TYPE = "UNPARTITIONED"; + private EtlJobConfig.EtlPartitionInfo partitionInfo; + private List partitionRangeKeys; + List partitionKeyIndexes; + + public DorisRangePartitioner(EtlJobConfig.EtlPartitionInfo partitionInfo, + List partitionKeyIndexes, + List partitionRangeKeys) { + this.partitionInfo = partitionInfo; + this.partitionKeyIndexes = partitionKeyIndexes; + this.partitionRangeKeys = partitionRangeKeys; + } + + public int numPartitions() { + if (partitionInfo == null) { + return 0; + } + if (partitionInfo.partitionType.equalsIgnoreCase(UNPARTITIONED_TYPE)) { + return 1; + } + return partitionInfo.partitions.size(); + } + + public int getPartition(Object var1) { + if (partitionInfo.partitionType != null + && partitionInfo.partitionType.equalsIgnoreCase(UNPARTITIONED_TYPE)) { + return 0; + } + DppColumns key = (DppColumns) var1; + // get the partition columns from key as partition key + DppColumns partitionKey = new DppColumns(key, partitionKeyIndexes); + // TODO: optimize this by use binary search + for (int i = 0; i < partitionRangeKeys.size(); ++i) { + if (partitionRangeKeys.get(i).isRowContained(partitionKey)) { + return i; + } + } + return -1; + } + + public static class PartitionRangeKey implements Serializable { + public boolean isMaxPartition; + public DppColumns startKeys; + public DppColumns endKeys; + + public boolean isRowContained(DppColumns row) { + if (isMaxPartition) { + return startKeys.compareTo(row) <= 0; + } else { + return startKeys.compareTo(row) <= 0 && endKeys.compareTo(row) > 0; + } + } + + public String toString() { + return "PartitionRangeKey{" + + "isMaxPartition=" + isMaxPartition + + ", startKeys=" + startKeys + + ", endKeys=" + endKeys + + '}'; + } + } +} diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppColumns.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppColumns.java new file mode 100644 index 00000000..5b5e3f5d --- /dev/null +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppColumns.java @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import com.google.common.base.Preconditions; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.Objects; + +// DppColumns is used to store the +class DppColumns implements Comparable, Serializable { + public List columns = new ArrayList(); + + public DppColumns(List keys) { + this.columns = keys; + } + + public DppColumns(DppColumns key, List indexes) { + for (int i = 0; i < indexes.size(); ++i) { + columns.add(key.columns.get(indexes.get(i))); + } + } + + @Override + public int compareTo(DppColumns other) { + Preconditions.checkState(columns.size() == other.columns.size()); + + int cmp = 0; + for (int i = 0; i < columns.size(); i++) { + Object columnObj = columns.get(i); + Object otherColumn = other.columns.get(i); + if (columnObj == null && otherColumn == null) { + return 0; + } else if (columnObj == null || otherColumn == null) { + if (columnObj == null) { + return -1; + } else { + return 1; + } + } + if (columns.get(i) instanceof Integer) { + cmp = ((Integer) (columns.get(i))).compareTo((Integer) (other.columns.get(i))); + } else if (columns.get(i) instanceof Long) { + cmp = ((Long) (columns.get(i))).compareTo((Long) (other.columns.get(i))); + } else if (columns.get(i) instanceof Boolean) { + cmp = ((Boolean) (columns.get(i))).compareTo((Boolean) (other.columns.get(i))); + } else if (columns.get(i) instanceof Short) { + cmp = ((Short) (columns.get(i))).compareTo((Short) (other.columns.get(i))); + } else if (columns.get(i) instanceof Float) { + cmp = ((Float) (columns.get(i))).compareTo((Float) (other.columns.get(i))); + } else if (columns.get(i) instanceof Double) { + cmp = ((Double) (columns.get(i))).compareTo((Double) (other.columns.get(i))); + } else if (columns.get(i) instanceof Date) { + cmp = ((Date) (columns.get(i))).compareTo((Date) (other.columns.get(i))); + } else if (columns.get(i) instanceof java.sql.Timestamp) { + cmp = ((java.sql.Timestamp) columns.get(i)).compareTo((java.sql.Timestamp) other.columns.get(i)); + } else { + cmp = ((String) (columns.get(i))).compareTo((String) (other.columns.get(i))); + } + if (cmp != 0) { + return cmp; + } + } + return cmp; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + DppColumns dppColumns = (DppColumns) o; + return Objects.equals(columns, dppColumns.columns); + } + + @Override + public int hashCode() { + return Objects.hash(columns); + } + + @Override + public String toString() { + return "dppColumns{" + + "columns=" + columns + + '}'; + } +} diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java new file mode 100644 index 00000000..bf190408 --- /dev/null +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java @@ -0,0 +1,299 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor 
license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.common.SparkDppException; +import org.apache.doris.config.EtlJobConfig; + +import com.google.common.collect.Lists; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.DecimalType; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.List; +import java.util.Set; +import java.util.zip.CRC32; + +public class DppUtils { + public static final String BUCKET_ID = "__bucketId__"; + + public static final int STRING_LENGTH_LIMIT = 1048576; + + public static Class getClassFromDataType(DataType dataType) { + if (dataType == null) { + return null; + } + if (dataType.equals(DataTypes.BooleanType)) { + return Boolean.class; + } else if (dataType.equals(DataTypes.ShortType)) { + return Short.class; + } else if (dataType.equals(DataTypes.IntegerType)) { + return Integer.class; + } else if (dataType.equals(DataTypes.LongType)) { + return Long.class; + } else if (dataType.equals(DataTypes.FloatType)) { + return Float.class; + } else if (dataType.equals(DataTypes.DoubleType)) { + return Double.class; + } else if (dataType.equals(DataTypes.DateType)) { + return Date.class; + } else if (dataType.equals(DataTypes.StringType)) { + return String.class; + } else if (dataType instanceof DecimalType) { + DecimalType decimalType = (DecimalType) dataType; + return BigDecimal.valueOf(decimalType.precision(), decimalType.scale()).getClass(); + } else if (dataType.equals(DataTypes.TimestampType)) { + return Long.class; + } + return null; + } + + public static Class getClassFromColumn(EtlJobConfig.EtlColumn column) throws SparkDppException { + switch (column.columnType) { + case "BOOLEAN": + return Boolean.class; + case "TINYINT": + case "SMALLINT": + return Short.class; + case "INT": + return Integer.class; + case "DATETIME": + case "DATETIMEV2": + return java.sql.Timestamp.class; + case "BIGINT": + return Long.class; + case "LARGEINT": + throw new SparkDppException("LARGEINT is not supported now"); + case "FLOAT": + return Float.class; + case "DOUBLE": + return Double.class; + case "DATE": + case "DATEV2": + return Date.class; + case "HLL": + case "CHAR": + case "VARCHAR": + case "STRING": + case "TEXT": + case "BITMAP": + case "OBJECT": + return String.class; + case "DECIMALV2": + case "DECIMAL32": + case "DECIMAL64": + case "DECIMAL128": + return BigDecimal.valueOf(column.precision, column.scale).getClass(); + default: + return String.class; + } + } + + public static DataType 
getDataTypeFromColumn(EtlJobConfig.EtlColumn column, boolean regardDistinctColumnAsBinary) { + DataType dataType = DataTypes.StringType; + switch (column.columnType) { + case "BOOLEAN": + dataType = DataTypes.StringType; + break; + case "TINYINT": + dataType = DataTypes.ByteType; + break; + case "SMALLINT": + dataType = DataTypes.ShortType; + break; + case "INT": + dataType = DataTypes.IntegerType; + break; + case "DATETIME": + case "DATETIMEV2": + dataType = DataTypes.TimestampType; + break; + case "BIGINT": + dataType = DataTypes.LongType; + break; + case "LARGEINT": + dataType = DataTypes.StringType; + break; + case "FLOAT": + dataType = DataTypes.FloatType; + break; + case "DOUBLE": + dataType = DataTypes.DoubleType; + break; + case "DATE": + case "DATEV2": + dataType = DataTypes.DateType; + break; + case "CHAR": + case "VARCHAR": + case "STRING": + case "TEXT": + case "OBJECT": + dataType = DataTypes.StringType; + break; + case "HLL": + case "BITMAP": + dataType = regardDistinctColumnAsBinary ? DataTypes.BinaryType : DataTypes.StringType; + break; + case "DECIMALV2": + case "DECIMAL32": + case "DECIMAL64": + case "DECIMAL128": + dataType = DecimalType.apply(column.precision, column.scale); + break; + default: + throw new RuntimeException("Reason: invalid column type:" + column); + } + return dataType; + } + + public static ByteBuffer getHashValue(Object o, DataType type) { + ByteBuffer buffer = ByteBuffer.allocate(8); + buffer.order(ByteOrder.LITTLE_ENDIAN); + if (o == null) { + buffer.putInt(0); + return buffer; + } + if (type.equals(DataTypes.ByteType)) { + buffer.put((byte) o); + } else if (type.equals(DataTypes.ShortType)) { + buffer.putShort((Short) o); + } else if (type.equals(DataTypes.IntegerType)) { + buffer.putInt((Integer) o); + } else if (type.equals(DataTypes.LongType)) { + buffer.putLong((Long) o); + } else if (type.equals(DataTypes.StringType)) { + try { + String str = String.valueOf(o); + buffer = ByteBuffer.wrap(str.getBytes("UTF-8")); + } catch (Exception e) { + throw new RuntimeException(e); + } + } else if (type.equals(DataTypes.BooleanType)) { + Boolean b = (Boolean) o; + byte value = (byte) (b ? 
1 : 0); + buffer.put(value); + } + // do not flip buffer when the buffer was created by wrap() + if (!type.equals(DataTypes.StringType)) { + buffer.flip(); + } + return buffer; + } + + public static long getHashValue(Row row, List distributeColumns, StructType dstTableSchema) { + CRC32 hashValue = new CRC32(); + for (String distColumn : distributeColumns) { + Object columnObject = row.get(row.fieldIndex(distColumn)); + ByteBuffer buffer = getHashValue(columnObject, dstTableSchema.apply(distColumn).dataType()); + hashValue.update(buffer.array(), 0, buffer.limit()); + } + return hashValue.getValue(); + } + + public static StructType replaceBinaryColsInSchema(Set binaryColumns, StructType dstSchema) { + List fields = new ArrayList<>(); + for (StructField originField : dstSchema.fields()) { + if (binaryColumns.contains(originField.name())) { + fields.add(DataTypes.createStructField(originField.name(), + DataTypes.BinaryType, originField.nullable())); + } else { + fields.add(DataTypes.createStructField(originField.name(), + originField.dataType(), originField.nullable())); + } + } + StructType ret = DataTypes.createStructType(fields); + return ret; + } + + public static StructType createDstTableSchema(List columns, + boolean addBucketIdColumn, boolean regardDistinctColumnAsBinary) { + List fields = new ArrayList<>(); + if (addBucketIdColumn) { + StructField bucketIdField = DataTypes.createStructField(BUCKET_ID, DataTypes.StringType, true); + fields.add(bucketIdField); + } + for (EtlJobConfig.EtlColumn column : columns) { + DataType structColumnType = getDataTypeFromColumn(column, regardDistinctColumnAsBinary); + StructField field = DataTypes.createStructField(column.columnName, structColumnType, column.isAllowNull); + fields.add(field); + } + StructType dstSchema = DataTypes.createStructType(fields); + return dstSchema; + } + + public static List parseColumnsFromPath(String filePath, List columnsFromPath) + throws SparkDppException { + if (columnsFromPath == null || columnsFromPath.isEmpty()) { + return Collections.emptyList(); + } + String[] strings = filePath.split("/"); + if (strings.length < 2) { + System.err.println("Fail to parse columnsFromPath, expected: " + columnsFromPath + + ", filePath: " + filePath); + throw new SparkDppException("Reason: Fail to parse columnsFromPath, expected: " + + columnsFromPath + ", filePath: " + filePath); + } + String[] columns = new String[columnsFromPath.size()]; + int size = 0; + for (int i = strings.length - 2; i >= 0; i--) { + String str = strings[i]; + if (str != null && str.isEmpty()) { + continue; + } + if (str == null || !str.contains("=")) { + System.err.println("Fail to parse columnsFromPath, expected: " + columnsFromPath + + ", filePath: " + filePath); + throw new SparkDppException("Reason: Fail to parse columnsFromPath, expected: " + + columnsFromPath + ", filePath: " + filePath); + } + String[] pair = str.split("=", 2); + if (pair.length != 2) { + System.err.println("Fail to parse columnsFromPath, expected: " + columnsFromPath + + ", filePath: " + filePath); + throw new SparkDppException("Reason: Fail to parse columnsFromPath, expected: " + + columnsFromPath + ", filePath: " + filePath); + } + int index = columnsFromPath.indexOf(pair[0]); + if (index == -1) { + continue; + } + columns[index] = pair[1]; + size++; + if (size >= columnsFromPath.size()) { + break; + } + } + if (size != columnsFromPath.size()) { + System.err.println("Fail to parse columnsFromPath, expected: " + columnsFromPath + + ", filePath: " + filePath); + throw new 
SparkDppException("Reason: Fail to parse columnsFromPath, expected: " + + columnsFromPath + ", filePath: " + filePath); + } + return Lists.newArrayList(columns); + } +} diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/GlobalDictBuilder.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/GlobalDictBuilder.java new file mode 100644 index 00000000..e19cfae8 --- /dev/null +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/GlobalDictBuilder.java @@ -0,0 +1,432 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.commons.collections.map.MultiValueMap; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.AnalysisException; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalog.Column; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.stream.Collectors; + +/** + * used to build the hive global dict and encode the source hive table + * + * input: a source hive table + * output: an intermediate hive table whose distinct columns are encoded with int values + * + * usage example + * step 1, create an intermediate hive table + * GlobalDictBuilder.createHiveIntermediateTable() + * step 2, get the distinct column's values + * GlobalDictBuilder.extractDistinctColumn() + * step 3, build the global dict + * GlobalDictBuilder.buildGlobalDict() + * step 4, encode the intermediate hive table with the global dict + * GlobalDictBuilder.encodeDorisIntermediateHiveTable() + */ + +public class GlobalDictBuilder { + + protected static final Logger LOG = LoggerFactory.getLogger(GlobalDictBuilder.class); + + // names of the columns in the doris table which need a global dict + // for example: some dict columns a,b,c + // case 1: the dict columns have no relation, then the map is as below + // [a=null, b=null, c=null] + // case 2: column a's values can reuse column b's values, which means column a's values are a subset of column b's values + // [b=a,c=null] + private MultiValueMap dictColumn; + // target doris table columns in current spark load job + private List dorisOlapTableColumnList; 
+ // distinct columns which need to use map join to solve data skew in encodeDorisIntermediateHiveTable() + // we don't need to specify it unless data skew happens + private List mapSideJoinColumns; + + // hive table data source, format is db.table + private String sourceHiveDBTableName; + // user-specified filter applied when querying sourceHiveDBTable + private String sourceHiveFilter; + // intermediate hive table to store the distinct values of the distinct columns + private String distinctKeyTableName; + // current doris table's global dict hive table + private String globalDictTableName; + + // read by the next step + private String dorisIntermediateHiveTable; + private SparkSession spark; + + // key = doris column name, value = column type + private Map dorisColumnNameTypeMap = new HashMap<>(); + + // columns in this list have their distinct values split and then encoded separately + // to avoid the performance bottleneck of mapping origin values to dict values + private List veryHighCardinalityColumn; + // determines how many splits the new distinct values are divided into + private int veryHighCardinalityColumnSplitNum; + + private ExecutorService pool; + + private StructType distinctValueSchema; + + public GlobalDictBuilder(MultiValueMap dictColumn, + List dorisOlapTableColumnList, + List mapSideJoinColumns, + String sourceHiveDBTableName, + String sourceHiveFilter, + String dorisHiveDB, + String distinctKeyTableName, + String globalDictTableName, + String dorisIntermediateHiveTable, + int buildConcurrency, + List veryHighCardinalityColumn, + int veryHighCardinalityColumnSplitNum, + SparkSession spark) { + this.dictColumn = dictColumn; + this.dorisOlapTableColumnList = dorisOlapTableColumnList; + this.mapSideJoinColumns = mapSideJoinColumns; + this.sourceHiveDBTableName = sourceHiveDBTableName; + this.sourceHiveFilter = sourceHiveFilter; + this.distinctKeyTableName = distinctKeyTableName; + this.globalDictTableName = globalDictTableName; + this.dorisIntermediateHiveTable = dorisIntermediateHiveTable; + this.spark = spark; + // guard against a zero-sized thread pool when buildConcurrency is not positive + this.pool = Executors.newFixedThreadPool(buildConcurrency <= 0 ? 
1 : buildConcurrency); + this.veryHighCardinalityColumn = veryHighCardinalityColumn; + this.veryHighCardinalityColumnSplitNum = veryHighCardinalityColumnSplitNum; + + spark.sql("use " + dorisHiveDB); + } + + public void createHiveIntermediateTable() throws AnalysisException { + Map sourceHiveTableColumn = spark.catalog() + .listColumns(sourceHiveDBTableName) + .collectAsList() + .stream().collect(Collectors.toMap(Column::name, Column::dataType)); + + Map sourceHiveTableColumnInLowercase = new HashMap<>(); + for (Map.Entry entry : sourceHiveTableColumn.entrySet()) { + sourceHiveTableColumnInLowercase.put(entry.getKey().toLowerCase(), entry.getValue().toLowerCase()); + } + + // check and get doris column type in hive + dorisOlapTableColumnList.stream().map(String::toLowerCase).forEach(columnName -> { + String columnType = sourceHiveTableColumnInLowercase.get(columnName); + if (StringUtils.isEmpty(columnType)) { + throw new RuntimeException(String.format("doris column %s not in source hive table", columnName)); + } + dorisColumnNameTypeMap.put(columnName, columnType); + }); + + spark.sql(String.format("drop table if exists %s ", dorisIntermediateHiveTable)); + // create IntermediateHiveTable + spark.sql(getCreateIntermediateHiveTableSql()); + + // insert data to IntermediateHiveTable + spark.sql(getInsertIntermediateHiveTableSql()); + } + + public void extractDistinctColumn() { + // create distinct tables + spark.sql(getCreateDistinctKeyTableSql()); + + // extract distinct column + List workerList = new ArrayList<>(); + // For the column in dictColumns's valueSet, their value is a subset of column in keyset, + // so we don't need to extract distinct value of column in valueSet + for (Object column : dictColumn.keySet()) { + workerList.add( + () -> spark.sql(getInsertDistinctKeyTableSql(column.toString(), dorisIntermediateHiveTable))); + } + + submitWorker(workerList); + } + + public void buildGlobalDict() throws ExecutionException, InterruptedException { + // create global dict hive table + spark.sql(getCreateGlobalDictHiveTableSql()); + + List globalDictBuildWorkers = new ArrayList<>(); + for (Object distinctColumnNameOrigin : dictColumn.keySet()) { + String distinctColumnNameTmp = distinctColumnNameOrigin.toString(); + globalDictBuildWorkers.add(() -> { + // get global dict max value + List maxGlobalDictValueRow + = spark.sql(getMaxGlobalDictValueSql(distinctColumnNameTmp)).collectAsList(); + if (maxGlobalDictValueRow.size() == 0) { + throw new RuntimeException(String.format("get max dict value failed: %s", distinctColumnNameTmp)); + } + + long maxDictValue = 0; + long minDictValue = 0; + Row row = maxGlobalDictValueRow.get(0); + if (row != null && row.get(0) != null) { + maxDictValue = (long) row.get(0); + minDictValue = (long) row.get(1); + } + LOG.info(" column " + distinctColumnNameTmp + " 's max value in dict is " + + maxDictValue + ", min value is " + minDictValue); + // maybe never happened, but we need detect it + if (minDictValue < 0) { + throw new RuntimeException(String.format(" column %s 's cardinality has exceed bigint's max value", + distinctColumnNameTmp)); + } + + if (veryHighCardinalityColumn.contains(distinctColumnNameTmp) + && veryHighCardinalityColumnSplitNum > 1) { + // split distinct key first and then encode with count + buildGlobalDictBySplit(maxDictValue, distinctColumnNameTmp); + } else { + // build global dict directly + spark.sql(getBuildGlobalDictSql(maxDictValue, distinctColumnNameTmp)); + } + + }); + } + submitWorker(globalDictBuildWorkers); + } + + // encode 
dorisIntermediateHiveTable's distinct column + public void encodeDorisIntermediateHiveTable() { + for (Object distinctColumnObj : dictColumn.keySet()) { + spark.sql(getEncodeDorisIntermediateHiveTableSql(distinctColumnObj.toString(), + (ArrayList) dictColumn.get(distinctColumnObj.toString()))); + } + } + + private String getCreateIntermediateHiveTableSql() { + StringBuilder sql = new StringBuilder(); + sql.append("create table if not exists ").append(dorisIntermediateHiveTable).append(" ( "); + + Set allDictColumn = new HashSet<>(); + allDictColumn.addAll(dictColumn.keySet()); + allDictColumn.addAll(dictColumn.values()); + dorisOlapTableColumnList.forEach(columnName -> { + sql.append(columnName).append(" "); + if (allDictColumn.contains(columnName)) { + sql.append(" string ,"); + } else { + sql.append(dorisColumnNameTypeMap.get(columnName)).append(" ,"); + } + }); + return sql.deleteCharAt(sql.length() - 1).append(" )").append(" stored as sequencefile ").toString(); + } + + private String getInsertIntermediateHiveTableSql() { + StringBuilder sql = new StringBuilder(); + sql.append("insert overwrite table ").append(dorisIntermediateHiveTable).append(" select "); + dorisOlapTableColumnList.forEach(columnName -> { + sql.append(columnName).append(" ,"); + }); + sql.deleteCharAt(sql.length() - 1) + .append(" from ").append(sourceHiveDBTableName); + if (!StringUtils.isEmpty(sourceHiveFilter)) { + sql.append(" where ").append(sourceHiveFilter); + } + return sql.toString(); + } + + private String getCreateDistinctKeyTableSql() { + return "create table if not exists " + distinctKeyTableName + + "(dict_key string) partitioned by (dict_column string) stored as sequencefile "; + } + + private String getInsertDistinctKeyTableSql(String distinctColumnName, String sourceHiveTable) { + StringBuilder sql = new StringBuilder(); + sql.append("insert overwrite table ").append(distinctKeyTableName) + .append(" partition(dict_column='").append(distinctColumnName).append("')") + .append(" select ").append(distinctColumnName) + .append(" from ").append(sourceHiveTable) + .append(" group by ").append(distinctColumnName); + return sql.toString(); + } + + private String getCreateGlobalDictHiveTableSql() { + return "create table if not exists " + globalDictTableName + + "(dict_key string, dict_value bigint) partitioned by(dict_column string) stored as sequencefile "; + } + + private String getMaxGlobalDictValueSql(String distinctColumnName) { + return "select max(dict_value) as max_value,min(dict_value) as min_value from " + + globalDictTableName + " where dict_column='" + distinctColumnName + "'"; + } + + private void buildGlobalDictBySplit(long maxGlobalDictValue, String distinctColumnName) { + // 1. get distinct value + Dataset newDistinctValue = spark.sql(getNewDistinctValue(distinctColumnName)); + + // 2. 
split the newDistinctValue to avoid window functions' single node bottleneck + Dataset[] splitedDistinctValue = newDistinctValue.randomSplit(getRandomSplitWeights()); + long currentMaxDictValue = maxGlobalDictValue; + Map distinctKeyMap = new HashMap<>(); + + for (int i = 0; i < splitedDistinctValue.length; i++) { + long currentDatasetStartDictValue = currentMaxDictValue; + long splitDistinctValueCount = splitedDistinctValue[i].count(); + currentMaxDictValue += splitDistinctValueCount; + String tmpDictTableName = String.format("%s_%s_tmp_dict_%s", i, + currentDatasetStartDictValue, distinctColumnName); + distinctKeyMap.put(tmpDictTableName, currentDatasetStartDictValue); + Dataset distinctValueFrame = spark.createDataFrame( + splitedDistinctValue[i].toJavaRDD(), getDistinctValueSchema()); + distinctValueFrame.createOrReplaceTempView(tmpDictTableName); + } + + spark.sql(getSplitBuildGlobalDictSql(distinctKeyMap, distinctColumnName)); + + } + + private String getSplitBuildGlobalDictSql(Map distinctKeyMap, String distinctColumnName) { + StringBuilder sql = new StringBuilder(); + sql.append("insert overwrite table ").append(globalDictTableName) + .append(" partition(dict_column='").append(distinctColumnName).append("') ") + .append(" select dict_key,dict_value from ").append(globalDictTableName) + .append(" where dict_column='").append(distinctColumnName).append("' "); + for (Map.Entry entry : distinctKeyMap.entrySet()) { + sql.append(" union all select dict_key, CAST((row_number() over(order by dict_key)) as BIGINT) ") + .append(String.format("+ CAST(%s as BIGINT) as dict_value from %s", + entry.getValue(), entry.getKey())); + } + return sql.toString(); + } + + private StructType getDistinctValueSchema() { + if (distinctValueSchema == null) { + List fieldList = new ArrayList<>(); + fieldList.add(DataTypes.createStructField("dict_key", DataTypes.StringType, false)); + distinctValueSchema = DataTypes.createStructType(fieldList); + } + return distinctValueSchema; + } + + private double[] getRandomSplitWeights() { + double[] weights = new double[veryHighCardinalityColumnSplitNum]; + double weight = 1 / Double.parseDouble(String.valueOf(veryHighCardinalityColumnSplitNum)); + Arrays.fill(weights, weight); + return weights; + } + + private String getBuildGlobalDictSql(long maxGlobalDictValue, String distinctColumnName) { + return "insert overwrite table " + globalDictTableName + " partition(dict_column='" + distinctColumnName + "') " + + " select dict_key,dict_value from " + globalDictTableName + + " where dict_column='" + distinctColumnName + "' " + + " union all select t1.dict_key as dict_key," + + "CAST((row_number() over(order by t1.dict_key)) as BIGINT) + " + + "CAST(" + maxGlobalDictValue + " as BIGINT) as dict_value from " + + "(select dict_key from " + distinctKeyTableName + + " where dict_column='" + distinctColumnName + "' and dict_key is not null)t1 left join " + + " (select dict_key,dict_value from " + globalDictTableName + + " where dict_column='" + distinctColumnName + "' )t2 " + + "on t1.dict_key = t2.dict_key where t2.dict_value is null"; + } + + private String getNewDistinctValue(String distinctColumnName) { + return "select t1.dict_key from " + + " (select dict_key from " + distinctKeyTableName + + " where dict_column='" + distinctColumnName + + "' and dict_key is not null)t1 left join " + + " (select dict_key,dict_value from " + globalDictTableName + + " where dict_column='" + distinctColumnName + "' )t2 " + + "on t1.dict_key = t2.dict_key where t2.dict_value is null"; + + } + + 
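// replaces each dict column's original value with its bigint dict_value by left joining the global dict table on dict_key, + // then overwrites dorisIntermediateHiveTable with the encoded result +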
private String getEncodeDorisIntermediateHiveTableSql(String dictColumn, List childColumn) { + StringBuilder sql = new StringBuilder(); + sql.append("insert overwrite table ").append(dorisIntermediateHiveTable).append(" select "); + // using map join to solve distinct column data skew + // here is a spark sql hint + if (mapSideJoinColumns.size() != 0 && mapSideJoinColumns.contains(dictColumn)) { + sql.append(" /*+ BROADCAST (t) */ "); + } + dorisOlapTableColumnList.forEach(columnName -> { + if (dictColumn.equals(columnName)) { + sql.append("t.dict_value").append(" ,"); + // means the dictColumn is reused + } else if (childColumn != null && childColumn.contains(columnName)) { + sql.append(String.format(" if(%s is null, null, t.dict_value) ", columnName)).append(" ,"); + } else { + sql.append(dorisIntermediateHiveTable).append(".").append(columnName).append(" ,"); + } + }); + sql.deleteCharAt(sql.length() - 1) + .append(" from ") + .append(dorisIntermediateHiveTable) + .append(" LEFT OUTER JOIN ( select dict_key,dict_value from ").append(globalDictTableName) + .append(" where dict_column='").append(dictColumn).append("' ) t on ") + .append(dorisIntermediateHiveTable).append(".").append(dictColumn) + .append(" = t.dict_key "); + return sql.toString(); + } + + private void submitWorker(List workerList) { + try { + List> futureList = new ArrayList<>(); + for (GlobalDictBuildWorker globalDictBuildWorker : workerList) { + futureList.add(pool.submit(new Callable() { + @Override + public Boolean call() throws Exception { + try { + globalDictBuildWorker.work(); + return true; + } catch (Exception e) { + LOG.error("BuildGlobalDict failed", e); + return false; + } + } + })); + } + + LOG.info("begin to fetch worker result"); + for (Future future : futureList) { + if (!future.get()) { + throw new RuntimeException("detect one worker failed"); + } + } + LOG.info("fetch worker result complete"); + } catch (Exception e) { + LOG.error("submit worker failed", e); + throw new RuntimeException("submit worker failed", e); + } + } + + private interface GlobalDictBuildWorker { + void work(); + } +} diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java new file mode 100644 index 00000000..ca89ab8d --- /dev/null +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilder.java @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.config.EtlJobConfig; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +// Build the RollupTree using a minimum coverage strategy: +// for each rollup index, choose as its parent the index with the fewest columns +// that still contains all of the rollup index's columns. +// Eg: +// There are five indexes: +// index1(c1, c2, c3, c4, c5) +// index2(c1, c2, c4) +// index3(c1, c2) +// index4(c3, c4) +// index5(c1, c2, c5) +// then the result tree is: +// index1 +// | \ \ +// index2 index4 index5 +// | +// index3 +// If more than one index meets the column coverage requirement and they +// have the same column count (eg: index2 vs index5), the child rollup is preferably +// built from the earlier index (eg: index3 is the child of index2). This can be +// further optimized based on the row number of the index. +public class MinimumCoverageRollupTreeBuilder implements RollupTreeBuilder { + public RollupTreeNode build(EtlJobConfig.EtlTable tableMeta) { + List indexes = tableMeta.indexes; + List indexMetas = new ArrayList<>(); + EtlJobConfig.EtlIndex baseIndex = null; + for (EtlJobConfig.EtlIndex indexMeta : indexes) { + if (indexMeta.isBaseIndex) { + baseIndex = indexMeta; + continue; + } + indexMetas.add(indexMeta); + } + List baseIndexColumns = baseIndex.columns; + List baseKeyColumns = new ArrayList<>(); + List baseValueColumns = new ArrayList<>(); + for (EtlJobConfig.EtlColumn columnMeta : baseIndexColumns) { + if (columnMeta.isKey) { + baseKeyColumns.add(columnMeta.columnName); + } else { + baseValueColumns.add(columnMeta.columnName); + } + } + RollupTreeNode root = new RollupTreeNode(); + root.parent = null; + root.keyColumnNames = baseKeyColumns; + root.valueColumnNames = baseValueColumns; + root.indexId = baseIndex.indexId; + root.indexMeta = baseIndex; + + // sort the index metas so that the column count decreases + Collections.sort(indexMetas, new EtlJobConfig.EtlIndexComparator().reversed()); + for (int i = 0; i < indexMetas.size(); ++i) { + List keyColumns = new ArrayList<>(); + List valueColumns = new ArrayList<>(); + for (EtlJobConfig.EtlColumn column : indexMetas.get(i).columns) { + if (column.isKey) { + keyColumns.add(column.columnName); + } else { + valueColumns.add(column.columnName); + } + } + if (!insertIndex(root, indexMetas.get(i), keyColumns, valueColumns)) { + throw new RuntimeException(String.format("can't find a parent rollup for rollup %s," + + " rollup tree is %s", indexMetas.get(i).toString(), root)); + } + } + return root; + } + + // DFS traversal to build the rollup tree; + // returns true if a parent rollup is found for the current rollup index + private boolean insertIndex(RollupTreeNode root, EtlJobConfig.EtlIndex indexMeta, + List keyColumns, + List valueColumns) { + // find a suitable parent rollup among the current node's children + if (root.children != null) { + for (int i = root.children.size() - 1; i >= 0; i--) { + if (insertIndex(root.children.get(i), indexMeta, keyColumns, valueColumns)) { + return true; + } + } + } + + // check whether the current node itself can be the parent rollup + if (root.keyColumnNames.containsAll(keyColumns) && root.valueColumnNames.containsAll(valueColumns)) { + if (root.children == null) { + root.children = new ArrayList<>(); + } + RollupTreeNode newChild = new RollupTreeNode(); + newChild.keyColumnNames = keyColumns; + newChild.valueColumnNames = valueColumns; + newChild.indexMeta = indexMeta; + newChild.indexId = indexMeta.indexId; + newChild.parent = root; + 
newChild.level = root.level + 1; + root.children.add(newChild); + return true; + } + + return false; + } +} diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java new file mode 100644 index 00000000..16ce92b8 --- /dev/null +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeBuilder.java @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.config.EtlJobConfig; + +// RollupTreeBuilder is used to build the RollupTree from the TableMeta +public abstract interface RollupTreeBuilder { + public RollupTreeNode build(EtlJobConfig.EtlTable tableMeta); +} diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java new file mode 100644 index 00000000..ec3129f3 --- /dev/null +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/RollupTreeNode.java @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.config.EtlJobConfig; + +import java.util.List; + +// Base and rollup indexes are managed as a RollupTree in order to +// produce the rollup index data from the best-fit index to get better performance. 
+// The calculation will be done through preorder traversal +public class RollupTreeNode { + public RollupTreeNode parent; + public List children; + public long indexId; + public List keyColumnNames; + public List valueColumnNames; + public int level; + public EtlJobConfig.EtlIndex indexMeta; + + public String toString() { + StringBuilder builder = new StringBuilder(); + for (int i = 0; i < level; ++i) { + builder.append("-"); + } + builder.append("indexid: " + indexId + "\n"); + if (children != null && !children.isEmpty()) { + for (int i = 0; i < level; ++i) { + builder.append("-"); + } + builder.append("children:\n"); + for (RollupTreeNode child : children) { + builder.append(child.toString()); + } + } + return builder.toString(); + } +} diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java new file mode 100644 index 00000000..6746e80e --- /dev/null +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java @@ -0,0 +1,1205 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.common.DppResult; +import org.apache.doris.common.SparkDppException; +import org.apache.doris.config.EtlJobConfig; +import org.apache.doris.util.JsonUtils; + +import com.google.common.base.Strings; +import com.google.common.collect.Maps; +import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.collections.IteratorUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.spark.Partitioner; +import org.apache.spark.TaskContext; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.PairFlatMapFunction; +import org.apache.spark.api.java.function.VoidFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.CatalystTypeConverters; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport; +import org.apache.spark.sql.functions; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.storage.StorageLevel; +import org.apache.spark.util.LongAccumulator; +import org.apache.spark.util.SerializableConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; +import scala.collection.JavaConverters; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.stream.Collectors; +// This class is a Spark-based data preprocessing program, +// which will make use of the distributed compute framework of spark to +// do ETL job/sort/preaggregate jobs in spark job +// to boost the process of large amount of data load. +// the process steps are as following: +// 1. load data +// 1.1 load data from path/hive table +// 1.2 do the etl process +// 2. repartition data by using doris data model(partition and bucket) +// 3. process aggregation if needed +// 4. 
write data to parquet file + +public final class SparkDpp implements java.io.Serializable { + private static final Logger LOG = LoggerFactory.getLogger(SparkDpp.class); + + private static final String NULL_FLAG = "\\N"; + private static final String DPP_RESULT_FILE = "dpp_result.json"; + private static final String BITMAP_TYPE = "bitmap"; + Map> tableToBitmapDictColumns = new HashMap<>(); + Map> tableToBinaryBitmapColumns = new HashMap<>(); + private SparkSession spark = null; + private EtlJobConfig etlJobConfig = null; + private LongAccumulator abnormalRowAcc = null; + private LongAccumulator scannedRowsAcc = null; + private LongAccumulator fileNumberAcc = null; + private LongAccumulator fileSizeAcc = null; + private Map bucketKeyMap = new HashMap<>(); + // accumulator to collect invalid rows + private StringAccumulator invalidRows = new StringAccumulator(); + // save the hadoop configuration from spark session. + // because hadoop configuration is not serializable, + // we need to wrap it so that we can use it in executor. + private SerializableConfiguration serializableHadoopConf; + private DppResult dppResult = new DppResult(); + + // just for ut + public SparkDpp() { + } + + public SparkDpp(SparkSession spark, EtlJobConfig etlJobConfig, Map> tableToBitmapDictColumns, + Map> tableToBinaryBitmapColumns) { + this.spark = spark; + this.etlJobConfig = etlJobConfig; + if (tableToBitmapDictColumns != null) { + this.tableToBitmapDictColumns = tableToBitmapDictColumns; + } + if (tableToBinaryBitmapColumns != null) { + this.tableToBinaryBitmapColumns = tableToBinaryBitmapColumns; + } + } + + public void init() { + abnormalRowAcc = spark.sparkContext().longAccumulator("abnormalRowAcc"); + scannedRowsAcc = spark.sparkContext().longAccumulator("scannedRowsAcc"); + fileNumberAcc = spark.sparkContext().longAccumulator("fileNumberAcc"); + fileSizeAcc = spark.sparkContext().longAccumulator("fileSizeAcc"); + spark.sparkContext().register(invalidRows, "InvalidRowsAccumulator"); + this.serializableHadoopConf = new SerializableConfiguration(spark.sparkContext().hadoopConfiguration()); + } + + private JavaPairRDD, Object[]> processRDDAggregate(JavaPairRDD, Object[]> currentPairRDD, + RollupTreeNode curNode, + SparkRDDAggregator[] sparkRDDAggregators) + throws SparkDppException { + final boolean isDuplicateTable = !StringUtils.equalsIgnoreCase(curNode.indexMeta.indexType, "AGGREGATE") + && !StringUtils.equalsIgnoreCase(curNode.indexMeta.indexType, "UNIQUE"); + // Aggregate/UNIQUE table + if (!isDuplicateTable) { + int idx = 0; + for (int i = 0; i < curNode.indexMeta.columns.size(); i++) { + if (!curNode.indexMeta.columns.get(i).isKey) { + sparkRDDAggregators[idx] = SparkRDDAggregator.buildAggregator(curNode.indexMeta.columns.get(i)); + idx++; + } + } + + if (curNode.indexMeta.isBaseIndex) { + return currentPairRDD.mapToPair( + new EncodeBaseAggregateTableFunction(sparkRDDAggregators)) + .reduceByKey(new AggregateReduceFunction(sparkRDDAggregators)); + } else { + return currentPairRDD + .mapToPair(new EncodeRollupAggregateTableFunction( + getColumnIndexInParentRollup(curNode.keyColumnNames, curNode.valueColumnNames, + curNode.parent.keyColumnNames, curNode.parent.valueColumnNames))) + .reduceByKey(new AggregateReduceFunction(sparkRDDAggregators)); + } + // Duplicate Table + } else { + int idx = 0; + for (int i = 0; i < curNode.indexMeta.columns.size(); i++) { + if (!curNode.indexMeta.columns.get(i).isKey) { + // duplicate table doesn't need aggregator + // init a aggregator here just for keeping interface 
compatibility when writing data to HDFS + sparkRDDAggregators[idx] = new DefaultSparkRDDAggregator(); + idx++; + } + } + if (curNode.indexMeta.isBaseIndex) { + return currentPairRDD; + } else { + return currentPairRDD.mapToPair(new EncodeRollupAggregateTableFunction( + getColumnIndexInParentRollup(curNode.keyColumnNames, curNode.valueColumnNames, + curNode.parent.keyColumnNames, curNode.parent.valueColumnNames))); + } + } + } + + // write data to parquet file by using writing the parquet scheme of spark. + private void writeRepartitionAndSortedRDDToParquet(JavaPairRDD, Object[]> resultRDD, + String pathPattern, long tableId, + EtlJobConfig.EtlIndex indexMeta, + SparkRDDAggregator[] sparkRDDAggregators) { + // TODO(wb) should deal largeint as BigInteger instead of string when using biginteger as key, + // data type may affect sorting logic + StructType dstSchema = DppUtils.createDstTableSchema(indexMeta.columns, false, true); + + resultRDD.repartitionAndSortWithinPartitions(new BucketPartitioner(bucketKeyMap), new BucketComparator()) + .foreachPartition((VoidFunction, Object[]>>>) t -> { + // write the data to dst file + Configuration conf = new Configuration(serializableHadoopConf.value()); + FileSystem fs = FileSystem.get(new Path(etlJobConfig.outputPath).toUri(), conf); + String lastBucketKey = null; + ParquetWriter parquetWriter = null; + TaskContext taskContext = TaskContext.get(); + long taskAttemptId = taskContext.taskAttemptId(); + String dstPath = ""; + String tmpPath = ""; + + while (t.hasNext()) { + Tuple2, Object[]> pair = t.next(); + List keyColumns = pair._1(); + Object[] valueColumns = pair._2(); + if ((keyColumns.size() + valueColumns.length) <= 1) { + LOG.warn("invalid row:" + pair); + continue; + } + + String curBucketKey = keyColumns.get(0).toString(); + List columnObjects = new ArrayList<>(); + for (int i = 1; i < keyColumns.size(); ++i) { + columnObjects.add(keyColumns.get(i)); + } + for (int i = 0; i < valueColumns.length; ++i) { + columnObjects.add(sparkRDDAggregators[i].finalize(valueColumns[i])); + } + + // if the bucket key is new, it will belong to a new tablet + if (!curBucketKey.equals(lastBucketKey)) { + if (parquetWriter != null) { + parquetWriter.close(); + // rename tmpPath to path + try { + fs.rename(new Path(tmpPath), new Path(dstPath)); + } catch (IOException ioe) { + LOG.warn("rename from tmpPath" + tmpPath + " to dstPath:" + dstPath + + " failed. exception:" + ioe); + throw ioe; + } + } + // flush current writer and create a new writer + String[] bucketKey = curBucketKey.split("_"); + if (bucketKey.length != 2) { + LOG.warn("invalid bucket key:" + curBucketKey); + continue; + } + long partitionId = Long.parseLong(bucketKey[0]); + int bucketId = Integer.parseInt(bucketKey[1]); + dstPath = String.format(pathPattern, tableId, partitionId, indexMeta.indexId, bucketId, + indexMeta.schemaHash); + tmpPath = dstPath + "." 
+ taskAttemptId; + conf.setBoolean("spark.sql.parquet.writeLegacyFormat", false); + conf.setBoolean("spark.sql.parquet.int64AsTimestampMillis", false); + conf.setBoolean("spark.sql.parquet.int96AsTimestamp", true); + conf.setBoolean("spark.sql.parquet.binaryAsString", false); + conf.setBoolean("spark.sql.parquet.fieldId.write.enabled", true); + conf.set("spark.sql.parquet.outputTimestampType", "INT96"); + ParquetWriteSupport.setSchema(dstSchema, conf); + ParquetWriteSupport parquetWriteSupport = new ParquetWriteSupport(); + parquetWriter = new ParquetWriter<>(new Path(tmpPath), parquetWriteSupport, + CompressionCodecName.SNAPPY, 256 * 1024 * 1024, 16 * 1024, 1024 * 1024, true, false, + WriterVersion.PARQUET_1_0, conf); + LOG.info("[HdfsOperate]>> initialize writer succeed! path:" + tmpPath); + lastBucketKey = curBucketKey; + } + Object[] array = columnObjects.toArray(); + Object[] catalystArr = new Object[array.length]; + for (int i = 0; i < array.length; i++) { + catalystArr[i] = CatalystTypeConverters.createToCatalystConverter(dstSchema.apply(i).dataType()).apply(array[i]); + } + InternalRow internalRow = InternalRow.apply( + JavaConverters.asScalaBufferConverter(Arrays.asList(catalystArr)).asScala() + .toSeq()); + parquetWriter.write(internalRow); + } + if (parquetWriter != null) { + parquetWriter.close(); + try { + fs.rename(new Path(tmpPath), new Path(dstPath)); + } catch (IOException ioe) { + LOG.warn("rename from tmpPath" + tmpPath + " to dstPath:" + dstPath + " failed. exception:" + + ioe); + throw ioe; + } + } + + }); + } + + // TODO(wb) one shuffle to calculate the rollup in the same level + private void processRollupTree(RollupTreeNode rootNode, + JavaPairRDD, Object[]> rootRDD, + long tableId, EtlJobConfig.EtlIndex baseIndex) throws SparkDppException { + Queue nodeQueue = new LinkedList<>(); + nodeQueue.offer(rootNode); + int currentLevel = 0; + // level travel the tree + Map, Object[]>> parentRDDMap = new HashMap<>(); + parentRDDMap.put(baseIndex.indexId, rootRDD); + Map, Object[]>> childrenRDDMap = new HashMap<>(); + String pathPattern = etlJobConfig.outputPath + "/" + etlJobConfig.outputFilePattern; + while (!nodeQueue.isEmpty()) { + RollupTreeNode curNode = nodeQueue.poll(); + LOG.info("start to process index:" + curNode.indexId); + if (curNode.children != null) { + for (RollupTreeNode child : curNode.children) { + nodeQueue.offer(child); + } + } + JavaPairRDD, Object[]> curRDD = null; + // column select for rollup + if (curNode.level != currentLevel) { + for (JavaPairRDD, Object[]> rdd : parentRDDMap.values()) { + rdd.unpersist(); + } + currentLevel = curNode.level; + parentRDDMap.clear(); + parentRDDMap = childrenRDDMap; + childrenRDDMap = new HashMap<>(); + } + + long parentIndexId = baseIndex.indexId; + if (curNode.parent != null) { + parentIndexId = curNode.parent.indexId; + } + + JavaPairRDD, Object[]> parentRDD = parentRDDMap.get(parentIndexId); + + // aggregate + SparkRDDAggregator[] sparkRDDAggregators = new SparkRDDAggregator[curNode.valueColumnNames.size()]; + curRDD = processRDDAggregate(parentRDD, curNode, sparkRDDAggregators); + + childrenRDDMap.put(curNode.indexId, curRDD); + + if (curNode.children != null && curNode.children.size() > 1) { + // if the children number larger than 1, persist the dataframe for performance + curRDD.persist(StorageLevel.MEMORY_AND_DISK()); + } + // repartition and write to hdfs + writeRepartitionAndSortedRDDToParquet(curRDD, pathPattern, tableId, curNode.indexMeta, sparkRDDAggregators); + } + } + + // get column index map from 
parent rollup to child rollup + // not consider bucketId here + private Pair getColumnIndexInParentRollup(List childRollupKeyColumns, + List childRollupValueColumns, + List parentRollupKeyColumns, + List parentRollupValueColumns) + throws SparkDppException { + List keyMap = new ArrayList<>(); + List valueMap = new ArrayList<>(); + // find column index in parent rollup schema + for (String childRollupKeyColumn : childRollupKeyColumns) { + for (int j = 0; j < parentRollupKeyColumns.size(); j++) { + if (StringUtils.equalsIgnoreCase(childRollupKeyColumn, parentRollupKeyColumns.get(j))) { + keyMap.add(j); + break; + } + } + } + + for (String childRollupValueColumn : childRollupValueColumns) { + for (int j = 0; j < parentRollupValueColumns.size(); j++) { + if (StringUtils.equalsIgnoreCase(childRollupValueColumn, parentRollupValueColumns.get(j))) { + valueMap.add(j); + break; + } + } + } + + if (keyMap.size() != childRollupKeyColumns.size() || valueMap.size() != childRollupValueColumns.size()) { + throw new SparkDppException(String.format("column map index from child to parent has error," + + " key size src: %s, dst: %s; value size src: %s, dst: %s", + childRollupKeyColumns.size(), keyMap.size(), childRollupValueColumns.size(), valueMap.size())); + } + + return Pair.of(keyMap.toArray(new Integer[0]), valueMap.toArray(new Integer[0])); + } + + /** + * check decimal,char/varchar + */ + public boolean validateData(Object srcValue, EtlJobConfig.EtlColumn etlColumn, ColumnParser columnParser, Row row) { + + switch (etlColumn.columnType.toUpperCase()) { + case "DECIMALV2": + case "DECIMAL32": + case "DECIMAL64": + case "DECIMAL128": + // TODO(wb): support decimal round; see be DecimalV2Value::round + DecimalParser decimalParser = (DecimalParser) columnParser; + BigDecimal srcBigDecimal = (BigDecimal) srcValue; + if (srcValue != null && (decimalParser.getMaxValue().compareTo(srcBigDecimal) < 0 + || decimalParser.getMinValue().compareTo(srcBigDecimal) > 0)) { + LOG.warn(String.format("decimal value is not valid for defination, column=%s," + + " value=%s,precision=%s,scale=%s", + etlColumn.columnName, srcValue, srcBigDecimal.precision(), srcBigDecimal.scale())); + return false; + } + break; + case "CHAR": + case "VARCHAR": + // TODO(wb) padding char type + int strSize = 0; + if (srcValue != null && (strSize = srcValue.toString().getBytes(StandardCharsets.UTF_8).length) + > etlColumn.stringLength) { + LOG.warn(String.format("the length of input is too long than schema." + + " column_name:%s,input_str[%s],schema length:%s,actual length:%s", + etlColumn.columnName, row.toString(), etlColumn.stringLength, strSize)); + return false; + } + break; + case "STRING": + case "TEXT": + // TODO(zjf) padding string type + int strDataSize = 0; + if (srcValue != null && (strDataSize = srcValue.toString().getBytes(StandardCharsets.UTF_8).length) + > DppUtils.STRING_LENGTH_LIMIT) { + LOG.warn(String.format("The string type is limited to a maximum of %s bytes." 
+ + " column_name:%s,input_str[%s],actual length:%s", + DppUtils.STRING_LENGTH_LIMIT, etlColumn.columnName, row.toString(), strDataSize)); + return false; + } + break; + default: + return true; + } + return true; + } + + /** + * 1 project column and reorder column + * 2 validate data + * 3 fill tuple with partition column + */ + private JavaPairRDD, Object[]> fillTupleWithPartitionColumn(Dataset dataframe, + EtlJobConfig.EtlPartitionInfo partitionInfo, + List partitionKeyIndex, + List partitionRangeKeys, + List keyAndPartitionColumnNames, + List valueColumnNames, + StructType dstTableSchema, + EtlJobConfig.EtlIndex baseIndex, + List validPartitionIds) + throws SparkDppException { + List distributeColumns = partitionInfo.distributionColumnRefs; + Partitioner partitioner = new DorisRangePartitioner(partitionInfo, partitionKeyIndex, partitionRangeKeys); + Set validPartitionIndex = new HashSet<>(); + if (validPartitionIds == null) { + for (int i = 0; i < partitionInfo.partitions.size(); ++i) { + validPartitionIndex.add(i); + } + } else { + for (int i = 0; i < partitionInfo.partitions.size(); ++i) { + if (validPartitionIds.contains(partitionInfo.partitions.get(i).partitionId)) { + validPartitionIndex.add(i); + } + } + } + + Map parsers = Maps.newHashMap(); + for (EtlJobConfig.EtlColumn column : baseIndex.columns) { + parsers.put(column.columnName, ColumnParser.create(column)); + } + + // use PairFlatMapFunction instead of PairMapFunction because the there will be + // 0 or 1 output row for 1 input row + JavaPairRDD, Object[]> resultPairRDD = dataframe.toJavaRDD().flatMapToPair( + (PairFlatMapFunction, Object[]>) row -> { + List, Object[]>> result = new ArrayList<>(); + List keyAndPartitionColumns = new ArrayList<>(); + List keyColumns = new ArrayList<>(); + List valueColumns = new ArrayList<>(valueColumnNames.size()); + for (String columnName : keyAndPartitionColumnNames) { + Object columnObject = row.get(row.fieldIndex(columnName)); + if (!validateData(columnObject, baseIndex.getColumn(columnName), + parsers.get(columnName), row)) { + LOG.info("invalid row: " + row); + abnormalRowAcc.add(1); + return IteratorUtils.emptyIterator(); + } + keyAndPartitionColumns.add(columnObject); + + if (baseIndex.getColumn(columnName).isKey) { + keyColumns.add(columnObject); + } + } + + for (String columnName : valueColumnNames) { + Object columnObject = row.get(row.fieldIndex(columnName)); + if (!validateData(columnObject, baseIndex.getColumn(columnName), + parsers.get(columnName), row)) { + abnormalRowAcc.add(1); + return IteratorUtils.emptyIterator(); + } + valueColumns.add(columnObject); + } + + DppColumns key = new DppColumns(keyAndPartitionColumns); + int pid = partitioner.getPartition(key); + if (!validPartitionIndex.contains(pid)) { + LOG.warn("invalid partition for row:" + row + ", pid:" + pid); + abnormalRowAcc.add(1); + LOG.info("abnormalRowAcc:" + abnormalRowAcc); + if (abnormalRowAcc.value() < 5) { + LOG.info("add row to invalidRows:" + row.toString()); + invalidRows.add(row.toString()); + LOG.info("invalid rows contents:" + invalidRows.value()); + } + } else { + // TODO(wb) support lagreint for hash + long hashValue = DppUtils.getHashValue(row, distributeColumns, dstTableSchema); + int bucketId = (int) ((hashValue & 0xffffffffL) % partitionInfo.partitions.get(pid).bucketNum); + long partitionId = partitionInfo.partitions.get(pid).partitionId; + // bucketKey is partitionId_bucketId + String bucketKey = partitionId + "_" + bucketId; + + List tuple = new ArrayList<>(); + tuple.add(bucketKey); + 
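// the bucket key is placed first in the composite key so that BucketPartitioner can route each row to its tablet; the real key columns follow +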
tuple.addAll(keyColumns); + result.add(new Tuple2<>(tuple, valueColumns.toArray())); + } + return result.iterator(); + }); + + // use bucket number as the parallel number + int reduceNum = 0; + for (EtlJobConfig.EtlPartition partition : partitionInfo.partitions) { + for (int i = 0; i < partition.bucketNum; i++) { + bucketKeyMap.put(partition.partitionId + "_" + i, reduceNum); + reduceNum++; + } + } + + // print to system.out for easy to find log info + System.out.println("print bucket key map:" + bucketKeyMap.toString()); + + return resultPairRDD; + } + + // do the etl process + private Dataset convertSrcDataframeToDstDataframe(EtlJobConfig.EtlIndex baseIndex, + Dataset srcDataframe, StructType dstTableSchema, + EtlJobConfig.EtlFileGroup fileGroup) + throws SparkDppException { + + Dataset dataframe = srcDataframe; + StructType srcSchema = dataframe.schema(); + Set srcColumnNames = new HashSet<>(); + for (StructField field : srcSchema.fields()) { + srcColumnNames.add(field.name()); + } + Map columnMappings = fileGroup.columnMappings; + // 1. process simple columns + Set mappingColumns = null; + if (columnMappings != null) { + mappingColumns = columnMappings.keySet(); + } + List dstColumnNames = new ArrayList<>(); + for (StructField dstField : dstTableSchema.fields()) { + dstColumnNames.add(dstField.name()); + EtlJobConfig.EtlColumn column = baseIndex.getColumn(dstField.name()); + if (!srcColumnNames.contains(dstField.name())) { + if (mappingColumns != null && mappingColumns.contains(dstField.name())) { + // mapping columns will be processed in next step + continue; + } + if (column.defaultValue != null) { + if (column.defaultValue.equals(NULL_FLAG)) { + dataframe = dataframe.withColumn(dstField.name(), functions.lit(null)); + } else { + dataframe = dataframe.withColumn(dstField.name(), functions.lit(column.defaultValue)); + } + } else if (column.isAllowNull) { + dataframe = dataframe.withColumn(dstField.name(), functions.lit(null)); + } else { + throw new SparkDppException("Reason: no data for column:" + dstField.name()); + } + } + if (column.columnType.equalsIgnoreCase("DATE") || column.columnType.equalsIgnoreCase("DATEV2")) { + dataframe = dataframe.withColumn(dstField.name(), + dataframe.col(dstField.name()).cast(DataTypes.DateType)); + } else if (column.columnType.equalsIgnoreCase("DATETIME") + || column.columnType.equalsIgnoreCase("DATETIMEV2")) { + dataframe = dataframe.withColumn(dstField.name(), + dataframe.col(dstField.name()).cast(DataTypes.TimestampType)); + } else if (column.columnType.equalsIgnoreCase("BOOLEAN")) { + dataframe = dataframe.withColumn(dstField.name(), + functions.when(functions.lower(dataframe.col(dstField.name())).equalTo("true"), "1") + .when(dataframe.col(dstField.name()).equalTo("1"), "1") + .otherwise("0")); + } else if (!column.columnType.equalsIgnoreCase(BITMAP_TYPE) + && !dstField.dataType().equals(DataTypes.StringType)) { + dataframe = dataframe.withColumn(dstField.name(), + dataframe.col(dstField.name()).cast(dstField.dataType())); + } else if (column.columnType.equalsIgnoreCase(BITMAP_TYPE) + && dstField.dataType().equals(DataTypes.BinaryType)) { + dataframe = dataframe.withColumn(dstField.name(), + dataframe.col(dstField.name()).cast(DataTypes.BinaryType)); + } + if (fileGroup.isNegative && !column.isKey) { + // negative load + // value will be convert te -1 * value + dataframe = dataframe.withColumn(dstField.name(), functions.expr("-1 *" + dstField.name())); + } + } + // 2. 
process the mapping columns + for (String mappingColumn : mappingColumns) { + String mappingDescription = columnMappings.get(mappingColumn).toDescription(); + if (mappingDescription.toLowerCase().contains("hll_hash")) { + continue; + } + // here should cast data type to dst column type + dataframe = dataframe.withColumn(mappingColumn, + functions.expr(mappingDescription).cast(dstTableSchema.apply(mappingColumn).dataType())); + } + return dataframe; + } + + private Dataset loadDataFromPath(SparkSession spark, + EtlJobConfig.EtlFileGroup fileGroup, + String fileUrl, + EtlJobConfig.EtlIndex baseIndex, + List columns) throws SparkDppException { + List columnValueFromPath = DppUtils.parseColumnsFromPath(fileUrl, fileGroup.columnsFromPath); + List dataSrcColumns = fileGroup.fileFieldNames; + if (dataSrcColumns == null) { + // if there is no source columns info + // use base index columns as source columns + dataSrcColumns = new ArrayList<>(); + for (EtlJobConfig.EtlColumn column : baseIndex.columns) { + dataSrcColumns.add(column.columnName); + } + } + // for getting schema to check source data + Map dstColumnNameToIndex = new HashMap(); + for (int i = 0; i < baseIndex.columns.size(); i++) { + dstColumnNameToIndex.put(baseIndex.columns.get(i).columnName, i); + } + List srcColumnsWithColumnsFromPath = new ArrayList<>(dataSrcColumns); + if (fileGroup.columnsFromPath != null) { + srcColumnsWithColumnsFromPath.addAll(fileGroup.columnsFromPath); + } + + if ("parquet".equalsIgnoreCase(fileGroup.fileFormat)) { + // parquet had its own schema, just use it; perhaps we could add some validation in future. + Dataset dataFrame = spark.read().parquet(fileUrl); + if (!CollectionUtils.isEmpty(columnValueFromPath)) { + for (int k = 0; k < columnValueFromPath.size(); k++) { + dataFrame = dataFrame.withColumn( + fileGroup.columnsFromPath.get(k), functions.lit(columnValueFromPath.get(k))); + } + } + if (!Strings.isNullOrEmpty(fileGroup.where)) { + dataFrame = dataFrame.where(fileGroup.where); + } + return dataFrame; + } + + if ("orc".equalsIgnoreCase(fileGroup.fileFormat)) { + Dataset dataFrame = spark.read().orc(fileUrl); + if (!CollectionUtils.isEmpty(columnValueFromPath)) { + for (int k = 0; k < columnValueFromPath.size(); k++) { + dataFrame = dataFrame.withColumn( + fileGroup.columnsFromPath.get(k), functions.lit(columnValueFromPath.get(k))); + } + } + if (!Strings.isNullOrEmpty(fileGroup.where)) { + dataFrame = dataFrame.where(fileGroup.where); + } + return dataFrame; + } + + StructType srcSchema = createScrSchema(srcColumnsWithColumnsFromPath); + JavaRDD sourceDataRdd = spark.read().textFile(fileUrl).toJavaRDD(); + int columnSize = dataSrcColumns.size(); + List parsers = new ArrayList<>(); + for (EtlJobConfig.EtlColumn column : baseIndex.columns) { + parsers.add(ColumnParser.create(column)); + } + char separator = (char) fileGroup.columnSeparator.getBytes(StandardCharsets.UTF_8)[0]; + JavaRDD rowRDD = sourceDataRdd.flatMap( + record -> { + scannedRowsAcc.add(1); + String[] attributes = splitLine(record, separator); + List result = new ArrayList<>(); + boolean validRow = true; + if (attributes.length != columnSize) { + LOG.warn("invalid src schema, data columns:" + + attributes.length + ", file group columns:" + + columnSize + ", row:" + record); + validRow = false; + } else { + for (int i = 0; i < attributes.length; ++i) { + StructField field = srcSchema.apply(i); + String srcColumnName = field.name(); + if (attributes[i].equals(NULL_FLAG) && dstColumnNameToIndex.containsKey(srcColumnName)) { + if 
(baseIndex.columns.get(dstColumnNameToIndex.get(srcColumnName)).isAllowNull) { + attributes[i] = null; + } else { + LOG.warn("column name:" + srcColumnName + ", attribute: " + i + + " can not be null. row:" + record); + validRow = false; + break; + } + } + boolean isStrictMode = etlJobConfig.properties.strictMode; + if (isStrictMode) { + if (dstColumnNameToIndex.containsKey(srcColumnName)) { + int index = dstColumnNameToIndex.get(srcColumnName); + String type = columns.get(index).columnType; + if (type.equalsIgnoreCase("CHAR") + || type.equalsIgnoreCase("VARCHAR") + || fileGroup.columnMappings.containsKey(field.name())) { + continue; + } + ColumnParser parser = parsers.get(index); + boolean valid = parser.parse(attributes[i]); + if (!valid) { + validRow = false; + LOG.warn("invalid row:" + record + + ", attribute " + i + ": " + attributes[i] + " parsed failed"); + break; + } + } + } + } + } + if (validRow) { + Row row = null; + if (fileGroup.columnsFromPath == null) { + row = RowFactory.create(attributes); + } else { + // process columns from path + // append columns from path to the tail + List columnAttributes = new ArrayList<>(); + columnAttributes.addAll(Arrays.asList(attributes)); + columnAttributes.addAll(columnValueFromPath); + row = RowFactory.create(columnAttributes.toArray()); + } + result.add(row); + } else { + abnormalRowAcc.add(1); + // at most add 5 rows to invalidRows + if (abnormalRowAcc.value() <= 5) { + invalidRows.add(record); + } + } + return result.iterator(); + } + ); + + Dataset dataframe = spark.createDataFrame(rowRDD, srcSchema); + if (!Strings.isNullOrEmpty(fileGroup.where)) { + dataframe = dataframe.where(fileGroup.where); + } + return dataframe; + } + + private StructType createScrSchema(List srcColumns) { + List fields = new ArrayList<>(); + for (String srcColumn : srcColumns) { + // user StringType to load source data + StructField field = DataTypes.createStructField(srcColumn, DataTypes.StringType, true); + fields.add(field); + } + return DataTypes.createStructType(fields); + } + + // This method is to keep the splitting consistent with broker load / mini load + private String[] splitLine(String line, char sep) { + if (line == null || line.isEmpty()) { + return new String[0]; + } + int index = 0; + int lastIndex = 0; + // line-begin char and line-end char are considered to be 'delimeter' + List values = new ArrayList<>(); + for (int i = 0; i < line.length(); i++, index++) { + if (line.charAt(index) == sep) { + values.add(line.substring(lastIndex, index)); + lastIndex = index + 1; + } + } + values.add(line.substring(lastIndex, index)); + return values.toArray(new String[0]); + } + + // partition keys will be parsed into double from json + // so need to convert it to partition columns' type + private Object convertPartitionKey(Object srcValue, Class dstClass, boolean isV2Type) throws SparkDppException { + if (dstClass.equals(Float.class) || dstClass.equals(Double.class)) { + return null; + } + if (srcValue instanceof Double) { + if (dstClass.equals(Short.class)) { + return ((Double) srcValue).shortValue(); + } else if (dstClass.equals(Integer.class)) { + return ((Double) srcValue).intValue(); + } else if (dstClass.equals(Long.class)) { + return ((Double) srcValue).longValue(); + } else if (dstClass.equals(BigInteger.class)) { + // TODO(wb) gson will cast origin value to double by default + // when the partition column is largeint, this will cause error data + // need fix it thoroughly + return new BigInteger(srcValue.toString()); + } else if 
(dstClass.equals(java.sql.Date.class) || dstClass.equals(java.util.Date.class)) { + double srcValueDouble = (double) srcValue; + return convertToJavaDate((int) srcValueDouble); + } else if (dstClass.equals(java.sql.Timestamp.class)) { + double srcValueDouble = (double) srcValue; + if (isV2Type) { + return convertV2ToJavaDatetime((long) srcValueDouble); + } + return convertToJavaDatetime((long) srcValueDouble); + } else { + // dst type is string + return srcValue.toString(); + } + } else { + LOG.warn("unsupport partition key:" + srcValue); + throw new SparkDppException("unsupport partition key:" + srcValue); + } + } + + private java.sql.Timestamp convertToJavaDatetime(long src) { + String dateTimeStr = Long.valueOf(src).toString(); + if (dateTimeStr.length() != 14) { + throw new RuntimeException("invalid input date format for SparkDpp, src: " + dateTimeStr); + } + + String year = dateTimeStr.substring(0, 4); + String month = dateTimeStr.substring(4, 6); + String day = dateTimeStr.substring(6, 8); + String hour = dateTimeStr.substring(8, 10); + String min = dateTimeStr.substring(10, 12); + String sec = dateTimeStr.substring(12, 14); + + return java.sql.Timestamp.valueOf(String.format("%s-%s-%s %s:%s:%s", year, month, day, hour, min, sec)); + } + + private java.sql.Timestamp convertV2ToJavaDatetime(long src) { + String dateTimeStr = Long.valueOf(src).toString(); + if (dateTimeStr.length() != 18) { + throw new RuntimeException("invalid input date format for SparkDpp, src: " + dateTimeStr); + } + + long year = (src >> 46); + long month = (src >> 42) & ((1L << 4) - 1); + long day = (src >> 37) & ((1L << 5) - 1); + long hour = (src >> 32) & ((1L << 5) - 1); + long min = (src >> 26) & ((1L << 6) - 1); + long sec = (src >> 20) & ((1L << 6) - 1); + long ms = src & ((1L << 20) - 1); + + return java.sql.Timestamp.valueOf( + String.format("%d-%02d-%02d %02d:%02d:%02d.%d", year, month, day, hour, min, sec, ms)); + } + + private java.sql.Date convertToJavaDate(int originDate) { + int day = originDate & 0x1f; + originDate >>= 5; + int month = originDate & 0x0f; + originDate >>= 4; + int year = originDate; + return java.sql.Date.valueOf(String.format("%04d-%02d-%02d", year, month, day)); + } + + private List createPartitionRangeKeys( + EtlJobConfig.EtlPartitionInfo partitionInfo, List> partitionKeySchema, + Map partitionKeyIndexToType) throws SparkDppException { + List partitionRangeKeys = new ArrayList<>(); + for (EtlJobConfig.EtlPartition partition : partitionInfo.partitions) { + DorisRangePartitioner.PartitionRangeKey partitionRangeKey = new DorisRangePartitioner.PartitionRangeKey(); + List startKeyColumns = new ArrayList<>(); + for (int i = 0; i < partition.startKeys.size(); i++) { + Object value = partition.startKeys.get(i); + boolean isV2Type = + partitionKeyIndexToType.get(i) != null && partitionKeyIndexToType.get(i).endsWith("V2"); + startKeyColumns.add(convertPartitionKey(value, partitionKeySchema.get(i), isV2Type)); + } + partitionRangeKey.startKeys = new DppColumns(startKeyColumns); + if (!partition.isMaxPartition) { + partitionRangeKey.isMaxPartition = false; + List endKeyColumns = new ArrayList<>(); + for (int i = 0; i < partition.endKeys.size(); i++) { + Object value = partition.endKeys.get(i); + boolean isV2Type = + partitionKeyIndexToType.get(i) != null && partitionKeyIndexToType.get(i).endsWith("V2"); + endKeyColumns.add(convertPartitionKey(value, partitionKeySchema.get(i), isV2Type)); + } + partitionRangeKey.endKeys = new DppColumns(endKeyColumns); + } else { + 
partitionRangeKey.isMaxPartition = true; + } + partitionRangeKeys.add(partitionRangeKey); + } + return partitionRangeKeys; + } + + private Dataset loadDataFromFilePaths(SparkSession spark, + EtlJobConfig.EtlIndex baseIndex, + List filePaths, + EtlJobConfig.EtlFileGroup fileGroup, + StructType dstTableSchema) + throws SparkDppException, IOException { + Dataset fileGroupDataframe = null; + for (String filePath : filePaths) { + try { + FileSystem fs = FileSystem.get(new Path(filePath).toUri(), serializableHadoopConf.value()); + FileStatus[] fileStatuses = fs.globStatus(new Path(filePath)); + if (fileStatuses == null) { + throw new SparkDppException("fs list status failed: " + filePath); + } + for (FileStatus fileStatus : fileStatuses) { + if (fileStatus.isDirectory()) { + continue; + } + fileNumberAcc.add(1); + fileSizeAcc.add(fileStatus.getLen()); + } + } catch (Exception e) { + LOG.warn("parse path failed:" + filePath); + throw e; + } + if (fileGroup.columnSeparator == null) { + LOG.warn("invalid null column separator!"); + throw new SparkDppException("Reason: invalid null column separator!"); + } + Dataset dataframe = null; + + dataframe = loadDataFromPath(spark, fileGroup, filePath, baseIndex, baseIndex.columns); + dataframe = convertSrcDataframeToDstDataframe(baseIndex, dataframe, dstTableSchema, fileGroup); + if (fileGroupDataframe == null) { + fileGroupDataframe = dataframe; + } else { + fileGroupDataframe.union(dataframe); + } + } + return fileGroupDataframe; + } + + private Dataset loadDataFromHiveTable(SparkSession spark, + String hiveDbTableName, + EtlJobConfig.EtlIndex baseIndex, + EtlJobConfig.EtlFileGroup fileGroup, + StructType dstTableSchema, + Set dictBitmapColumnSet, + Set binaryBitmapColumnsSet) throws SparkDppException { + // select base index columns from hive table + StringBuilder sql = new StringBuilder(); + sql.append("select "); + baseIndex.columns.forEach(column -> { + sql.append(column.columnName).append(","); + }); + sql.deleteCharAt(sql.length() - 1).append(" from ").append(hiveDbTableName); + if (!Strings.isNullOrEmpty(fileGroup.where)) { + sql.append(" where ").append(fileGroup.where); + } + + Dataset dataframe = spark.sql(sql.toString()); + dataframe.show(); + // Note(wb): in current spark load implementation, spark load can't be consistent with doris BE; + // The reason is as follows + // For stream load in doris BE, it runs as follow steps: + // step 1: type check + // step 2: expression calculation + // step 3: strict mode check + // step 4: nullable column check + // BE can do the four steps row by row + // but spark load relies on spark to do step2, so it can only do step 1 for whole dataset + // and then do step 2 for whole dataset and so on; + // So in spark load, we first do step 1,3,4,and then do step 2. 
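+ // checkDataFromHiveWithStrictMode below covers step 1 (type check), step 3 (strict mode check)
+ // and step 4 (nullable column check) over the whole dataset; convertSrcDataframeToDstDataframe
+ // then applies the column mapping expressions, i.e. step 2.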
+ dataframe = checkDataFromHiveWithStrictMode(dataframe, baseIndex, fileGroup.columnMappings.keySet(), + etlJobConfig.properties.strictMode, dstTableSchema, dictBitmapColumnSet, binaryBitmapColumnsSet); + dataframe = convertSrcDataframeToDstDataframe(baseIndex, dataframe, dstTableSchema, fileGroup); + return dataframe; + } + + private Dataset checkDataFromHiveWithStrictMode(Dataset dataframe, EtlJobConfig.EtlIndex baseIndex, + Set mappingColKeys, boolean isStrictMode, + StructType dstTableSchema, + Set dictBitmapColumnSet, + Set binaryBitmapColumnsSet) throws SparkDppException { + List columnNameNeedCheckArrayList = new ArrayList<>(); + List columnParserArrayList = new ArrayList<>(); + for (EtlJobConfig.EtlColumn column : baseIndex.columns) { + // note(wb): there are three data source for bitmap column + // case 1: global dict and binary data; needn't check + // case 2: bitmap hash function; this func is not supported in spark load now, so ignore it here + // case 3: origin value is a integer value; it should be checked use LongParser + if (StringUtils.equalsIgnoreCase(column.columnType, "bitmap")) { + if (dictBitmapColumnSet.contains(column.columnName.toLowerCase())) { + continue; + } + if (binaryBitmapColumnsSet.contains(column.columnName.toLowerCase())) { + continue; + } + columnNameNeedCheckArrayList.add(column); + columnParserArrayList.add(new BigIntParser()); + } else if (!StringUtils.equalsIgnoreCase(column.columnType, "varchar") + && !StringUtils.equalsIgnoreCase(column.columnType, "char") + && !mappingColKeys.contains(column.columnName)) { + columnNameNeedCheckArrayList.add(column); + columnParserArrayList.add(ColumnParser.create(column)); + } + } + + ColumnParser[] columnParserArray = columnParserArrayList.toArray(new ColumnParser[0]); + EtlJobConfig.EtlColumn[] columnNameArray = columnNameNeedCheckArrayList.toArray(new EtlJobConfig.EtlColumn[0]); + + StructType srcSchema = dataframe.schema(); + JavaRDD result = dataframe.toJavaRDD().flatMap(new FlatMapFunction() { + @Override + public Iterator call(Row row) throws Exception { + List result = new ArrayList<>(); + Set columnIndexNeedToRepalceNull = new HashSet(); + boolean validRow = true; + for (int i = 0; i < columnNameArray.length; i++) { + EtlJobConfig.EtlColumn column = columnNameArray[i]; + int fieldIndex = row.fieldIndex(column.columnName); + Object value = row.get(fieldIndex); + if (value == null && !column.isAllowNull) { + validRow = false; + LOG.warn("column:" + i + " can not be null. row:" + row.toString()); + break; + } + if (value != null && !columnParserArray[i].parse(value.toString())) { + if (isStrictMode) { + validRow = false; + LOG.warn(String.format("row parsed failed in strict mode, column name %s, src row %s", + column.columnName, row.toString())); + } else if (!column.isAllowNull) { + // a column parsed failed would be filled null, + // but if doris column is not allowed null, we should skip this row + validRow = false; + LOG.warn("column:" + i + " can not be null. 
row:" + row.toString()); + break; + } else { + columnIndexNeedToRepalceNull.add(fieldIndex); + } + } + } + if (!validRow) { + abnormalRowAcc.add(1); + // at most add 5 rows to invalidRows + if (abnormalRowAcc.value() <= 5) { + invalidRows.add(row.toString()); + } + } else if (!columnIndexNeedToRepalceNull.isEmpty()) { + scannedRowsAcc.add(1); + Object[] newRow = new Object[row.size()]; + for (int i = 0; i < row.size(); i++) { + if (columnIndexNeedToRepalceNull.contains(i)) { + newRow[i] = null; + } else { + newRow[i] = row.get(i); + } + } + result.add(RowFactory.create(newRow)); + } else { + scannedRowsAcc.add(1); + result.add(row); + } + return result.iterator(); + } + }); + + // here we just check data but not do cast, + // so data type should be same with src schema which is hive table schema + return spark.createDataFrame(result, srcSchema); + } + + private void process() throws Exception { + try { + for (Map.Entry entry : etlJobConfig.tables.entrySet()) { + Long tableId = entry.getKey(); + EtlJobConfig.EtlTable etlTable = entry.getValue(); + LOG.info("etlTable:" + etlTable); + Set dictBitmapColumnSet = tableToBitmapDictColumns.getOrDefault(tableId, new HashSet<>()); + Set binaryBitmapColumnSet = tableToBinaryBitmapColumns.getOrDefault(tableId, new HashSet<>()); + + // get the base index meta + EtlJobConfig.EtlIndex baseIndex = null; + for (EtlJobConfig.EtlIndex indexMeta : etlTable.indexes) { + if (indexMeta.isBaseIndex) { + baseIndex = indexMeta; + break; + } + } + + // get key and partition column names and value column names separately + List keyAndPartitionColumnNames = new ArrayList<>(); + List valueColumnNames = new ArrayList<>(); + for (EtlJobConfig.EtlColumn etlColumn : baseIndex.columns) { + if (etlColumn.isKey) { + keyAndPartitionColumnNames.add(etlColumn.columnName); + } else { + if (etlTable.partitionInfo.partitionColumnRefs.contains(etlColumn.columnName)) { + keyAndPartitionColumnNames.add(etlColumn.columnName); + } + valueColumnNames.add(etlColumn.columnName); + } + } + + EtlJobConfig.EtlPartitionInfo partitionInfo = etlTable.partitionInfo; + List partitionKeyIndex = new ArrayList(); + List> partitionKeySchema = new ArrayList<>(); + for (String key : partitionInfo.partitionColumnRefs) { + for (int i = 0; i < baseIndex.columns.size(); ++i) { + EtlJobConfig.EtlColumn column = baseIndex.columns.get(i); + if (column.columnName.equals(key)) { + partitionKeyIndex.add(keyAndPartitionColumnNames.indexOf(key)); + partitionKeySchema.add(DppUtils.getClassFromColumn(column)); + break; + } + } + } + Map columnToType = baseIndex.columns.stream().collect( + Collectors.toMap(etlColumn -> etlColumn.columnName, etlColumn -> etlColumn.columnType)); + Map partitionKeyIndexToType = new HashMap<>(); + for (int i = 0; i < partitionInfo.partitionColumnRefs.size(); i++) { + String partitionColumn = partitionInfo.partitionColumnRefs.get(i); + partitionKeyIndexToType.put(i, columnToType.get(partitionColumn)); + } + List partitionRangeKeys + = createPartitionRangeKeys(partitionInfo, partitionKeySchema, partitionKeyIndexToType); + StructType dstTableSchema = DppUtils.createDstTableSchema(baseIndex.columns, false, false); + dstTableSchema = DppUtils.replaceBinaryColsInSchema(binaryBitmapColumnSet, dstTableSchema); + RollupTreeBuilder rollupTreeParser = new MinimumCoverageRollupTreeBuilder(); + RollupTreeNode rootNode = rollupTreeParser.build(etlTable); + LOG.info("Start to process rollup tree:" + rootNode); + + JavaPairRDD, Object[]> tablePairRDD = null; + for (EtlJobConfig.EtlFileGroup fileGroup : 
etlTable.fileGroups) { + List filePaths = fileGroup.filePaths; + Dataset fileGroupDataframe = null; + EtlJobConfig.SourceType sourceType = fileGroup.sourceType; + if (sourceType == EtlJobConfig.SourceType.FILE) { + fileGroupDataframe = loadDataFromFilePaths( + spark, baseIndex, filePaths, fileGroup, dstTableSchema); + } else if (sourceType == EtlJobConfig.SourceType.HIVE) { + fileGroupDataframe = loadDataFromHiveTable(spark, fileGroup.dppHiveDbTableName, + baseIndex, fileGroup, dstTableSchema, dictBitmapColumnSet, binaryBitmapColumnSet); + } else { + throw new RuntimeException("Unknown source type: " + sourceType.name()); + } + if (fileGroupDataframe == null) { + LOG.info("no data for file file group:" + fileGroup); + continue; + } + + JavaPairRDD, Object[]> ret = fillTupleWithPartitionColumn( + fileGroupDataframe, + partitionInfo, partitionKeyIndex, + partitionRangeKeys, + keyAndPartitionColumnNames, valueColumnNames, + dstTableSchema, baseIndex, fileGroup.partitions); + if (tablePairRDD == null) { + tablePairRDD = ret; + } else { + tablePairRDD.union(ret); + } + } + processRollupTree(rootNode, tablePairRDD, tableId, baseIndex); + } + LOG.info("invalid rows contents:" + invalidRows.value()); + dppResult.isSuccess = true; + dppResult.failedReason = ""; + } catch (Exception exception) { + LOG.warn("spark dpp failed for exception:" + exception); + dppResult.isSuccess = false; + dppResult.failedReason = exception.getMessage(); + throw exception; + } finally { + spark.stop(); + dppResult.normalRows = scannedRowsAcc.value() - abnormalRowAcc.value(); + dppResult.scannedRows = scannedRowsAcc.value(); + dppResult.fileNumber = fileNumberAcc.value(); + dppResult.fileSize = fileSizeAcc.value(); + dppResult.abnormalRows = abnormalRowAcc.value(); + dppResult.partialAbnormalRows = invalidRows.value(); + } + } + + private void writeDppResult(DppResult dppResult) throws Exception { + String outputPath = etlJobConfig.getOutputPath(); + String resultFilePath = outputPath + "/" + DPP_RESULT_FILE; + FileSystem fs = FileSystem.get(new Path(outputPath).toUri(), serializableHadoopConf.value()); + Path filePath = new Path(resultFilePath); + FSDataOutputStream outputStream = fs.create(filePath); + outputStream.write(JsonUtils.writeValueAsBytes(dppResult)); + outputStream.write('\n'); + outputStream.close(); + } + + public void doDpp() throws Exception { + try { + process(); + } finally { + // write dpp result to file in outputPath + writeDppResult(dppResult); + } + } +} diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java new file mode 100644 index 00000000..e06dc2df --- /dev/null +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java @@ -0,0 +1,607 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.common.SparkDppException; +import org.apache.doris.common.io.BitmapValue; +import org.apache.doris.common.io.Hll; +import org.apache.doris.config.EtlJobConfig; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.spark.Partitioner; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.api.java.function.PairFunction; +import scala.Tuple2; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.Serializable; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Map; + +// contains all class about spark aggregate + +public abstract class SparkRDDAggregator implements Serializable { + + T init(Object value) { + return (T) value; + } + + abstract T update(T v1, T v2); + + Object finalize(Object value) { + return value; + } + + public static SparkRDDAggregator buildAggregator(EtlJobConfig.EtlColumn column) throws SparkDppException { + String aggType = StringUtils.lowerCase(column.aggregationType); + String columnType = StringUtils.lowerCase(column.columnType); + switch (aggType) { + case "bitmap_union": + return new BitmapUnionAggregator(); + case "hll_union": + return new HllUnionAggregator(); + case "max": + switch (columnType) { + case "tinyint": + case "smallint": + case "int": + case "bigint": + case "float": + case "double": + case "decimalv2": + case "decimal32": + case "decimal64": + case "decimal128": + case "date": + case "datetime": + case "datev2": + case "datetimev2": + return new NumberMaxAggregator(); + case "char": + case "varchar": + return new StringMaxAggregator(); + case "largeint": + return new LargeIntMaxAggregator(); + default: + throw new SparkDppException( + String.format("unsupported max aggregator for column type:%s", columnType)); + } + case "min": + switch (columnType) { + case "tinyint": + case "smallint": + case "int": + case "bigint": + case "float": + case "double": + case "decimalv2": + case "decimal32": + case "decimal64": + case "decimal128": + case "date": + case "datetime": + case "datev2": + case "datetimev2": + return new NumberMinAggregator(); + case "char": + case "varchar": + return new StringMinAggregator(); + case "largeint": + return new LargeIntMinAggregator(); + default: + throw new SparkDppException( + String.format("unsupported min aggregator for column type:%s", columnType)); + } + case "sum": + switch (columnType) { + case "tinyint": + return new ByteSumAggregator(); + case "smallint": + return new ShortSumAggregator(); + case "int": + return new IntSumAggregator(); + case "bigint": + return new LongSumAggregator(); + case "float": + return new FloatSumAggregator(); + case "double": + return new DoubleSumAggregator(); + case "largeint": + return new 
LargeIntSumAggregator(); + case "decimalv2": + case "decimal32": + case "decimal64": + case "decimal128": + return new BigDecimalSumAggregator(); + default: + throw new SparkDppException( + String.format("unsupported sum aggregator for column type:%s", columnType)); + } + case "replace_if_not_null": + return new ReplaceIfNotNullAggregator(); + case "replace": + return new ReplaceAggregator(); + default: + throw new SparkDppException(String.format("unsupported aggregate type %s", aggType)); + } + } + +} + +// just used for duplicate table, default logic is enough +class DefaultSparkRDDAggregator extends SparkRDDAggregator { + + @Override + Object update(Object v1, Object v2) { + return null; + } +} + +// just encode value column,used for base rollup +class EncodeBaseAggregateTableFunction implements PairFunction, Object[]>, List, Object[]> { + + private SparkRDDAggregator[] valueAggregators; + + public EncodeBaseAggregateTableFunction(SparkRDDAggregator[] valueAggregators) { + this.valueAggregators = valueAggregators; + } + + + @Override + public Tuple2, Object[]> call(Tuple2, Object[]> srcPair) throws Exception { + for (int i = 0; i < srcPair._2().length; i++) { + srcPair._2()[i] = valueAggregators[i].init(srcPair._2()[i]); + } + return srcPair; + } +} + +// just map column from parent rollup index to child rollup index,used for child rollup +class EncodeRollupAggregateTableFunction + implements PairFunction, Object[]>, List, Object[]> { + + Pair columnIndexInParentRollup; + + public EncodeRollupAggregateTableFunction(Pair columnIndexInParentRollup) { + this.columnIndexInParentRollup = columnIndexInParentRollup; + } + + @Override + public Tuple2, Object[]> call(Tuple2, Object[]> parentRollupKeyValuePair) + throws Exception { + Integer[] keyColumnIndexMap = columnIndexInParentRollup.getKey(); + Integer[] valueColumnIndexMap = columnIndexInParentRollup.getValue(); + + List keys = new ArrayList(); + Object[] values = new Object[valueColumnIndexMap.length]; + + // deal bucket_id column + keys.add(parentRollupKeyValuePair._1().get(0)); + for (int i = 0; i < keyColumnIndexMap.length; i++) { + keys.add(parentRollupKeyValuePair._1().get(keyColumnIndexMap[i] + 1)); + } + + for (int i = 0; i < valueColumnIndexMap.length; i++) { + values[i] = parentRollupKeyValuePair._2()[valueColumnIndexMap[i]]; + } + return new Tuple2<>(keys, values); + } +} + +class AggregateReduceFunction implements Function2 { + + private SparkRDDAggregator[] valueAggregators; + + public AggregateReduceFunction(SparkRDDAggregator[] sparkDppAggregators) { + this.valueAggregators = sparkDppAggregators; + } + + @Override + public Object[] call(Object[] v1, Object[] v2) throws Exception { + Object[] result = new Object[valueAggregators.length]; + for (int i = 0; i < v1.length; i++) { + result[i] = valueAggregators[i].update(v1[i], v2[i]); + } + return result; + } +} + +class ReplaceAggregator extends SparkRDDAggregator { + + @Override + Object update(Object dst, Object src) { + return src; + } +} + +class ReplaceIfNotNullAggregator extends SparkRDDAggregator { + + @Override + Object update(Object dst, Object src) { + return src == null ? 
dst : src; + } +} + +class BitmapUnionAggregator extends SparkRDDAggregator { + private static final Logger LOG = LogManager.getLogger(BitmapUnionAggregator.class); + + @Override + BitmapValue init(Object value) { + try { + BitmapValue bitmapValue = new BitmapValue(); + if (value instanceof byte[]) { + bitmapValue.deserialize(new DataInputStream(new ByteArrayInputStream((byte[]) value))); + } else if (value != null) { + bitmapValue.add(Long.parseLong(value.toString())); + } + return bitmapValue; + } catch (Exception e) { + throw new RuntimeException("build bitmap value failed", e); + } + } + + @Override + BitmapValue update(BitmapValue v1, BitmapValue v2) { + BitmapValue newBitmapValue = new BitmapValue(); + if (v1 != null) { + newBitmapValue.or(v1); + } + if (v2 != null) { + newBitmapValue.or(v2); + } + return newBitmapValue; + } + + @Override + byte[] finalize(Object value) { + try { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + DataOutputStream outputStream = new DataOutputStream(bos); + ((BitmapValue) value).serialize(outputStream); + return bos.toByteArray(); + } catch (IOException ioException) { + LOG.warn("", ioException); + throw new RuntimeException(ioException); + } + } + +} + +class HllUnionAggregator extends SparkRDDAggregator { + private static final Logger LOG = LogManager.getLogger(HllUnionAggregator.class); + + @Override + Hll init(Object value) { + try { + Hll hll = new Hll(); + if (value instanceof byte[]) { + hll.deserialize(new DataInputStream(new ByteArrayInputStream((byte[]) value))); + } else if (value != null) { + hll.updateWithHash(value); + } + return hll; + } catch (Exception e) { + throw new RuntimeException("build hll value failed", e); + } + } + + @Override + Hll update(Hll v1, Hll v2) { + Hll newHll = new Hll(); + if (v1 != null) { + newHll.merge(v1); + } + if (v2 != null) { + newHll.merge(v2); + } + return newHll; + } + + @Override + byte[] finalize(Object value) { + try { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + DataOutputStream outputStream = new DataOutputStream(bos); + ((Hll) value).serialize(outputStream); + return bos.toByteArray(); + } catch (IOException ioException) { + LOG.warn("", ioException); + throw new RuntimeException(ioException); + } + } + +} + +class LargeIntMaxAggregator extends SparkRDDAggregator { + + BigInteger init(Object value) { + if (value == null) { + return null; + } + return new BigInteger(value.toString()); + } + + @Override + BigInteger update(BigInteger dst, BigInteger src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst.compareTo(src) > 0 ? dst : src; + } + + @Override + String finalize(Object value) { + BigInteger bigInteger = (BigInteger) value; + return bigInteger.toString(); + } +} + +class LargeIntMinAggregator extends LargeIntMaxAggregator { + + @Override + BigInteger update(BigInteger dst, BigInteger src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst.compareTo(src) < 0 ? dst : src; + } +} + +class LargeIntSumAggregator extends LargeIntMaxAggregator { + + @Override + BigInteger update(BigInteger dst, BigInteger src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst.add(src); + } +} + + +class NumberMaxAggregator extends SparkRDDAggregator { + + @Override + Object update(Object dst, Object src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return ((Comparable) dst).compareTo(src) > 0 ? 
dst : src; + } +} + + +class NumberMinAggregator extends SparkRDDAggregator { + + @Override + Object update(Object dst, Object src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return ((Comparable) dst).compareTo(src) < 0 ? dst : src; + } +} + +class LongSumAggregator extends SparkRDDAggregator { + + @Override + Long update(Long dst, Long src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst + src; + } +} + +class ShortSumAggregator extends SparkRDDAggregator { + + @Override + Short update(Short dst, Short src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + int ret = dst + src; + // here may overflow, just keep the same logic with be + return (short) ret; + } +} + +class IntSumAggregator extends SparkRDDAggregator { + + @Override + Integer update(Integer dst, Integer src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + long ret = Long.sum(dst, src); + // here may overflow, just keep the same logic with be + return (int) ret; + } +} + +class ByteSumAggregator extends SparkRDDAggregator { + + @Override + Byte update(Byte dst, Byte src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + int ret = dst + src; + // here may overflow, just keep the same logic with be + return (byte) ret; + } +} + +class DoubleSumAggregator extends SparkRDDAggregator { + + @Override + strictfp Double update(Double dst, Double src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst + src; + } +} + +class FloatSumAggregator extends SparkRDDAggregator { + + @Override + strictfp Float update(Float dst, Float src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst + src; + } +} + +class StringMaxAggregator extends SparkRDDAggregator { + + @Override + String update(String dst, String src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst.compareTo(src) > 0 ? dst : src; + } +} + +class StringMinAggregator extends SparkRDDAggregator { + + @Override + String update(String dst, String src) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return dst.compareTo(src) < 0 ? dst : src; + } +} + +class BigDecimalSumAggregator extends SparkRDDAggregator { + + + @Override + BigDecimal update(BigDecimal src, BigDecimal dst) { + if (src == null) { + return dst; + } + if (dst == null) { + return src; + } + return src.add(dst); + } +} + + +class BucketComparator implements Comparator>, Serializable { + + @Override + public int compare(List keyArray1, List keyArray2) { + int cmp = 0; + + for (int i = 0; i < keyArray1.size(); i++) { + Object key1 = keyArray1.get(i); + Object key2 = keyArray2.get(i); + if (key1 == key2) { + continue; + } + if (key1 == null || key2 == null) { + return key1 == null ? 
-1 : 1; + } + if (key1 instanceof Comparable && key2 instanceof Comparable) { + cmp = ((Comparable) key1).compareTo(key2); + } else { + throw new RuntimeException(String.format("uncomparable column type %s", key1.getClass().toString())); + } + if (cmp != 0) { + return cmp; + } + } + + return cmp; + } +} + +class BucketPartitioner extends Partitioner { + + private Map bucketKeyMap; + + public BucketPartitioner(Map bucketKeyMap) { + this.bucketKeyMap = bucketKeyMap; + } + + @Override + public int numPartitions() { + return bucketKeyMap.size(); + } + + @Override + public int getPartition(Object key) { + List rddKey = (List) key; + return bucketKeyMap.get(String.valueOf(rddKey.get(0))); + } +} diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/StringAccumulator.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/StringAccumulator.java new file mode 100644 index 00000000..428a9d42 --- /dev/null +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/StringAccumulator.java @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.spark.util.AccumulatorV2; + +import java.util.ArrayList; +import java.util.List; + +// This class is a accumulator of string based on AccumulatorV2 +// (https://spark.apache.org/docs/latest/api/java/org/apache/spark/util/AccumulatorV2.html). +// Spark does not provide string accumulator. +// +// This class is used to collect the invalid rows when doing etl. +public class StringAccumulator extends AccumulatorV2 { + private List strs = new ArrayList<>(); + + @Override + public boolean isZero() { + return strs.isEmpty(); + } + + @Override + public AccumulatorV2 copy() { + StringAccumulator newAccumulator = new StringAccumulator(); + newAccumulator.strs.addAll(this.strs); + return newAccumulator; + } + + @Override + public void reset() { + strs.clear(); + } + + @Override + public void add(String v) { + strs.add(v); + } + + @Override + public void merge(AccumulatorV2 other) { + StringAccumulator o = (StringAccumulator) other; + strs.addAll(o.strs); + } + + @Override + public String value() { + return strs.toString(); + } +} diff --git a/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java new file mode 100644 index 00000000..03300014 --- /dev/null +++ b/spark-load/spark-load-dpp/src/main/java/org/apache/doris/load/loadv2/etl/SparkEtlJob.java @@ -0,0 +1,288 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.etl; + +import org.apache.doris.common.SparkDppException; +import org.apache.doris.config.EtlJobConfig; +import org.apache.doris.config.EtlJobConfig.EtlColumn; +import org.apache.doris.config.EtlJobConfig.EtlColumnMapping; +import org.apache.doris.config.EtlJobConfig.EtlFileGroup; +import org.apache.doris.config.EtlJobConfig.EtlIndex; +import org.apache.doris.config.EtlJobConfig.EtlTable; +import org.apache.doris.load.loadv2.dpp.GlobalDictBuilder; +import org.apache.doris.load.loadv2.dpp.SparkDpp; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; +import com.google.common.io.CharStreams; +import org.apache.commons.collections.map.MultiValueMap; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.SparkConf; +import org.apache.spark.deploy.SparkHadoopUtil; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.functions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.DataInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * SparkEtlJob is responsible for global dict building, data partition, data sort and data aggregation. + * 1. init job config + * 2. check if job has bitmap_dict function columns + * 3. build global dict if step 2 is true + * 4. 
dpp (data partition, data sort and data aggregation) + */ +public class SparkEtlJob { + private static final Logger LOG = LoggerFactory.getLogger(SparkEtlJob.class); + + private static final String BITMAP_DICT_FUNC = "bitmap_dict"; + private static final String TO_BITMAP_FUNC = "to_bitmap"; + private static final String BITMAP_HASH = "bitmap_hash"; + private static final String BINARY_BITMAP = "binary_bitmap"; + + private String jobConfigFilePath; + private EtlJobConfig etlJobConfig; + private Set hiveSourceTables; + private Map> tableToBitmapDictColumns; + private Map> tableToBinaryBitmapColumns; + private final SparkConf conf; + private SparkSession spark; + + private SparkEtlJob(String jobConfigFilePath) { + this.jobConfigFilePath = jobConfigFilePath; + this.etlJobConfig = null; + this.hiveSourceTables = Sets.newHashSet(); + this.tableToBitmapDictColumns = Maps.newHashMap(); + this.tableToBinaryBitmapColumns = Maps.newHashMap(); + conf = new SparkConf(); + } + + private void initSpark() { + //serialization conf + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.set("spark.kryo.registrator", "org.apache.doris.load.loadv2.dpp.DorisKryoRegistrator"); + conf.set("spark.kryo.registrationRequired", "false"); + spark = SparkSession.builder().enableHiveSupport().config(conf).getOrCreate(); + } + + private void initSparkConfigs(Map configs) { + if (configs == null) { + return; + } + for (Map.Entry entry : configs.entrySet()) { + conf.set(entry.getKey(), entry.getValue()); + conf.set("spark.hadoop." + entry.getKey(), entry.getValue()); + } + } + + private void initConfig() throws IOException { + if (LOG.isDebugEnabled()) { + LOG.debug("job config file path: " + jobConfigFilePath); + } + Configuration hadoopConf = SparkHadoopUtil.get().newConfiguration(this.conf); + String jsonConfig; + Path path = new Path(jobConfigFilePath); + try (FileSystem fs = path.getFileSystem(hadoopConf); DataInputStream in = fs.open(path)) { + jsonConfig = CharStreams.toString(new InputStreamReader(in)); + } + if (LOG.isDebugEnabled()) { + LOG.debug("rdd read json config: " + jsonConfig); + } + etlJobConfig = EtlJobConfig.configFromJson(jsonConfig); + if (LOG.isDebugEnabled()) { + LOG.debug("etl job config: " + etlJobConfig); + } + } + + /* + * 1. check bitmap column + * 2. fill tableToBitmapDictColumns + * 3. 
remove bitmap_dict and to_bitmap mapping from columnMappings + */ + private void checkConfig() throws Exception { + for (Map.Entry entry : etlJobConfig.tables.entrySet()) { + boolean isHiveSource = false; + Set bitmapDictColumns = Sets.newHashSet(); + Set binaryBitmapColumns = Sets.newHashSet(); + + for (EtlFileGroup fileGroup : entry.getValue().fileGroups) { + if (fileGroup.sourceType == EtlJobConfig.SourceType.HIVE) { + isHiveSource = true; + } + Map newColumnMappings = Maps.newHashMap(); + for (Map.Entry mappingEntry : fileGroup.columnMappings.entrySet()) { + String columnName = mappingEntry.getKey(); + String exprStr = mappingEntry.getValue().toDescription(); + String funcName = functions.expr(exprStr).expr().prettyName(); + if (funcName.equalsIgnoreCase(BITMAP_HASH)) { + throw new SparkDppException("spark load not support bitmap_hash now"); + } + if (funcName.equalsIgnoreCase(BINARY_BITMAP)) { + binaryBitmapColumns.add(columnName.toLowerCase()); + } else if (funcName.equalsIgnoreCase(BITMAP_DICT_FUNC)) { + bitmapDictColumns.add(columnName.toLowerCase()); + } else if (!funcName.equalsIgnoreCase(TO_BITMAP_FUNC)) { + newColumnMappings.put(mappingEntry.getKey(), mappingEntry.getValue()); + } + } + // reset new columnMappings + fileGroup.columnMappings = newColumnMappings; + } + if (isHiveSource) { + hiveSourceTables.add(entry.getKey()); + } + if (!bitmapDictColumns.isEmpty()) { + tableToBitmapDictColumns.put(entry.getKey(), bitmapDictColumns); + } + if (!binaryBitmapColumns.isEmpty()) { + tableToBinaryBitmapColumns.put(entry.getKey(), binaryBitmapColumns); + } + } + LOG.info("init hiveSourceTables: " + hiveSourceTables + + ",tableToBitmapDictColumns: " + tableToBitmapDictColumns); + + // spark etl must have only one table with bitmap type column to process. 
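+ // the check below enforces that constraint: at most one hive source table and at most one
+ // table with bitmap_dict or binary_bitmap columns per etl job.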
+ if (hiveSourceTables.size() > 1 + || tableToBitmapDictColumns.size() > 1 + || tableToBinaryBitmapColumns.size() > 1) { + throw new Exception("spark etl job must have only one hive table with bitmap type column to process"); + } + } + + private void processDpp() throws Exception { + SparkDpp sparkDpp = new SparkDpp(spark, etlJobConfig, tableToBitmapDictColumns, tableToBinaryBitmapColumns); + sparkDpp.init(); + sparkDpp.doDpp(); + } + + private String buildGlobalDictAndEncodeSourceTable(EtlTable table, long tableId) { + // dict column map + MultiValueMap dictColumnMap = new MultiValueMap(); + for (String dictColumn : tableToBitmapDictColumns.get(tableId)) { + dictColumnMap.put(dictColumn, null); + } + + // doris schema + List dorisOlapTableColumnList = Lists.newArrayList(); + for (EtlIndex etlIndex : table.indexes) { + if (etlIndex.isBaseIndex) { + for (EtlColumn column : etlIndex.columns) { + dorisOlapTableColumnList.add(column.columnName); + } + } + } + + // hive db and tables + EtlFileGroup fileGroup = table.fileGroups.get(0); + String sourceHiveDBTableName = fileGroup.hiveDbTableName; + String dorisHiveDB = sourceHiveDBTableName.split("\\.")[0]; + String taskId = etlJobConfig.outputPath.substring(etlJobConfig.outputPath.lastIndexOf("/") + 1); + String globalDictTableName = String.format(EtlJobConfig.GLOBAL_DICT_TABLE_NAME, tableId); + String distinctKeyTableName = String.format(EtlJobConfig.DISTINCT_KEY_TABLE_NAME, tableId, taskId); + String dorisIntermediateHiveTable = String.format( + EtlJobConfig.DORIS_INTERMEDIATE_HIVE_TABLE_NAME, tableId, taskId); + String sourceHiveFilter = fileGroup.where; + + // others + List mapSideJoinColumns = Lists.newArrayList(); + int buildConcurrency = 1; + List veryHighCardinalityColumn = Lists.newArrayList(); + int veryHighCardinalityColumnSplitNum = 1; + + LOG.info("global dict builder args, dictColumnMap: " + dictColumnMap + + ", dorisOlapTableColumnList: " + dorisOlapTableColumnList + + ", sourceHiveDBTableName: " + sourceHiveDBTableName + + ", sourceHiveFilter: " + sourceHiveFilter + + ", distinctKeyTableName: " + distinctKeyTableName + + ", globalDictTableName: " + globalDictTableName + + ", dorisIntermediateHiveTable: " + dorisIntermediateHiveTable); + try { + GlobalDictBuilder globalDictBuilder = new GlobalDictBuilder(dictColumnMap, dorisOlapTableColumnList, + mapSideJoinColumns, sourceHiveDBTableName, sourceHiveFilter, dorisHiveDB, distinctKeyTableName, + globalDictTableName, dorisIntermediateHiveTable, buildConcurrency, veryHighCardinalityColumn, + veryHighCardinalityColumnSplitNum, spark); + globalDictBuilder.createHiveIntermediateTable(); + globalDictBuilder.extractDistinctColumn(); + globalDictBuilder.buildGlobalDict(); + globalDictBuilder.encodeDorisIntermediateHiveTable(); + } catch (Exception e) { + throw new RuntimeException(e); + } + + return String.format("%s.%s", dorisHiveDB, dorisIntermediateHiveTable); + } + + private void processData() throws Exception { + if (!hiveSourceTables.isEmpty()) { + // only one table + long tableId = -1; + EtlTable table = null; + for (Map.Entry entry : etlJobConfig.tables.entrySet()) { + tableId = entry.getKey(); + table = entry.getValue(); + break; + } + + // init hive configs like metastore service + EtlFileGroup fileGroup = table.fileGroups.get(0); + initSparkConfigs(fileGroup.hiveTableProperties); + fileGroup.dppHiveDbTableName = fileGroup.hiveDbTableName; + + // build global dict and encode source hive table if has bitmap dict columns + if (!tableToBitmapDictColumns.isEmpty() && 
tableToBitmapDictColumns.containsKey(tableId)) { + String dorisIntermediateHiveDbTableName = buildGlobalDictAndEncodeSourceTable(table, tableId); + // set with dorisIntermediateHiveDbTable + fileGroup.dppHiveDbTableName = dorisIntermediateHiveDbTableName; + } + } + + initSpark(); + // data partition sort and aggregation + processDpp(); + } + + private void run() throws Exception { + initConfig(); + checkConfig(); + processData(); + } + + public static void main(String[] args) { + if (args.length < 1) { + System.err.println("missing job config file path arg"); + System.exit(-1); + } + + try { + new SparkEtlJob(args[0]).run(); + } catch (Exception e) { + System.err.println("spark etl job run failed"); + LOG.warn("", e); + System.exit(-1); + } + } +} diff --git a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java new file mode 100644 index 00000000..9c219a14 --- /dev/null +++ b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/ColumnParserTest.java @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.config.EtlJobConfig; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +public class ColumnParserTest { + + // TODO(wb) try to keep ut consistent with be's ut + @Test + public void testBoundCheck() { + // tinyint + TinyIntParser tinyIntParser = new TinyIntParser(); + // 1 normal + String tinyint = "100"; + Assertions.assertTrue(tinyIntParser.parse(tinyint)); + // 2 upper + String tinyintUpper = "128"; + Assertions.assertFalse(tinyIntParser.parse(tinyintUpper)); + // 3 lower + String tinyintLower = "-129"; + Assertions.assertFalse(tinyIntParser.parse(tinyintLower)); + + // smallint + SmallIntParser smallIntParser = new SmallIntParser(); + // 1 normal + String smallint = "100"; + Assertions.assertTrue(smallIntParser.parse(smallint)); + // 2 upper + String smallintUpper = "32768"; + Assertions.assertFalse(smallIntParser.parse(smallintUpper)); + // 3 lower + String smallintLower = "-32769"; + Assertions.assertFalse(smallIntParser.parse(smallintLower)); + + // int + IntParser intParser = new IntParser(); + // 1 normal + String intValue = "100"; + Assertions.assertTrue(intParser.parse(intValue)); + // 2 upper + String intUpper = "2147483648"; + Assertions.assertFalse(intParser.parse(intUpper)); + // 3 lower + String intLower = "-2147483649"; + Assertions.assertFalse(intParser.parse(intLower)); + + // bigint + BigIntParser bigIntParser = new BigIntParser(); + // 1 normal + String bigint = "100"; + Assertions.assertTrue(bigIntParser.parse(bigint)); + // 2 upper + String bigintUpper = "9223372036854775808"; + Assertions.assertFalse(bigIntParser.parse(bigintUpper)); + // 3 lower + String bigintLower = "-9223372036854775809"; + Assertions.assertFalse(bigIntParser.parse(bigintLower)); + + // largeint + LargeIntParser largeIntParser = new LargeIntParser(); + // 1 normal + String largeint = "100"; + Assertions.assertTrue(largeIntParser.parse(largeint)); + // 2 upper + String largeintUpper = "170141183460469231731687303715884105728"; + Assertions.assertFalse(largeIntParser.parse(largeintUpper)); + // 3 lower + String largeintLower = "-170141183460469231731687303715884105729"; + Assertions.assertFalse(largeIntParser.parse(largeintLower)); + + // float + FloatParser floatParser = new FloatParser(); + // normal + String floatValue = "1.1"; + Assertions.assertTrue(floatParser.parse(floatValue)); + // inf + String inf = "Infinity"; + Assertions.assertFalse(floatParser.parse(inf)); + // nan + String nan = "NaN"; + // failed + Assertions.assertFalse(floatParser.parse(nan)); + + // double + DoubleParser doubleParser = new DoubleParser(); + // normal + Assertions.assertTrue(doubleParser.parse(floatValue)); + // inf + Assertions.assertFalse(doubleParser.parse(inf)); + // nan + Assertions.assertFalse(doubleParser.parse(nan)); + + // decimal + EtlJobConfig.EtlColumn etlColumn = new EtlJobConfig.EtlColumn(); + etlColumn.precision = 5; + etlColumn.scale = 3; + DecimalParser decimalParser = new DecimalParser(etlColumn); + // normal + String decimalValue = "10.333"; + Assertions.assertTrue(decimalParser.parse(decimalValue)); + // overflow + String decimalOverflow = "1000.3333333333"; + Assertions.assertFalse(decimalParser.parse(decimalOverflow)); + + // string + EtlJobConfig.EtlColumn stringColumn = new EtlJobConfig.EtlColumn(); + stringColumn.stringLength = 3; + StringParser stringParser = new StringParser(stringColumn); + // normal + String stringnormal = "a"; + Assertions.assertTrue(stringParser.parse(stringnormal)); 
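+ // "中文" is 2 characters but 6 bytes in UTF-8, so it should exceed the 3-byte string length
+ // limit configured above (the length check presumably applies to the encoded byte length).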
+ // overflow + String stringoverflow = "中文"; + Assertions.assertFalse(stringParser.parse(stringoverflow)); + } + +} diff --git a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java new file mode 100644 index 00000000..28eba87f --- /dev/null +++ b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DorisRangePartitionerTest.java @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.config.EtlJobConfig; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + + +import java.util.ArrayList; +import java.util.List; + +public class DorisRangePartitionerTest { + + @Test + public void testRangePartitioner() { + List startKeys = new ArrayList<>(); + startKeys.add(new Integer(0)); + List endKeys = new ArrayList<>(); + endKeys.add(new Integer(100)); + EtlJobConfig.EtlPartition partition1 = new EtlJobConfig.EtlPartition( + 10000, startKeys, endKeys, false, 3); + + List startKeys2 = new ArrayList<>(); + startKeys2.add(new Integer(100)); + List endKeys2 = new ArrayList<>(); + endKeys2.add(new Integer(200)); + EtlJobConfig.EtlPartition partition2 = new EtlJobConfig.EtlPartition( + 10001, startKeys2, endKeys2, false, 4); + + List startKeys3 = new ArrayList<>(); + startKeys3.add(new Integer(200)); + List endKeys3 = new ArrayList<>(); + endKeys3.add(new Integer(300)); + EtlJobConfig.EtlPartition partition3 = new EtlJobConfig.EtlPartition( + 10002, startKeys3, endKeys3, false, 5); + + List partitions = new ArrayList<>(); + partitions.add(partition1); + partitions.add(partition2); + partitions.add(partition3); + + List partitionColumns = new ArrayList<>(); + partitionColumns.add("id"); + List bucketColumns = new ArrayList<>(); + bucketColumns.add("key"); + EtlJobConfig.EtlPartitionInfo partitionInfo = new EtlJobConfig.EtlPartitionInfo( + "RANGE", partitionColumns, bucketColumns, partitions); + List partitionRangeKeys = new ArrayList<>(); + for (EtlJobConfig.EtlPartition partition : partitions) { + DorisRangePartitioner.PartitionRangeKey partitionRangeKey = new DorisRangePartitioner.PartitionRangeKey(); + partitionRangeKey.isMaxPartition = false; + partitionRangeKey.startKeys = new DppColumns(partition.startKeys); + partitionRangeKey.endKeys = new DppColumns(partition.endKeys); + partitionRangeKeys.add(partitionRangeKey); + } + List partitionKeyIndexes = new ArrayList<>(); + partitionKeyIndexes.add(0); + DorisRangePartitioner rangePartitioner = new DorisRangePartitioner(partitionInfo, partitionKeyIndexes, partitionRangeKeys); + int num = rangePartitioner.numPartitions(); + 
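+ // three range partitions were registered above, so the partitioner should report 3 partitions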
Assertions.assertEquals(3, num); + + List fields1 = new ArrayList<>(); + fields1.add(-100); + fields1.add("name"); + DppColumns record1 = new DppColumns(fields1); + int id1 = rangePartitioner.getPartition(record1); + Assertions.assertEquals(-1, id1); + + List fields2 = new ArrayList<>(); + fields2.add(10); + fields2.add("name"); + DppColumns record2 = new DppColumns(fields2); + int id2 = rangePartitioner.getPartition(record2); + Assertions.assertEquals(0, id2); + + List fields3 = new ArrayList<>(); + fields3.add(110); + fields3.add("name"); + DppColumns record3 = new DppColumns(fields3); + int id3 = rangePartitioner.getPartition(record3); + Assertions.assertEquals(1, id3); + + List fields4 = new ArrayList<>(); + fields4.add(210); + fields4.add("name"); + DppColumns record4 = new DppColumns(fields4); + int id4 = rangePartitioner.getPartition(record4); + Assertions.assertEquals(2, id4); + + List fields5 = new ArrayList<>(); + fields5.add(310); + fields5.add("name"); + DppColumns record5 = new DppColumns(fields5); + int id5 = rangePartitioner.getPartition(record5); + Assertions.assertEquals(-1, id5); + } + + @Test + public void testUnpartitionedPartitioner() { + List bucketColumns = new ArrayList<>(); + bucketColumns.add("key"); + EtlJobConfig.EtlPartitionInfo partitionInfo = new EtlJobConfig.EtlPartitionInfo( + "UNPARTITIONED", null, bucketColumns, null); + List partitionSchema = new ArrayList<>(); + partitionSchema.add(Integer.class); + List partitionKeyIndexes = new ArrayList<>(); + partitionKeyIndexes.add(0); + DorisRangePartitioner rangePartitioner = new DorisRangePartitioner(partitionInfo, partitionKeyIndexes, null); + int num = rangePartitioner.numPartitions(); + Assertions.assertEquals(1, num); + + List fields = new ArrayList<>(); + fields.add(100); + fields.add("name"); + DppColumns record = new DppColumns(fields); + int id = rangePartitioner.getPartition(record); + Assertions.assertEquals(0, id); + } +} diff --git a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java new file mode 100644 index 00000000..4b47e14d --- /dev/null +++ b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/DppUtilsTest.java @@ -0,0 +1,239 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.config.EtlJobConfig; + +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructType; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; + +public class DppUtilsTest { + + @Test + public void testGetClassFromDataType() { + DppUtils dppUtils = new DppUtils(); + + Class stringResult = dppUtils.getClassFromDataType(DataTypes.StringType); + Assertions.assertEquals(String.class, stringResult); + + Class booleanResult = dppUtils.getClassFromDataType(DataTypes.BooleanType); + Assertions.assertEquals(Boolean.class, booleanResult); + + Class shortResult = dppUtils.getClassFromDataType(DataTypes.ShortType); + Assertions.assertEquals(Short.class, shortResult); + + Class integerResult = dppUtils.getClassFromDataType(DataTypes.IntegerType); + Assertions.assertEquals(Integer.class, integerResult); + + Class longResult = dppUtils.getClassFromDataType(DataTypes.LongType); + Assertions.assertEquals(Long.class, longResult); + + Class floatResult = dppUtils.getClassFromDataType(DataTypes.FloatType); + Assertions.assertEquals(Float.class, floatResult); + + Class doubleResult = dppUtils.getClassFromDataType(DataTypes.DoubleType); + Assertions.assertEquals(Double.class, doubleResult); + + Class dateResult = dppUtils.getClassFromDataType(DataTypes.DateType); + Assertions.assertEquals(Date.class, dateResult); + } + + @Test + public void testGetClassFromColumn() { + DppUtils dppUtils = new DppUtils(); + + try { + EtlJobConfig.EtlColumn column = new EtlJobConfig.EtlColumn(); + column.columnType = "CHAR"; + Class charResult = dppUtils.getClassFromColumn(column); + Assertions.assertEquals(String.class, charResult); + + column.columnType = "HLL"; + Class hllResult = dppUtils.getClassFromColumn(column); + Assertions.assertEquals(String.class, hllResult); + + column.columnType = "OBJECT"; + Class objectResult = dppUtils.getClassFromColumn(column); + Assertions.assertEquals(String.class, objectResult); + + column.columnType = "BOOLEAN"; + Class booleanResult = dppUtils.getClassFromColumn(column); + Assertions.assertEquals(Boolean.class, booleanResult); + + column.columnType = "TINYINT"; + Class tinyResult = dppUtils.getClassFromColumn(column); + Assertions.assertEquals(Short.class, tinyResult); + + column.columnType = "SMALLINT"; + Class smallResult = dppUtils.getClassFromColumn(column); + Assertions.assertEquals(Short.class, smallResult); + + column.columnType = "INT"; + Class integerResult = dppUtils.getClassFromColumn(column); + Assertions.assertEquals(Integer.class, integerResult); + + column.columnType = "DATETIME"; + Class datetimeResult = dppUtils.getClassFromColumn(column); + Assertions.assertEquals(java.sql.Timestamp.class, datetimeResult); + + column.columnType = "FLOAT"; + Class floatResult = dppUtils.getClassFromColumn(column); + Assertions.assertEquals(Float.class, floatResult); + + column.columnType = "DOUBLE"; + Class doubleResult = dppUtils.getClassFromColumn(column); + Assertions.assertEquals(Double.class, doubleResult); + + column.columnType = "DATE"; + Class dateResult = dppUtils.getClassFromColumn(column); + Assertions.assertEquals(Date.class, dateResult); + + column.columnType = "DECIMALV2"; + column.precision = 10; + column.scale = 2; + Class decimalResult = dppUtils.getClassFromColumn(column); + 
Assertions.assertEquals(BigDecimal.valueOf(10, 2).getClass(), decimalResult);
+        } catch (Exception e) {
+            Assertions.assertTrue(false);
+        }
+
+    }
+
+    @Test
+    public void testGetDataTypeFromColumn() {
+        DppUtils dppUtils = new DppUtils();
+
+        try {
+            EtlJobConfig.EtlColumn column = new EtlJobConfig.EtlColumn();
+            column.columnType = "VARCHAR";
+            DataType stringResult = dppUtils.getDataTypeFromColumn(column, false);
+            Assertions.assertEquals(DataTypes.StringType, stringResult);
+
+            column.columnType = "CHAR";
+            DataType charResult = dppUtils.getDataTypeFromColumn(column, false);
+            Assertions.assertEquals(DataTypes.StringType, charResult);
+
+            column.columnType = "HLL";
+            DataType hllResult = dppUtils.getDataTypeFromColumn(column, false);
+            Assertions.assertEquals(DataTypes.StringType, hllResult);
+
+            column.columnType = "OBJECT";
+            DataType objectResult = dppUtils.getDataTypeFromColumn(column, false);
+            Assertions.assertEquals(DataTypes.StringType, objectResult);
+
+            column.columnType = "BOOLEAN";
+            DataType booleanResult = dppUtils.getDataTypeFromColumn(column, false);
+            Assertions.assertEquals(DataTypes.StringType, booleanResult);
+
+            column.columnType = "TINYINT";
+            DataType tinyResult = dppUtils.getDataTypeFromColumn(column, false);
+            Assertions.assertEquals(DataTypes.ByteType, tinyResult);
+
+            column.columnType = "SMALLINT";
+            DataType smallResult = dppUtils.getDataTypeFromColumn(column, false);
+            Assertions.assertEquals(DataTypes.ShortType, smallResult);
+
+            column.columnType = "INT";
+            DataType integerResult = dppUtils.getDataTypeFromColumn(column, false);
+            Assertions.assertEquals(DataTypes.IntegerType, integerResult);
+
+            column.columnType = "BIGINT";
+            DataType longResult = dppUtils.getDataTypeFromColumn(column, false);
+            Assertions.assertEquals(DataTypes.LongType, longResult);
+
+            column.columnType = "DATETIME";
+            DataType datetimeResult = dppUtils.getDataTypeFromColumn(column, false);
+            Assertions.assertEquals(DataTypes.TimestampType, datetimeResult);
+
+            column.columnType = "FLOAT";
+            DataType floatResult = dppUtils.getDataTypeFromColumn(column, false);
+            Assertions.assertEquals(DataTypes.FloatType, floatResult);
+
+            column.columnType = "DOUBLE";
+            DataType doubleResult = dppUtils.getDataTypeFromColumn(column, false);
+            Assertions.assertEquals(DataTypes.DoubleType, doubleResult);
+
+            column.columnType = "DATE";
+            DataType dateResult = dppUtils.getDataTypeFromColumn(column, false);
+            Assertions.assertEquals(DataTypes.DateType, dateResult);
+        } catch (Exception e) {
+            Assertions.assertTrue(false);
+        }
+    }
+
+    @Test
+    public void testCreateDstTableSchema() {
+        DppUtils dppUtils = new DppUtils();
+
+        EtlJobConfig.EtlColumn column1 = new EtlJobConfig.EtlColumn(
+                "column1", "INT",
+                true, true,
+                "NONE", "0",
+                0, 0, 0);
+        EtlJobConfig.EtlColumn column2 = new EtlJobConfig.EtlColumn(
+                "column2", "SMALLINT",
+                true, true,
+                "NONE", "0",
+                0, 0, 0);
+        List<EtlJobConfig.EtlColumn> columns = new ArrayList<>();
+        columns.add(column1);
+        columns.add(column2);
+
+        try {
+            StructType schema = dppUtils.createDstTableSchema(columns, false, false);
+            Assertions.assertEquals(2, schema.fieldNames().length);
+            Assertions.assertEquals("column1", schema.fieldNames()[0]);
+            Assertions.assertEquals("column2", schema.fieldNames()[1]);
+
+            StructType schema2 = dppUtils.createDstTableSchema(columns, true, false);
+            Assertions.assertEquals(3, schema2.fieldNames().length);
+            Assertions.assertEquals("__bucketId__", schema2.fieldNames()[0]);
+            Assertions.assertEquals("column1", schema2.fieldNames()[1]);
+            Assertions.assertEquals("column2", schema2.fieldNames()[2]);
+        } catch (Exception e) {
+            Assertions.assertTrue(false);
+        }
+    }
+
+    @Test
+    public void testParseColumnsFromPath() {
+        DppUtils dppUtils = new DppUtils();
+
+        String path = "/path/to/file/city=beijing/date=2020-04-10/data";
+        List<String> columnFromPaths = new ArrayList<>();
+        columnFromPaths.add("city");
+        columnFromPaths.add("date");
+        try {
+            List<String> columnFromPathValues = dppUtils.parseColumnsFromPath(path, columnFromPaths);
+            Assertions.assertEquals(2, columnFromPathValues.size());
+            Assertions.assertEquals("beijing", columnFromPathValues.get(0));
+            Assertions.assertEquals("2020-04-10", columnFromPathValues.get(1));
+        } catch (Exception e) {
+            Assertions.assertTrue(false);
+        }
+    }
+}
diff --git a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java
new file mode 100644
index 00000000..fc57abe3
--- /dev/null
+++ b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.config.EtlJobConfig; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + + +import java.util.ArrayList; +import java.util.List; + +public class MinimumCoverageRollupTreeBuilderTest { + + @Test + public void testBuild() { + EtlJobConfig.EtlColumn column1 = new EtlJobConfig.EtlColumn( + "column1", "INT", + true, true, + "NONE", "0", + 0, 0, 0); + EtlJobConfig.EtlColumn column2 = new EtlJobConfig.EtlColumn( + "column2", "SMALLINT", + true, true, + "NONE", "0", + 0, 0, 0); + EtlJobConfig.EtlColumn column3 = new EtlJobConfig.EtlColumn( + "column3", "VARCHAR", + true, true, + "NONE", "", + 0, 0, 0); + EtlJobConfig.EtlColumn column4 = new EtlJobConfig.EtlColumn( + "column4", "INT", + true, false, + "SUM", "", + 0, 0, 0); + List baseColumns = new ArrayList<>(); + baseColumns.add(column1); + baseColumns.add(column2); + baseColumns.add(column3); + baseColumns.add(column4); + EtlJobConfig.EtlIndex baseIndex = new EtlJobConfig.EtlIndex(10000, + baseColumns, 12345, "DUPLICATE", true, 1); + List roll1Columns = new ArrayList<>(); + roll1Columns.add(column1); + roll1Columns.add(column2); + roll1Columns.add(column4); + EtlJobConfig.EtlIndex roll1Index = new EtlJobConfig.EtlIndex(10001, + roll1Columns, 12346, "AGGREGATE", false, 1); + List roll2Columns = new ArrayList<>(); + roll2Columns.add(column1); + roll2Columns.add(column4); + EtlJobConfig.EtlIndex roll2Index = new EtlJobConfig.EtlIndex(10002, + roll2Columns, 12347, "AGGREGATE", false, 1); + + List roll3Columns = new ArrayList<>(); + roll3Columns.add(column3); + roll3Columns.add(column4); + EtlJobConfig.EtlIndex roll3Index = new EtlJobConfig.EtlIndex(10003, + roll3Columns, 12348, "AGGREGATE", false, 1); + + List indexes = new ArrayList<>(); + indexes.add(baseIndex); + indexes.add(roll1Index); + indexes.add(roll2Index); + indexes.add(roll3Index); + EtlJobConfig.EtlTable table = new EtlJobConfig.EtlTable(indexes, null); + + MinimumCoverageRollupTreeBuilder builder = new MinimumCoverageRollupTreeBuilder(); + RollupTreeNode resultNode = builder.build(table); + Assertions.assertEquals(resultNode.parent, null); + Assertions.assertEquals(resultNode.indexId, 10000); + Assertions.assertEquals(resultNode.level, 0); + Assertions.assertEquals(resultNode.children.size(), 2); + + RollupTreeNode index1Node = resultNode.children.get(0); + Assertions.assertEquals(index1Node.parent.indexId, 10000); + Assertions.assertEquals(index1Node.indexId, 10001); + Assertions.assertEquals(index1Node.level, 1); + Assertions.assertEquals(index1Node.children.size(), 1); + + RollupTreeNode index3Node = resultNode.children.get(1); + Assertions.assertEquals(index3Node.parent.indexId, 10000); + Assertions.assertEquals(index3Node.indexId, 10003); + Assertions.assertEquals(index3Node.level, 1); + Assertions.assertEquals(index3Node.children, null); + + RollupTreeNode index2Node = index1Node.children.get(0); + Assertions.assertEquals(index2Node.parent.indexId, 10001); + Assertions.assertEquals(index2Node.indexId, 10002); + Assertions.assertEquals(index2Node.level, 2); + Assertions.assertEquals(index2Node.children, null); + } +} diff --git a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java new file mode 100644 index 00000000..20039092 --- /dev/null +++ b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/SparkDppTest.java @@ -0,0 +1,67 @@ 
+// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2.dpp; + +import org.apache.doris.config.EtlJobConfig; + +import org.apache.spark.sql.RowFactory; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.math.BigDecimal; + +public class SparkDppTest { + + @Test + public void testValidateData() { + SparkDpp sparkDpp = new SparkDpp(); + + // decimal + EtlJobConfig.EtlColumn etlColumn = new EtlJobConfig.EtlColumn(); + etlColumn.columnType = "DECIMALV2"; + etlColumn.precision = 3; + etlColumn.scale = 2; + + DecimalParser decimalParser = new DecimalParser(etlColumn); + // test max/min + Assertions.assertEquals(decimalParser.getMaxValue().toString(), "9.99"); + Assertions.assertEquals(decimalParser.getMinValue().toString(), "-9.99"); + // normal + BigDecimal bigDecimal = new BigDecimal("1.21"); + Assertions.assertTrue(sparkDpp.validateData(bigDecimal, etlColumn, decimalParser, RowFactory.create(bigDecimal))); + // failed + BigDecimal bigDecimalFailed = new BigDecimal("10"); + Assertions.assertFalse(sparkDpp.validateData(bigDecimalFailed, etlColumn, decimalParser, RowFactory.create(bigDecimalFailed))); + + // string + EtlJobConfig.EtlColumn stringColumn = new EtlJobConfig.EtlColumn(); + stringColumn.stringLength = 3; + stringColumn.columnType = "VARCHAR"; + StringParser stringParser = new StringParser(stringColumn); + // normal + String normalString = "a1"; + Assertions.assertTrue(sparkDpp.validateData(normalString, stringColumn, stringParser, RowFactory.create(normalString))); + // cn normal + String normalStringCN = "中"; + Assertions.assertTrue(sparkDpp.validateData(normalStringCN, stringColumn, stringParser, RowFactory.create(normalStringCN))); + // cn failed + String failedStringCN = "中a"; + Assertions.assertFalse(sparkDpp.validateData(failedStringCN, stringColumn, stringParser, RowFactory.create(failedStringCN))); + } + +} diff --git a/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java new file mode 100644 index 00000000..676a2139 --- /dev/null +++ b/spark-load/spark-load-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java @@ -0,0 +1,195 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.load.loadv2.etl;
+
+import org.apache.doris.common.jmockit.Deencapsulation;
+import org.apache.doris.config.EtlJobConfig;
+import org.apache.doris.config.EtlJobConfig.EtlColumn;
+import org.apache.doris.config.EtlJobConfig.EtlColumnMapping;
+import org.apache.doris.config.EtlJobConfig.EtlFileGroup;
+import org.apache.doris.config.EtlJobConfig.EtlIndex;
+import org.apache.doris.config.EtlJobConfig.EtlJobProperty;
+import org.apache.doris.config.EtlJobConfig.EtlPartition;
+import org.apache.doris.config.EtlJobConfig.EtlPartitionInfo;
+import org.apache.doris.config.EtlJobConfig.EtlTable;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import mockit.Expectations;
+import mockit.Mocked;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PositionedReadable;
+import org.apache.hadoop.fs.Seekable;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+
+import java.io.ByteArrayInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+public class SparkEtlJobTest {
+    private long tableId;
+    private long index1Id;
+    private long index2Id;
+    private long partition1Id;
+    private long partition2Id;
+    private EtlJobConfig etlJobConfig;
+
+    @BeforeEach
+    public void setUp() {
+        tableId = 0L;
+        index1Id = 1L;
+        index2Id = 2L;
+        partition1Id = 3L;
+        partition2Id = 4L;
+
+        // indexes
+        EtlColumn k1 = new EtlColumn("k1", "INT", false, true, "NONE", "0", 0, 0, 0);
+        EtlColumn k2 = new EtlColumn("k2", "VARCHAR", false, true, "NONE", "0", 10, 0, 0);
+        EtlColumn v1 = new EtlColumn("v1", "BIGINT", false, false, "NONE", "0", 0, 0, 0);
+        EtlIndex index1 = new EtlIndex(index1Id, Lists.newArrayList(k1, k2, v1), 666666, "DUPLICATE", true, 1);
+        v1 = new EtlColumn("v1", "BIGINT", false, false, "SUM", "0", 0, 0, 0);
+        EtlIndex index2 = new EtlIndex(index2Id, Lists.newArrayList(k1, v1), 888888, "AGGREGATE", true, 1);
+        List<EtlIndex> indexes = Lists.newArrayList(index1, index2);
+        // partition info
+        List<EtlPartition> partitions = Lists.newArrayList();
+        partitions.add(new EtlPartition(partition1Id, Lists.newArrayList(0), Lists.newArrayList(100), false, 2));
+        partitions.add(new EtlPartition(partition2Id, Lists.newArrayList(100), Lists.newArrayList(), true, 3));
+        EtlPartitionInfo partitionInfo = new EtlPartitionInfo("RANGE", Lists.newArrayList("k1"), Lists.newArrayList("k2"), partitions);
+        EtlTable table = new EtlTable(indexes, partitionInfo);
+        // file group
+        Map<String, EtlColumnMapping> columnMappings = Maps.newHashMap();
+        columnMappings.put("k1", new EtlColumnMapping("k1 + 1"));
+        table.addFileGroup(new EtlFileGroup(EtlJobConfig.SourceType.FILE, Lists.newArrayList("hdfs://127.0.0.1:10000/file"),
+                Lists.newArrayList(), Lists.newArrayList(), "\t", "\n", false, null,
+                Maps.newHashMap(), "", Lists.newArrayList(partition1Id, partition2Id)));
+        // tables
+        Map<Long, EtlTable> tables = Maps.newHashMap();
+        tables.put(tableId, table);
+        // others
+        String outputFilePattern = "V1.label0.%d.%d.%d.%d.%d.parquet";
+        String label = "label0";
+        EtlJobProperty properties = new EtlJobProperty();
+        properties.strictMode = false;
+        properties.timezone = "Asia/Shanghai";
+        etlJobConfig = new EtlJobConfig(tables, outputFilePattern, label, properties);
+    }
+
+    @Test
+    public void testInitConfig(@Mocked FileSystem fs) throws IOException {
+        new Expectations() {
+            {
+                fs.open(new Path("hdfs://127.0.0.1:10000/jobconfig.json"));
+                result = new FSDataInputStream(new SeekableByteArrayInputStream(etlJobConfig.configToJson().getBytes()));
+            }
+        };
+
+        SparkEtlJob job = Deencapsulation.newInstance(SparkEtlJob.class, "hdfs://127.0.0.1:10000/jobconfig.json");
+        Deencapsulation.invoke(job, "initConfig");
+        EtlJobConfig parsedConfig = Deencapsulation.getField(job, "etlJobConfig");
+        Assertions.assertTrue(parsedConfig.tables.containsKey(tableId));
+        EtlTable table = parsedConfig.tables.get(tableId);
+        Assertions.assertEquals(2, table.indexes.size());
+        Assertions.assertEquals(2, table.partitionInfo.partitions.size());
+        Assertions.assertEquals(false, parsedConfig.properties.strictMode);
+        Assertions.assertEquals("label0", parsedConfig.label);
+    }
+
+    @Test
+    public void testCheckConfigWithoutBitmapDictColumns() {
+        SparkEtlJob job = Deencapsulation.newInstance(SparkEtlJob.class, "hdfs://127.0.0.1:10000/jobconfig.json");
+        Deencapsulation.setField(job, "etlJobConfig", etlJobConfig);
+        Deencapsulation.invoke(job, "checkConfig");
+        Map<Long, Set<String>> tableToBitmapDictColumns = Deencapsulation.getField(job, "tableToBitmapDictColumns");
+        // check bitmap dict columns empty
+        Assertions.assertTrue(tableToBitmapDictColumns.isEmpty());
+    }
+
+    @Test
+    public void testCheckConfigWithBitmapDictColumns() {
+        SparkEtlJob job = Deencapsulation.newInstance(SparkEtlJob.class, "hdfs://127.0.0.1:10000/jobconfig.json");
+        EtlTable table = etlJobConfig.tables.get(tableId);
+        table.indexes.get(0).columns.add(
+                new EtlColumn("v2", "BITMAP", false, false, "BITMAP_UNION", "0", 0, 0, 0)
+        );
+        EtlFileGroup fileGroup = table.fileGroups.get(0);
+        fileGroup.sourceType = EtlJobConfig.SourceType.HIVE;
+        fileGroup.columnMappings.put(
+                "v2", new EtlColumnMapping("bitmap_dict", Lists.newArrayList("v2"))
+        );
+        Deencapsulation.setField(job, "etlJobConfig", etlJobConfig);
+        Deencapsulation.invoke(job, "checkConfig");
+        // check hive source
+        Set<Long> hiveSourceTables = Deencapsulation.getField(job, "hiveSourceTables");
+        Assertions.assertTrue(hiveSourceTables.contains(tableId));
+        // check bitmap dict columns has v2
+        Map<Long, Set<String>> tableToBitmapDictColumns = Deencapsulation.getField(job, "tableToBitmapDictColumns");
+        Assertions.assertTrue(tableToBitmapDictColumns.containsKey(tableId));
+        Assertions.assertTrue(tableToBitmapDictColumns.get(tableId).contains("v2"));
+        // check remove v2 bitmap_dict func mapping from file group column mappings
+        Assertions.assertFalse(table.fileGroups.get(0).columnMappings.containsKey("v2"));
+    }
+
+    private static class SeekableByteArrayInputStream extends ByteArrayInputStream implements Seekable, PositionedReadable {
+        public SeekableByteArrayInputStream(byte[] buf) {
+            super(buf);
+        }
+
+        public void seek(long position) {
+            if (position < 0 || position >= buf.length) {
+                throw new IllegalArgumentException("pos = " + position + " length = " + buf.length);
+            }
+            this.pos = (int) position;
+        }
+
+        public long getPos() {
+            return this.pos;
+        }
+
+        @Override
+        public boolean seekToNewSource(long targetPos) throws IOException {
+            return false;
+        }
+
+        @Override
+        public int read(long position, byte[] buffer, int offset, int length) throws IOException {
+            this.seek(position);
+            return this.read(buffer, offset, length);
+        }
+
+        @Override
+        public void readFully(long position, byte[] buffer, int offset, int length) throws IOException {
+            if (position + length > buf.length) {
+                throw new EOFException("End of file reached before reading fully.");
+            }
+            System.arraycopy(buf, (int) position, buffer, offset, length);
+        }
+
+        @Override
+        public void readFully(long position, byte[] buffer) throws IOException {
+            readFully(position, buffer, 0, buffer.length);
+        }
+    }
+}