From 6ea2b60aa4eafd145f7af18c154974a477f14d56 Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Tue, 4 May 2021 10:12:02 +0900 Subject: [PATCH] Refactor hashing and fingerprint functions Fixes #271. With newly added classes for fingerprinting, now it is easy to make a fingerprint from various types of data. FingerprintBuilder can be used to build a fingerprint from data. FileBasedRelation.signature now takes it as an argument, so the implementations don't have to do the hashing themselves. They can just use the provided builder. For unorderd combining, one can use bitwise XOR to combine multiple fingerprints. This way, the order becomes irrelavant. --- .../index/FileBasedSignatureProvider.scala | 35 +++--- .../hyperspace/index/IndexLogEntry.scala | 3 +- .../index/IndexSignatureProvider.scala | 15 ++- .../index/LogicalPlanSignatureProvider.scala | 16 ++- .../index/PlanSignatureProvider.scala | 17 ++- .../hyperspace/index/rules/RuleUtils.scala | 3 +- .../default/DefaultFileBasedRelation.scala | 23 ++-- .../sources/delta/DeltaLakeRelation.scala | 7 +- .../sources/iceberg/IcebergRelation.scala | 10 +- .../hyperspace/index/sources/interfaces.scala | 9 +- .../hyperspace/util/HashingUtils.scala | 35 ------ .../util/fingerprint/Fingerprint.scala | 64 ++++++++++ .../util/fingerprint/FingerprintBuilder.scala | 91 +++++++++++++++ .../FingerprintBuilderFactory.scala | 49 ++++++++ .../actions/RefreshActionTest.scala | 4 +- .../FileBasedSignatureProviderTest.scala | 9 +- .../hyperspace/index/IndexCacheTest.scala | 4 +- .../index/IndexCollectionManagerTest.scala | 7 +- .../hyperspace/index/IndexLogEntryTest.scala | 5 +- .../index/IndexLogManagerImplTest.scala | 3 +- .../index/IndexSignatureProviderTest.scala | 7 +- .../hyperspace/index/IndexTest.scala | 4 +- .../index/rules/HyperspaceRuleSuite.scala | 4 +- .../index/rules/RuleTestHelper.scala | 8 +- .../hyperspace/util/HashingUtilsTest.scala | 38 ------ .../hyperspace/util/JsonUtilsTest.scala | 4 +- .../fingerprint/FingerprintBuilderTest.scala | 110 ++++++++++++++++++ .../util/fingerprint/FingerprintTest.scala | 57 +++++++++ 28 files changed, 495 insertions(+), 146 deletions(-) delete mode 100644 src/main/scala/com/microsoft/hyperspace/util/HashingUtils.scala create mode 100644 src/main/scala/com/microsoft/hyperspace/util/fingerprint/Fingerprint.scala create mode 100644 src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilder.scala create mode 100644 src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderFactory.scala delete mode 100644 src/test/scala/com/microsoft/hyperspace/util/HashingUtilsTest.scala create mode 100644 src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderTest.scala create mode 100644 src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintTest.scala diff --git a/src/main/scala/com/microsoft/hyperspace/index/FileBasedSignatureProvider.scala b/src/main/scala/com/microsoft/hyperspace/index/FileBasedSignatureProvider.scala index 654c19142..0d71f6fd9 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/FileBasedSignatureProvider.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/FileBasedSignatureProvider.scala @@ -19,15 +19,20 @@ package com.microsoft.hyperspace.index import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan} import com.microsoft.hyperspace.Hyperspace -import com.microsoft.hyperspace.util.HashingUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder, FingerprintBuilderFactory} /** * [[FileBasedSignatureProvider]] provides the logical plan signature based on files in the * relation. File metadata, eg. size, modification time and path, of each file in the * relation will be used to generate the signature. + * + * Note that while the order of files in a single relation does not affect the signature, + * the order of relations in the plan do affect the signature calculation. + * * If the given logical plan does not have any supported relations, no signature is provided. */ -class FileBasedSignatureProvider extends LogicalPlanSignatureProvider { +class FileBasedSignatureProvider(fbf: FingerprintBuilderFactory) + extends LogicalPlanSignatureProvider { /** * Generate the signature of logical plan. @@ -35,28 +40,18 @@ class FileBasedSignatureProvider extends LogicalPlanSignatureProvider { * @param logicalPlan logical plan of data frame. * @return signature, if the logical plan has supported relations; Otherwise None. */ - def signature(logicalPlan: LogicalPlan): Option[String] = { - fingerprintVisitor(logicalPlan).map(HashingUtils.md5Hex) - } - - /** - * Visit logical plan and collect info needed for fingerprint. - * - * @param logicalPlan logical plan of data frame. - * @return fingerprint, if the logical plan has supported relations; Otherwise None. - */ - private def fingerprintVisitor(logicalPlan: LogicalPlan): Option[String] = { + def signature(logicalPlan: LogicalPlan): Option[Fingerprint] = { val provider = Hyperspace.getContext.sourceProviderManager - var fingerprint = "" + val fb: FingerprintBuilder = fbf.create + var updated = false logicalPlan.foreachUp { case l: LeafNode if provider.isSupportedRelation(l) => - fingerprint ++= provider.getRelation(l).signature + provider.getRelation(l).signature(fb).foreach { f => + fb.add(f) + updated = true + } case _ => } - - fingerprint match { - case "" => None - case _ => Some(fingerprint) - } + if (updated) Some(fb.build()) else None } } diff --git a/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala b/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala index 6e0badba7..d43db2f5e 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.types.{DataType, StructType} import com.microsoft.hyperspace.{BuildInfo, HyperspaceException} import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.util.PathUtils +import com.microsoft.hyperspace.util.fingerprint.Fingerprint // IndexLogEntry-specific fingerprint to be temporarily used where fingerprint is not defined. case class NoOpFingerprint() { @@ -361,7 +362,7 @@ object CoveringIndex { } // IndexLogEntry-specific Signature that stores the signature provider and value. -case class Signature(provider: String, value: String) +case class Signature(provider: String, value: Fingerprint) // IndexLogEntry-specific LogicalPlanFingerprint to store fingerprint of logical plan. case class LogicalPlanFingerprint(properties: LogicalPlanFingerprint.Properties) { diff --git a/src/main/scala/com/microsoft/hyperspace/index/IndexSignatureProvider.scala b/src/main/scala/com/microsoft/hyperspace/index/IndexSignatureProvider.scala index efe4f4cf5..63b862fbd 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/IndexSignatureProvider.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/IndexSignatureProvider.scala @@ -18,7 +18,7 @@ package com.microsoft.hyperspace.index import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import com.microsoft.hyperspace.util.HashingUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilderFactory} /** * [[IndexSignatureProvider]] provides signature for a logical plan based on: @@ -29,10 +29,13 @@ import com.microsoft.hyperspace.util.HashingUtils * * If the plan does not comply with [[FileBasedSignatureProvider]] or [[PlanSignatureProvider]] * requirements for signature computation, then no signature will be provided for the plan. + * + * @param fbf [[FingerprintBuilderFactory]] used for building fingerprints */ -class IndexSignatureProvider extends LogicalPlanSignatureProvider { - private val fileBasedSignatureProvider = new FileBasedSignatureProvider - private val planSignatureProvider = new PlanSignatureProvider +class IndexSignatureProvider(fbf: FingerprintBuilderFactory) + extends LogicalPlanSignatureProvider { + private val fileBasedSignatureProvider = new FileBasedSignatureProvider(fbf) + private val planSignatureProvider = new PlanSignatureProvider(fbf) /** * Generate the signature of logical plan. @@ -41,10 +44,10 @@ class IndexSignatureProvider extends LogicalPlanSignatureProvider { * @return signature, if both [[FileBasedSignatureProvider]] and [[PlanSignatureProvider]] * can generate signature for the logical plan; Otherwise None. */ - def signature(logicalPlan: LogicalPlan): Option[String] = { + def signature(logicalPlan: LogicalPlan): Option[Fingerprint] = { fileBasedSignatureProvider.signature(logicalPlan).flatMap { f => planSignatureProvider.signature(logicalPlan).map { p => - HashingUtils.md5Hex(f + p) + fbf.create.add(f).add(p).build() } } } diff --git a/src/main/scala/com/microsoft/hyperspace/index/LogicalPlanSignatureProvider.scala b/src/main/scala/com/microsoft/hyperspace/index/LogicalPlanSignatureProvider.scala index 4c8277cba..8727c4c6c 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/LogicalPlanSignatureProvider.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/LogicalPlanSignatureProvider.scala @@ -21,8 +21,12 @@ import scala.util.{Success, Try} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.util.hyperspace.Utils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilderFactory, MD5FingerprintBuilderFactory} + /** * This trait contains the interface that provides the signature of logical plan. + * + * The implementation must have a constructor taking [[FingerprintBuilderFactory]] as an argument. */ trait LogicalPlanSignatureProvider { @@ -36,15 +40,17 @@ trait LogicalPlanSignatureProvider { * @param logicalPlan logical plan. * @return signature if it can be computed w.r.t signature provider assumptions; Otherwise None. */ - def signature(logicalPlan: LogicalPlan): Option[String] + def signature(logicalPlan: LogicalPlan): Option[Fingerprint] } /** * Factory object for LogicalPlanSignatureProvider. */ object LogicalPlanSignatureProvider { + private val fbf: FingerprintBuilderFactory = new MD5FingerprintBuilderFactory + // Creates a default signature provider. - def create(): LogicalPlanSignatureProvider = new IndexSignatureProvider + def create(): LogicalPlanSignatureProvider = new IndexSignatureProvider(fbf) /** * Creates a parameterized signature provider. @@ -53,7 +59,11 @@ object LogicalPlanSignatureProvider { * @return signature provider. */ def create(name: String): LogicalPlanSignatureProvider = { - Try(Utils.classForName(name).newInstance) match { + Try( + Utils + .classForName(name) + .getConstructor(classOf[FingerprintBuilderFactory]) + .newInstance(fbf)) match { case Success(provider: LogicalPlanSignatureProvider) => provider case _ => throw new IllegalArgumentException( diff --git a/src/main/scala/com/microsoft/hyperspace/index/PlanSignatureProvider.scala b/src/main/scala/com/microsoft/hyperspace/index/PlanSignatureProvider.scala index 8a780db4b..cef578d69 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/PlanSignatureProvider.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/PlanSignatureProvider.scala @@ -18,14 +18,16 @@ package com.microsoft.hyperspace.index import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import com.microsoft.hyperspace.util.HashingUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder, FingerprintBuilderFactory} /** * [[PlanSignatureProvider]] provides signature for a logical plan based on * the type of operators in it. * A plan needs to have at least one operator so its signature can be generated. + * + * @param fbf [[FingerprintBuilderFactory]] used for building fingerprints */ -class PlanSignatureProvider extends LogicalPlanSignatureProvider { +class PlanSignatureProvider(fbf: FingerprintBuilderFactory) extends LogicalPlanSignatureProvider { /** * Generate the signature of logical plan. @@ -33,12 +35,9 @@ class PlanSignatureProvider extends LogicalPlanSignatureProvider { * @param logicalPlan logical plan. * @return signature if there is at least one operator in the plan; Otherwise None. */ - def signature(logicalPlan: LogicalPlan): Option[String] = { - var signature = "" - logicalPlan.foreachUp(p => signature = HashingUtils.md5Hex(signature + p.nodeName)) - signature match { - case "" => None - case _ => Some(signature) - } + def signature(logicalPlan: LogicalPlan): Option[Fingerprint] = { + val fb: FingerprintBuilder = fbf.create + logicalPlan.foreachUp(node => fb.add(node.nodeName)) + Some(fb.build()) } } diff --git a/src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala b/src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala index d9e0f6bf8..2c0c999c5 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala @@ -34,6 +34,7 @@ import com.microsoft.hyperspace.index.IndexLogEntryTags.{HYBRIDSCAN_RELATED_CONF import com.microsoft.hyperspace.index.plans.logical.{BucketUnion, IndexHadoopFsRelation} import com.microsoft.hyperspace.index.sources.FileBasedRelation import com.microsoft.hyperspace.util.HyperspaceConf +import com.microsoft.hyperspace.util.fingerprint.Fingerprint object RuleUtils { @@ -54,7 +55,7 @@ object RuleUtils { indexes: Seq[IndexLogEntry], relation: FileBasedRelation): Seq[IndexLogEntry] = { // Map of a signature provider to a signature generated for the given plan. - val signatureMap = mutable.Map[String, Option[String]]() + val signatureMap = mutable.Map[String, Option[Fingerprint]]() def signatureValid(entry: IndexLogEntry): Boolean = { entry.withCachedTag(relation.plan, IndexLogEntryTags.SIGNATURE_MATCHED) { diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala index 947573c4d..f1a27c1e2 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala @@ -30,7 +30,7 @@ import com.microsoft.hyperspace.HyperspaceException import com.microsoft.hyperspace.index.{Content, FileIdTracker, Hdfs, Relation} import com.microsoft.hyperspace.index.IndexConstants.GLOBBING_PATTERN_KEY import com.microsoft.hyperspace.index.sources.FileBasedRelation -import com.microsoft.hyperspace.util.HashingUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder} /** * Implementation for file-based relation used by [[DefaultFileBasedSource]] @@ -41,14 +41,15 @@ class DefaultFileBasedRelation(spark: SparkSession, override val plan: LogicalRe /** * Computes the signature of the current relation. + * + * @param fb [[FingerprintBuilder]] used for building fingerprints */ - override def signature: String = plan.relation match { + override def signature(fb: FingerprintBuilder): Option[Fingerprint] = plan.relation match { case HadoopFsRelation(location: PartitioningAwareFileIndex, _, _, _, _, _) => - val result = filesFromIndex(location).sortBy(_.getPath.toString).foldLeft("") { - (acc: String, f: FileStatus) => - HashingUtils.md5Hex(acc + fingerprint(f)) - } - result + val initialFingerprint = fb.build() + var fingerprint = initialFingerprint + filesFromIndex(location).foreach(fingerprint ^= createFingerprint(_, fb)) + Some(fingerprint).filter(_ != initialFingerprint) } /** @@ -179,9 +180,11 @@ class DefaultFileBasedRelation(spark: SparkSession, override val plan: LogicalRe } } - private def fingerprint(fileStatus: FileStatus): String = { - fileStatus.getLen.toString + fileStatus.getModificationTime.toString + - fileStatus.getPath.toString + private def createFingerprint(fileStatus: FileStatus, fb: FingerprintBuilder): Fingerprint = { + fb.add(fileStatus.getLen) + .add(fileStatus.getModificationTime) + .add(fileStatus.getPath.toString) + .build() } private def filesFromIndex(index: PartitioningAwareFileIndex): Seq[FileStatus] = { diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/delta/DeltaLakeRelation.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/delta/DeltaLakeRelation.scala index 2cfac9656..705def81c 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/delta/DeltaLakeRelation.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/delta/DeltaLakeRelation.scala @@ -27,6 +27,7 @@ import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.index.{Content, FileIdTracker, Hdfs, IndexLogEntry, Relation} import com.microsoft.hyperspace.index.sources.default.DefaultFileBasedRelation import com.microsoft.hyperspace.util.{HyperspaceConf, PathUtils} +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder} /** * Implementation for file-based relation used by [[DeltaLakeFileBasedSource]] @@ -36,10 +37,12 @@ class DeltaLakeRelation(spark: SparkSession, override val plan: LogicalRelation) /** * Computes the signature of the current relation. + * + * @param fb [[FingerprintBuilder]] used for building fingerprints */ - override def signature: String = plan.relation match { + override def signature(fb: FingerprintBuilder): Option[Fingerprint] = plan.relation match { case HadoopFsRelation(location: TahoeLogFileIndex, _, _, _, _, _) => - location.tableVersion + location.path.toString + Some(fb.add(location.tableVersion).add(location.path.toString).build()) } /** diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala index 7a3f2a99f..9c4a63db7 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.types.StructType import com.microsoft.hyperspace.index.{Content, FileIdTracker, Hdfs, IndexConstants, Relation} import com.microsoft.hyperspace.index.sources.FileBasedRelation import com.microsoft.hyperspace.util.PathUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder} /** * Implementation for file-based relation used by [[IcebergFileBasedSource]] @@ -61,9 +62,14 @@ class IcebergRelation( /** * Computes the signature of the current relation. + * + * @param fb [[FingerprintBuilder]] used for building fingerprints */ - override def signature: String = { - snapshotId.getOrElse(table.currentSnapshot().snapshotId()).toString + table.location() + override def signature(fb: FingerprintBuilder): Option[Fingerprint] = { + Some( + fb.add(snapshotId.getOrElse(table.currentSnapshot().snapshotId()).toString) + .add(table.location) + .build()) } /** diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala index 66e3f2e52..495516690 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.execution.datasources.{FileIndex, HadoopFsRelation, import org.apache.spark.sql.types.StructType import com.microsoft.hyperspace.index.{FileIdTracker, FileInfo, IndexConstants, IndexLogEntry, Relation} +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder} /** * ::Experimental:: @@ -67,8 +68,14 @@ trait FileBasedRelation extends SourceRelation { * * This API is used when the signature of source needs to be computed, e.g., creating an index, * computing query plan's signature, etc. + * + * If it is not possible to compute the signature (e.g. there are no files left), + * the implementation might return None. + * + * @param fb [[FingerprintBuilder]] used for building fingerprints. + * Use it to compute the signature from discriminating properties of the relation. */ - def signature: String + def signature(fb: FingerprintBuilder): Option[Fingerprint] /** * FileStatus list for all source files that the current relation references to. diff --git a/src/main/scala/com/microsoft/hyperspace/util/HashingUtils.scala b/src/main/scala/com/microsoft/hyperspace/util/HashingUtils.scala deleted file mode 100644 index 530cae4ca..000000000 --- a/src/main/scala/com/microsoft/hyperspace/util/HashingUtils.scala +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (2020) The Hyperspace Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.microsoft.hyperspace.util - -import org.apache.commons.codec.digest.DigestUtils - -/** - * HashingUtils supplies different hashing functions. - */ -object HashingUtils { - - /** - * MD5-based hashing function. - * - * @param input the input to be hashed, for Any type of data. - * @return the hash code string. - */ - def md5Hex(input: Any): String = { - DigestUtils.md5Hex(input.toString) - } -} diff --git a/src/main/scala/com/microsoft/hyperspace/util/fingerprint/Fingerprint.scala b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/Fingerprint.scala new file mode 100644 index 000000000..17c89ed7e --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/Fingerprint.scala @@ -0,0 +1,64 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import javax.xml.bind.DatatypeConverter + +import com.fasterxml.jackson.annotation.{JsonCreator, JsonValue} + +/** + * Represents a fingerprint, which can be used to identify and compare objects + * without actually comparing them. + * + * The value of a fingerprint is typically calculated by a hash function. To + * function as a fingerprinting function, the hash function must distribute + * the input uniformly into hashed values, and the number of bytes of the hash + * value should be sufficiently large. For example, MD5 produces 128-bit hash + * values (16 bytes), whereas SHA-1 produces 160-bit hash values (20 bytes). + * + * All objects to be identified and compared with each other must have fingerprints + * created from the same hash function. + */ +case class Fingerprint(value: Vector[Byte]) { + + /** + * Returns a human-readable form of this fingerprint in HEX format. + */ + @JsonValue + override def toString: String = { + value.map(_.formatted("%02x")).mkString + } + + /** + * Returns a new fingerprint by applying bitwise XOR to this and the other fingerprints. + */ + def ^(other: Fingerprint): Fingerprint = { + require(value.size == other.value.size) + Fingerprint(value.zip(other.value).map { case (x, y) => (x ^ y).toByte }) + } +} + +object Fingerprint { + + /** + * Creates a Fingerprint from a HEX string. + */ + @JsonCreator + def apply(value: String): Fingerprint = { + Fingerprint(DatatypeConverter.parseHexBinary(value).toVector) + } +} diff --git a/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilder.scala b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilder.scala new file mode 100644 index 000000000..436a37391 --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilder.scala @@ -0,0 +1,91 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import java.nio.ByteBuffer +import java.nio.charset.StandardCharsets +import java.security.MessageDigest + +/** + * A builder class for Fingerprint. + * A fingerprint is built by hashing added values sequentially. + * The order of values is significant. + */ +class FingerprintBuilder(private val md: MessageDigest) { + + def add(value: Boolean): FingerprintBuilder = { + md.update((if (value) 1 else 0).toByte) + this + } + + def add(value: Byte): FingerprintBuilder = { + md.update(value) + this + } + + def add(value: Short): FingerprintBuilder = { + md.update(ByteBuffer.allocate(2).putShort(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Int): FingerprintBuilder = { + md.update(ByteBuffer.allocate(4).putInt(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Long): FingerprintBuilder = { + md.update(ByteBuffer.allocate(8).putLong(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Float): FingerprintBuilder = { + md.update(ByteBuffer.allocate(4).putFloat(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Double): FingerprintBuilder = { + md.update(ByteBuffer.allocate(8).putDouble(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Char): FingerprintBuilder = { + md.update(ByteBuffer.allocate(2).putChar(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: String): FingerprintBuilder = { + md.update(value.getBytes(StandardCharsets.UTF_8)) + this + } + + def add(value: Array[Byte]): FingerprintBuilder = { + md.update(value) + this + } + + def add(value: Fingerprint): FingerprintBuilder = { + md.update(value.value.toArray) + this + } + + /** + * Builds a fingerprint by hashing added values. + * After this call is made, values are reset and this object + * can be used for another fingerprint. + */ + def build(): Fingerprint = Fingerprint(md.digest().toVector) +} diff --git a/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderFactory.scala b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderFactory.scala new file mode 100644 index 000000000..08fb7dba9 --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderFactory.scala @@ -0,0 +1,49 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import org.apache.commons.codec.digest.DigestUtils + +/** + * A factory trait for FingerprintBuilder. This trait must be implemented without any state. + * The same instance of this trait can be shared and used by many objects. + */ +trait FingerprintBuilderFactory { + + /** + * Creates a fingerprint builder. + */ + def create: FingerprintBuilder +} + +/** + * MD5-based fingerprint builder. The fingerprint has 16 bytes (128 bits). + */ +class MD5FingerprintBuilderFactory extends FingerprintBuilderFactory { + override def create: FingerprintBuilder = { + new FingerprintBuilder(DigestUtils.getMd5Digest) + } +} + +/** + * SHA1-based fingerprint builder. The fingerprint has 20 bytes (160 bits). + */ +class SHA1FingerprintBuilderFactory extends FingerprintBuilderFactory { + override def create: FingerprintBuilder = { + new FingerprintBuilder(DigestUtils.getSha1Digest) + } +} diff --git a/src/test/scala/com/microsoft/hyperspace/actions/RefreshActionTest.scala b/src/test/scala/com/microsoft/hyperspace/actions/RefreshActionTest.scala index 909758687..c0432de28 100644 --- a/src/test/scala/com/microsoft/hyperspace/actions/RefreshActionTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/actions/RefreshActionTest.scala @@ -28,6 +28,7 @@ import com.microsoft.hyperspace.{HyperspaceException, SampleData, SparkInvolvedS import com.microsoft.hyperspace.actions.Constants.States.{ACTIVE, CREATING} import com.microsoft.hyperspace.index._ import com.microsoft.hyperspace.index.sources.FileBasedSourceProviderManager +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class RefreshActionTest extends HyperspaceSuite { private val sampleParquetDataLocation = inTempDir("sampleparquet") @@ -78,7 +79,8 @@ class RefreshActionTest extends HyperspaceSuite { null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("signatureProvider", "dfSignature"))))) + LogicalPlanFingerprint.Properties( + Seq(Signature("signatureProvider", Fingerprint("abcd")))))) val entry = IndexLogEntry( "index1", diff --git a/src/test/scala/com/microsoft/hyperspace/index/FileBasedSignatureProviderTest.scala b/src/test/scala/com/microsoft/hyperspace/index/FileBasedSignatureProviderTest.scala index 6df97afc3..b574f526a 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/FileBasedSignatureProviderTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/FileBasedSignatureProviderTest.scala @@ -20,6 +20,7 @@ import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.spark.SparkFunSuite import com.microsoft.hyperspace.{HyperspaceException, SparkInvolvedSuite} +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, MD5FingerprintBuilderFactory} class FileBasedSignatureProviderTest extends SparkFunSuite with SparkInvolvedSuite { private val fileLength = 100 @@ -30,6 +31,8 @@ class FileBasedSignatureProviderTest extends SparkFunSuite with SparkInvolvedSui private val fileModificationTimeDelta = 10 private val newFilePath = new Path("newPath") + private val fbf = new MD5FingerprintBuilderFactory + test("Logical relations from a same file have the same signature.") { val signature1 = createFileBasedSignature(Seq(createFileStatus(fileLength, fileModificationTime, filePath))) @@ -98,7 +101,7 @@ class FileBasedSignatureProviderTest extends SparkFunSuite with SparkInvolvedSui } test("Create FileBasedSignatureProvider.") { - val fileBasedSignatureProvider = new FileBasedSignatureProvider + val fileBasedSignatureProvider = new FileBasedSignatureProvider(fbf) assert( LogicalPlanSignatureProvider .create(fileBasedSignatureProvider.name) @@ -113,8 +116,8 @@ class FileBasedSignatureProviderTest extends SparkFunSuite with SparkInvolvedSui private def createFileStatus(length: Long, modificationTime: Long, path: Path): FileStatus = SignatureProviderTestUtils.createFileStatus(length, modificationTime, path) - private def createFileBasedSignature(files: Seq[FileStatus]): String = - new FileBasedSignatureProvider() + private def createFileBasedSignature(files: Seq[FileStatus]): Fingerprint = + new FileBasedSignatureProvider(fbf) .signature(SignatureProviderTestUtils.createLogicalRelation(spark, files)) match { case Some(s) => s case None => throw HyperspaceException("Invalid plan for signature generation.") diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexCacheTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexCacheTest.scala index 296ce028c..d430c5728 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexCacheTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexCacheTest.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.types.{StringType, StructField, StructType} import com.microsoft.hyperspace.{Hyperspace, HyperspaceException, SampleData, SparkInvolvedSuite} import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.util.FileUtils +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexCacheTest extends HyperspaceSuite { val sampleParquetDataLocation = inTempDir("sampleparquet") @@ -45,7 +46,8 @@ class IndexCacheTest extends HyperspaceSuite { null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("signatureProvider", "dfSignature"))))) + LogicalPlanFingerprint.Properties( + Seq(Signature("signatureProvider", Fingerprint("abcd")))))) val entry = IndexLogEntry( "index1", diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexCollectionManagerTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexCollectionManagerTest.scala index e6e22ee41..b72e59755 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexCollectionManagerTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexCollectionManagerTest.scala @@ -25,6 +25,7 @@ import org.mockito.Mockito.{mock, when} import com.microsoft.hyperspace.{HyperspaceException, SparkInvolvedSuite} import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.index.IndexConstants.{REFRESH_MODE_FULL, REFRESH_MODE_INCREMENTAL} +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexCollectionManagerTest extends HyperspaceSuite { private val testLogManagerFactory: IndexLogManagerFactory = new IndexLogManagerFactory { @@ -43,7 +44,8 @@ class IndexCollectionManagerTest extends HyperspaceSuite { Seq(), null, null, - LogicalPlanFingerprint(LogicalPlanFingerprint.Properties(Seq(Signature("", ""))))) + LogicalPlanFingerprint( + LogicalPlanFingerprint.Properties(Seq(Signature("", Fingerprint("")))))) val entry = IndexLogEntry( indexPath.toString, @@ -95,7 +97,8 @@ class IndexCollectionManagerTest extends HyperspaceSuite { Seq(), null, null, - LogicalPlanFingerprint(LogicalPlanFingerprint.Properties(Seq(Signature("", ""))))) + LogicalPlanFingerprint( + LogicalPlanFingerprint.Properties(Seq(Signature("", Fingerprint("")))))) val entry = IndexLogEntry( str, diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexLogEntryTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexLogEntryTest.scala index 3eec72cb2..1bac36a18 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexLogEntryTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexLogEntryTest.scala @@ -31,6 +31,7 @@ import org.scalatest.BeforeAndAfter import com.microsoft.hyperspace.{BuildInfo, HyperspaceException, TestUtils} import com.microsoft.hyperspace.index.IndexConstants.UNKNOWN_FILE_ID import com.microsoft.hyperspace.util.{JsonUtils, PathUtils} +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexLogEntryTest extends HyperspaceSuite with SQLHelper { var testDir: file.Path = _ @@ -167,7 +168,7 @@ class IndexLogEntryTest extends HyperspaceSuite with SQLHelper { | "properties" : { | "signatures" : [ { | "provider" : "provider", - | "value" : "signatureValue" + | "value" : "abcd" | } ] | }, | "kind" : "LogicalPlan" @@ -219,7 +220,7 @@ class IndexLogEntryTest extends HyperspaceSuite with SQLHelper { null, LogicalPlanFingerprint( LogicalPlanFingerprint - .Properties(Seq(Signature("provider", "signatureValue"))) + .Properties(Seq(Signature("provider", Fingerprint("abcd")))) )) val expected = IndexLogEntry.create( diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexLogManagerImplTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexLogManagerImplTest.scala index d02795964..3fdd42a47 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexLogManagerImplTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexLogManagerImplTest.scala @@ -26,6 +26,7 @@ import org.scalatest.BeforeAndAfterAll import com.microsoft.hyperspace.{SparkInvolvedSuite, TestUtils} import com.microsoft.hyperspace.index.IndexConstants.HYPERSPACE_LOG import com.microsoft.hyperspace.util.{FileUtils, JsonUtils} +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexLogManagerImplTest extends HyperspaceSuite { val testRoot = inTempDir("indexLogManagerTests") @@ -68,7 +69,7 @@ class IndexLogManagerImplTest extends HyperspaceSuite { null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("provider", "signature"))))))), + LogicalPlanFingerprint.Properties(Seq(Signature("provider", Fingerprint("abcd")))))))), Map()) private def getEntry(state: String): LogEntry = { diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexSignatureProviderTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexSignatureProviderTest.scala index 559ce5a2c..1afb664e6 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexSignatureProviderTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexSignatureProviderTest.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructT import com.microsoft.hyperspace.SparkInvolvedSuite import com.microsoft.hyperspace.shim.JoinWithoutHint +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, MD5FingerprintBuilderFactory} class IndexSignatureProviderTest extends SparkFunSuite with SparkInvolvedSuite { private val fileLength1 = 100 @@ -37,6 +38,8 @@ class IndexSignatureProviderTest extends SparkFunSuite with SparkInvolvedSuite { private val fileModificationTime2 = 35000 private val filePath2 = new Path("originalLocation2") + private val fbf = new MD5FingerprintBuilderFactory + test("Verify signature for a plan with same logical relation node.") { val r1 = createSimplePlan(fileLength1, fileModificationTime1, filePath1) val r2 = createSimplePlan(fileLength1, fileModificationTime1, filePath1) @@ -128,8 +131,8 @@ class IndexSignatureProviderTest extends SparkFunSuite with SparkInvolvedSuite { Aggregate(grpExpression, aggExpression, projectNode) } - private def createIndexSignature(logicalPlan: LogicalPlan): String = - new IndexSignatureProvider().signature(logicalPlan) match { + private def createIndexSignature(logicalPlan: LogicalPlan): Fingerprint = + new IndexSignatureProvider(fbf).signature(logicalPlan) match { case Some(s) => s case None => fail("Invalid plan for signature generation.") } diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexTest.scala index 2a3f17c48..f901f6254 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexTest.scala @@ -20,6 +20,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import com.microsoft.hyperspace.actions.Constants +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexTest extends SparkFunSuite { val indexConfig1 = IndexConfig("myIndex1", Array("id"), Seq("name")) @@ -35,7 +36,8 @@ class IndexTest extends SparkFunSuite { null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("signatureProvider", "dfSignature"))))) + LogicalPlanFingerprint.Properties( + Seq(Signature("signatureProvider", Fingerprint("abcd")))))) val entry = IndexLogEntry( config.indexName, diff --git a/src/test/scala/com/microsoft/hyperspace/index/rules/HyperspaceRuleSuite.scala b/src/test/scala/com/microsoft/hyperspace/index/rules/HyperspaceRuleSuite.scala index fc2cc1c87..d5767ce56 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/rules/HyperspaceRuleSuite.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/rules/HyperspaceRuleSuite.scala @@ -29,6 +29,7 @@ import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.index._ import com.microsoft.hyperspace.index.Hdfs.Properties import com.microsoft.hyperspace.util.PathUtils +import com.microsoft.hyperspace.util.fingerprint.MD5FingerprintBuilderFactory trait HyperspaceRuleSuite extends HyperspaceSuite { private val defaultFileNames = Seq("f1.parquet", "f2.parquet") @@ -41,7 +42,8 @@ trait HyperspaceRuleSuite extends HyperspaceSuite { inputFiles: Seq[FileInfo] = Seq(), writeLog: Boolean = true, filenames: Seq[String] = defaultFileNames): IndexLogEntry = { - val signClass = new RuleTestHelper.TestSignatureProvider().getClass.getName + val fbf = new MD5FingerprintBuilderFactory + val signClass = new RuleTestHelper.TestSignatureProvider(fbf).getClass.getName LogicalPlanSignatureProvider.create(signClass).signature(plan) match { case Some(s) => diff --git a/src/test/scala/com/microsoft/hyperspace/index/rules/RuleTestHelper.scala b/src/test/scala/com/microsoft/hyperspace/index/rules/RuleTestHelper.scala index 708517d10..10f168510 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/rules/RuleTestHelper.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/rules/RuleTestHelper.scala @@ -20,15 +20,17 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import com.microsoft.hyperspace.index.LogicalPlanSignatureProvider +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilderFactory} object RuleTestHelper { - class TestSignatureProvider extends LogicalPlanSignatureProvider { - def signature(plan: LogicalPlan): Option[String] = + class TestSignatureProvider(fbf: FingerprintBuilderFactory) + extends LogicalPlanSignatureProvider { + def signature(plan: LogicalPlan): Option[Fingerprint] = plan .collectFirst { case LogicalRelation(HadoopFsRelation(location, _, _, _, _, _), _, _, _) => location.hashCode() } - .map(_.toString) + .map(fbf.create.add(_).build()) } } diff --git a/src/test/scala/com/microsoft/hyperspace/util/HashingUtilsTest.scala b/src/test/scala/com/microsoft/hyperspace/util/HashingUtilsTest.scala deleted file mode 100644 index 10f6f43f9..000000000 --- a/src/test/scala/com/microsoft/hyperspace/util/HashingUtilsTest.scala +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (2020) The Hyperspace Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.microsoft.hyperspace.util - -import java.util.UUID - -import org.apache.spark.SparkFunSuite - -class HashingUtilsTest extends SparkFunSuite { - test("For md5Hashing(), same input has the same output hash code.") { - val randomUUID = UUID.randomUUID.toString - val hashCode1 = HashingUtils.md5Hex(randomUUID) - val hashCode2 = HashingUtils.md5Hex(randomUUID) - assert(hashCode1.equals(hashCode2)) - } - - test("For md5Hashing(), different input different output hash code.") { - val randomUUID1 = UUID.randomUUID.toString - val randomUUID2 = UUID.randomUUID.toString - val hashCode1 = HashingUtils.md5Hex(randomUUID1) - val hashCode2 = HashingUtils.md5Hex(randomUUID2) - assert(!hashCode1.equals(hashCode2)) - } -} diff --git a/src/test/scala/com/microsoft/hyperspace/util/JsonUtilsTest.scala b/src/test/scala/com/microsoft/hyperspace/util/JsonUtilsTest.scala index 686bc6f42..1d9295ab1 100644 --- a/src/test/scala/com/microsoft/hyperspace/util/JsonUtilsTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/util/JsonUtilsTest.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructT import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.index._ +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class JsonUtilsTest extends SparkFunSuite { test("Test for JsonUtils.") { @@ -35,7 +36,8 @@ class JsonUtilsTest extends SparkFunSuite { null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("signatureProvider", "dfSignature"))))) + LogicalPlanFingerprint.Properties( + Seq(Signature("signatureProvider", Fingerprint("abcd")))))) val index = IndexLogEntry( "myIndex", diff --git a/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderTest.scala b/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderTest.scala new file mode 100644 index 000000000..752bdfd87 --- /dev/null +++ b/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderTest.scala @@ -0,0 +1,110 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import org.apache.commons.codec.digest.DigestUtils +import org.apache.spark.SparkFunSuite + +class FingerprintBuilderTest extends SparkFunSuite { + + test("Empty MD5 fingerprint") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.build() == Fingerprint("d41d8cd98f00b204e9800998ecf8427e")) + } + + test("MD5 fingerprint with an integer") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1).build() == Fingerprint("f1450306517624a57eafbbf8ed995985")) + } + + test("MD5 fingerprint with 0") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(0).build() == Fingerprint("f1d3ff8443297732862df21dc4e57262")) + } + + test("MD5 fingerprint with two integers") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1).add(42).build() == Fingerprint("f26c63a4633a5deea0a644cc58050446")) + } + + test("MD5 fingerprint with two integers in a different order") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(42).add(1).build() == Fingerprint("3331d54f38cef13e1a2423af3c6fb223")) + } + + test("MD5 fingerprint with a long") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1L).build() == Fingerprint("fa5ad9a8557e5a84cf23e52d3d3adf77")) + } + + test("MD5 fingerprint with a short") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1.toShort).build() == Fingerprint("441077cc9e57554dd476bdfb8b8b8102")) + } + + test("MD5 fingerprint with a byte") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1.toByte).build() == Fingerprint("55a54008ad1ba589aa210d2629c1df41")) + } + + test("MD5 fingerprint with a float") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(2.3f).build() == Fingerprint("472d85449cd4441d8e13540a57aeff49")) + } + + test("MD5 fingerprint with a double") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(42.3).build() == Fingerprint("4a4591ddc290c1e7db42b538ad681549")) + } + + test("MD5 fingerprint with a boolean") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(true).build() == Fingerprint("55a54008ad1ba589aa210d2629c1df41")) + } + + test("MD5 fingerprint with a char") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add('X').build() == Fingerprint("1e160201945908b455769e0318b1a1e9")) + } + + test("MD5 fingerprint with a string") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add("hello").build() == Fingerprint("5d41402abc4b2a76b9719d911017c592")) + } + + test("MD5 fingerprint with a string whose bytes are same as 1 (Int)") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add("\0\0\0\001").build() == fb.add(1).build()) + } + + test("MD5 fingerprint with a byte array") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert( + fb.add(Array[Byte](104, 101, 108, 108, 111)).build() == + Fingerprint("5d41402abc4b2a76b9719d911017c592")) + } + + test("MD5 fingerprint with a fingerprint") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(fb.build()).build() == Fingerprint("59adb24ef3cdbe0297f05b395827453f")) + } + + test("Empty SHA1 fingerprint") { + val fb = new FingerprintBuilder(DigestUtils.getSha1Digest) + assert(fb.build() == Fingerprint("da39a3ee5e6b4b0d3255bfef95601890afd80709")) + } +} diff --git a/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintTest.scala b/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintTest.scala new file mode 100644 index 000000000..1af12c17d --- /dev/null +++ b/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintTest.scala @@ -0,0 +1,57 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import org.apache.spark.SparkFunSuite + +class FingerprintTest extends SparkFunSuite { + + test("toString returns a HEX string.") { + assert(Fingerprint(Vector[Byte](1, 2, 3, 10, 30, 66)).toString == "0102030a1e42") + } + + test("toString returns an empty string.") { + assert(Fingerprint(Vector[Byte]()).toString == "") + } + + test("Bitwise XOR works for fingerprints of the same length.") { + assert( + (Fingerprint(Vector[Byte](19, 121)) ^ Fingerprint(Vector[Byte](31, -4))) == + Fingerprint(Vector[Byte]((19 ^ 31).toByte, (121 ^ -4).toByte))) + } + + test("Bitwise XOR fails for fingerprints of different lengths.") { + assertThrows[IllegalArgumentException] { + Fingerprint(Vector[Byte](19, 121)) ^ Fingerprint(Vector[Byte](31, -4, 3)) + } + } + + test("Fingerprint can be created from an empty string.") { + assert(Fingerprint("") == Fingerprint(Vector[Byte]())) + } + + test("Fingerprint can be created from a HEX string.") { + assert(Fingerprint("0102fffe") == Fingerprint(Vector[Byte](1, 2, -1, -2))) + } + + test("Fingerprint cannot be created from an invalid string.") { + val ex = intercept[IllegalArgumentException] { + Fingerprint("xyz") + } + assert(ex.getMessage.contains("hexBinary needs to be even-length")) + } +}