From 8009ed56d0a2b61acc96689ea6e31589bae39e37 Mon Sep 17 00:00:00 2001 From: Yury Dylko <62417327+gabb1er@users.noreply.github.com> Date: Wed, 20 Mar 2024 21:26:11 +0300 Subject: [PATCH] fix: fixed bugs in HLL metric calculators and in email validation (#33) - added accuracy guarding for HLL metric calculator to avoid incorrect HLL monoid instantiation. - changed email validation method towards using apache commons email validator. This is done to allow a broader range of email addresses and to avoid using extremely complex regex patterns. - added new package dependency to support email validation: commons-validator-1.8.0 --- .../ru/raiffeisen/checkita/config/Implicits.scala | 12 ++++++++++-- .../raiffeisen/checkita/config/RefinedTypes.scala | 14 ++++++++++---- .../core/metrics/regular/AlgebirdMetrics.scala | 13 +++++++------ checkita-core/src/test/resources/test_job.conf | 6 +++--- project/Dependencies.scala | 10 ++++++---- 5 files changed, 36 insertions(+), 19 deletions(-) diff --git a/checkita-core/src/main/scala/ru/raiffeisen/checkita/config/Implicits.scala b/checkita-core/src/main/scala/ru/raiffeisen/checkita/config/Implicits.scala index 7821983d..b7d2017b 100644 --- a/checkita-core/src/main/scala/ru/raiffeisen/checkita/config/Implicits.scala +++ b/checkita-core/src/main/scala/ru/raiffeisen/checkita/config/Implicits.scala @@ -1,5 +1,6 @@ package ru.raiffeisen.checkita.config +import org.apache.commons.validator.routines.EmailValidator import org.apache.spark.sql.Column import org.apache.spark.sql.functions.expr import org.apache.spark.sql.types.DataType @@ -8,7 +9,7 @@ import pureconfig.generic.{FieldCoproductHint, ProductHint} import pureconfig.{CamelCase, ConfigConvert, ConfigFieldMapping} import ru.raiffeisen.checkita.config.Enums._ import ru.raiffeisen.checkita.config.Parsers.idParser -import ru.raiffeisen.checkita.config.RefinedTypes.{DateFormat, ID} +import ru.raiffeisen.checkita.config.RefinedTypes.{DateFormat, Email, ID} import ru.raiffeisen.checkita.config.jobconf.Outputs.FileOutputConfig import ru.raiffeisen.checkita.config.jobconf.Schemas.SchemaConfig import ru.raiffeisen.checkita.config.jobconf.Sources.{FileSourceConfig, VirtualSourceConfig} @@ -36,7 +37,14 @@ object Implicits { }, id => id.value ) - + + implicit val emailConverter: ConfigConvert[Email] = ConfigConvert[String].xmap[Email]( + emailString => + if (EmailValidator.getInstance().isValid(emailString)) Email(emailString) + else throw new IllegalArgumentException(s"Email '$emailString' is not valid."), + email => email.value + ) + implicit val dateFormatConverter: ConfigConvert[DateFormat] = ConfigConvert[String].xmap[DateFormat](DateFormat.fromString, _.pattern) diff --git a/checkita-core/src/main/scala/ru/raiffeisen/checkita/config/RefinedTypes.scala b/checkita-core/src/main/scala/ru/raiffeisen/checkita/config/RefinedTypes.scala index 79a82ef6..2f25d883 100644 --- a/checkita-core/src/main/scala/ru/raiffeisen/checkita/config/RefinedTypes.scala +++ b/checkita-core/src/main/scala/ru/raiffeisen/checkita/config/RefinedTypes.scala @@ -10,16 +10,15 @@ import eu.timepit.refined.string._ import java.time.format.DateTimeFormatter object RefinedTypes { -// type ID = String Refined MatchesRegex[W.`"""^[a-zA-Z0-9_]+$"""`.T] + type URI = String Refined Uri type URL = String Refined Url type Port = Int Refined Interval.Closed[W.`0`.T, W.`9999`.T] - type Email = String Refined MatchesRegex[W.`"""^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$"""`.T] type MMRecipient = String Refined MatchesRegex[W.`"""^(@|#)[a-zA-Z][a-zA-Z0-9.\\-_]+$"""`.T] type SparkParam = String Refined MatchesRegex[W.`"""^\\S+?\\=[\\S\\s]+$"""`.T] type PositiveInt = Int Refined Positive type FixedShortColumn = String Refined MatchesRegex[W.`"""^[^\\n\\r\\t:]+:\\d+$"""`.T] - type AccuracyDouble = Double Refined Interval.Closed[W.`0.0`.T, W.`1.0`.T] + type AccuracyDouble = Double Refined Interval.OpenClosed[W.`0.0`.T, W.`1.0`.T] type RegexPattern = String Refined Regex /** @@ -49,7 +48,14 @@ object RefinedTypes { * @param value Validated ID value */ case class ID(value: String) - + + /** + * Email class is used to wrap email addresses and provide email validation + * during configuration parsing. + * @param value Validated email address + */ + case class Email(value: String) + /** * DateFormat class is used to store both date-time pattern and corresponding formatter. * Such construction allows to verify if pattern is convertable to formatter at the point diff --git a/checkita-core/src/main/scala/ru/raiffeisen/checkita/core/metrics/regular/AlgebirdMetrics.scala b/checkita-core/src/main/scala/ru/raiffeisen/checkita/core/metrics/regular/AlgebirdMetrics.scala index 78e94b9b..06ced2f8 100644 --- a/checkita-core/src/main/scala/ru/raiffeisen/checkita/core/metrics/regular/AlgebirdMetrics.scala +++ b/checkita-core/src/main/scala/ru/raiffeisen/checkita/core/metrics/regular/AlgebirdMetrics.scala @@ -30,11 +30,13 @@ object AlgebirdMetrics { protected val status: CalculatorStatus = CalculatorStatus.Success, protected val failMsg: String = "OK") extends MetricCalculator { - - // axillary constrictor to init metric calculator: + + // Auxiliary constrictor to init metric calculator: + // Accuracy is limited to a value that correspond to bits number of 30 or less. + // This is done to prevent integer overflow when computing HLL monoid size. def this(accuracyError: Double) = this( - new HyperLogLogMonoid(HyperLogLog.bitsForError(accuracyError)).zero, - HyperLogLog.bitsForError(accuracyError), + new HyperLogLogMonoid(HyperLogLog.bitsForError(math.max(accuracyError, 0.00003174))).zero, + HyperLogLog.bitsForError(math.max(accuracyError, 0.00003174)), accuracyError ) @@ -65,10 +67,9 @@ object AlgebirdMetrics { Map(MetricName.ApproximateDistinctValues.entryName -> (hLL.approximateSize.estimate.toDouble, None)) def merge(m2: MetricCalculator): MetricCalculator = { - val monoid = new HyperLogLogMonoid(this.bitsNumber) val that = m2.asInstanceOf[HyperLogLogMetricCalculator] HyperLogLogMetricCalculator( - monoid.plus(this.hLL, that.hLL), + this.hLL + that.hLL, this.bitsNumber, this.accuracyError, this.failCount + that.getFailCounter, diff --git a/checkita-core/src/test/resources/test_job.conf b/checkita-core/src/test/resources/test_job.conf index 7049de8e..87ae3cc2 100644 --- a/checkita-core/src/test/resources/test_job.conf +++ b/checkita-core/src/test/resources/test_job.conf @@ -293,7 +293,7 @@ jobConfig: { attachMetricErrors: true metrics: ["hive_table_nulls", "fixed_file_dist_name", "table_source1_inn_regex"] dumpSize: 10 - recipients: ["some.person@some.domain"] + recipients: ["some.person@some.com"] } mattermost: { attachMetricErrors: true @@ -311,12 +311,12 @@ jobConfig: { { id: "alert1" checks: ["avg_bal_check", "zero_nulls"] - recipients: ["some.peron@some.domain"] + recipients: ["some.peron@some.com"] } { id: "alert2" checks: ["top2_curr_match", "completeness_check"] - recipients: ["another.peron@some.domain"] + recipients: ["another.peron@some.com"] } ] mattermost: [ diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 78fefd3f..260f9658 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -6,8 +6,9 @@ object Dependencies { val enumeratum = "com.beachape" %% "enumeratum" % "1.7.2" val isarn = "org.isarnproject" %% "isarn-sketches" % "0.3.0" val algebird = "com.twitter" %% "algebird-core" % "0.13.9" - val commonText = "org.apache.commons" % "commons-text" % "1.10.0" - val commonMail = "org.apache.commons" % "commons-email" % "1.5" + val commonsText = "org.apache.commons" % "commons-text" % "1.10.0" + val commonsMail = "org.apache.commons" % "commons-email" % "1.5" + val commonsValidators = "commons-validator" % "commons-validator" % "1.8.0" val mustache = "com.github.spullara.mustache.java" % "compiler" %"0.9.10" // XML support in json4s published in a separate package, @@ -51,8 +52,9 @@ object Dependencies { enumeratum, isarn, algebird, - commonText, - commonMail, + commonsText, + commonsMail, + commonsValidators, mustache, jsonJava, refined,