Skip to content

Commit

Permalink
fix: fixed bugs in HLL metric calculators and in email validation (#33)
Browse files Browse the repository at this point in the history
- added accuracy guarding for HLL metric calculator to avoid incorrect
HLL monoid instantiation.
- changed email validation method towards using apache commons email
validator. This is done to allow a broader range of email addresses and
to avoid using extremely complex regex patterns.
- added new package dependency to support email validation:
commons-validator-1.8.0
  • Loading branch information
gabb1er authored Mar 20, 2024
1 parent d04c754 commit 8009ed5
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 19 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package ru.raiffeisen.checkita.config

import org.apache.commons.validator.routines.EmailValidator
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.expr
import org.apache.spark.sql.types.DataType
Expand All @@ -8,7 +9,7 @@ import pureconfig.generic.{FieldCoproductHint, ProductHint}
import pureconfig.{CamelCase, ConfigConvert, ConfigFieldMapping}
import ru.raiffeisen.checkita.config.Enums._
import ru.raiffeisen.checkita.config.Parsers.idParser
import ru.raiffeisen.checkita.config.RefinedTypes.{DateFormat, ID}
import ru.raiffeisen.checkita.config.RefinedTypes.{DateFormat, Email, ID}
import ru.raiffeisen.checkita.config.jobconf.Outputs.FileOutputConfig
import ru.raiffeisen.checkita.config.jobconf.Schemas.SchemaConfig
import ru.raiffeisen.checkita.config.jobconf.Sources.{FileSourceConfig, VirtualSourceConfig}
Expand Down Expand Up @@ -36,7 +37,14 @@ object Implicits {
},
id => id.value
)


implicit val emailConverter: ConfigConvert[Email] = ConfigConvert[String].xmap[Email](
emailString =>
if (EmailValidator.getInstance().isValid(emailString)) Email(emailString)
else throw new IllegalArgumentException(s"Email '$emailString' is not valid."),
email => email.value
)

implicit val dateFormatConverter: ConfigConvert[DateFormat] =
ConfigConvert[String].xmap[DateFormat](DateFormat.fromString, _.pattern)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,15 @@ import eu.timepit.refined.string._
import java.time.format.DateTimeFormatter

object RefinedTypes {
// type ID = String Refined MatchesRegex[W.`"""^[a-zA-Z0-9_]+$"""`.T]

type URI = String Refined Uri
type URL = String Refined Url
type Port = Int Refined Interval.Closed[W.`0`.T, W.`9999`.T]
type Email = String Refined MatchesRegex[W.`"""^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$"""`.T]
type MMRecipient = String Refined MatchesRegex[W.`"""^(@|#)[a-zA-Z][a-zA-Z0-9.\\-_]+$"""`.T]
type SparkParam = String Refined MatchesRegex[W.`"""^\\S+?\\=[\\S\\s]+$"""`.T]
type PositiveInt = Int Refined Positive
type FixedShortColumn = String Refined MatchesRegex[W.`"""^[^\\n\\r\\t:]+:\\d+$"""`.T]
type AccuracyDouble = Double Refined Interval.Closed[W.`0.0`.T, W.`1.0`.T]
type AccuracyDouble = Double Refined Interval.OpenClosed[W.`0.0`.T, W.`1.0`.T]
type RegexPattern = String Refined Regex

/**
Expand Down Expand Up @@ -49,7 +48,14 @@ object RefinedTypes {
* @param value Validated ID value
*/
case class ID(value: String)


/**
* Email class is used to wrap email addresses and provide email validation
* during configuration parsing.
* @param value Validated email address
*/
case class Email(value: String)

/**
* DateFormat class is used to store both date-time pattern and corresponding formatter.
* Such construction allows to verify if pattern is convertable to formatter at the point
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,13 @@ object AlgebirdMetrics {
protected val status: CalculatorStatus = CalculatorStatus.Success,
protected val failMsg: String = "OK")
extends MetricCalculator {

// axillary constrictor to init metric calculator:

// Auxiliary constrictor to init metric calculator:
// Accuracy is limited to a value that correspond to bits number of 30 or less.
// This is done to prevent integer overflow when computing HLL monoid size.
def this(accuracyError: Double) = this(
new HyperLogLogMonoid(HyperLogLog.bitsForError(accuracyError)).zero,
HyperLogLog.bitsForError(accuracyError),
new HyperLogLogMonoid(HyperLogLog.bitsForError(math.max(accuracyError, 0.00003174))).zero,
HyperLogLog.bitsForError(math.max(accuracyError, 0.00003174)),
accuracyError
)

Expand Down Expand Up @@ -65,10 +67,9 @@ object AlgebirdMetrics {
Map(MetricName.ApproximateDistinctValues.entryName -> (hLL.approximateSize.estimate.toDouble, None))

def merge(m2: MetricCalculator): MetricCalculator = {
val monoid = new HyperLogLogMonoid(this.bitsNumber)
val that = m2.asInstanceOf[HyperLogLogMetricCalculator]
HyperLogLogMetricCalculator(
monoid.plus(this.hLL, that.hLL),
this.hLL + that.hLL,
this.bitsNumber,
this.accuracyError,
this.failCount + that.getFailCounter,
Expand Down
6 changes: 3 additions & 3 deletions checkita-core/src/test/resources/test_job.conf
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ jobConfig: {
attachMetricErrors: true
metrics: ["hive_table_nulls", "fixed_file_dist_name", "table_source1_inn_regex"]
dumpSize: 10
recipients: ["some.person@some.domain"]
recipients: ["some.person@some.com"]
}
mattermost: {
attachMetricErrors: true
Expand All @@ -311,12 +311,12 @@ jobConfig: {
{
id: "alert1"
checks: ["avg_bal_check", "zero_nulls"]
recipients: ["some.peron@some.domain"]
recipients: ["some.peron@some.com"]
}
{
id: "alert2"
checks: ["top2_curr_match", "completeness_check"]
recipients: ["another.peron@some.domain"]
recipients: ["another.peron@some.com"]
}
]
mattermost: [
Expand Down
10 changes: 6 additions & 4 deletions project/Dependencies.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@ object Dependencies {
val enumeratum = "com.beachape" %% "enumeratum" % "1.7.2"
val isarn = "org.isarnproject" %% "isarn-sketches" % "0.3.0"
val algebird = "com.twitter" %% "algebird-core" % "0.13.9"
val commonText = "org.apache.commons" % "commons-text" % "1.10.0"
val commonMail = "org.apache.commons" % "commons-email" % "1.5"
val commonsText = "org.apache.commons" % "commons-text" % "1.10.0"
val commonsMail = "org.apache.commons" % "commons-email" % "1.5"
val commonsValidators = "commons-validator" % "commons-validator" % "1.8.0"
val mustache = "com.github.spullara.mustache.java" % "compiler" %"0.9.10"

// XML support in json4s published in a separate package,
Expand Down Expand Up @@ -51,8 +52,9 @@ object Dependencies {
enumeratum,
isarn,
algebird,
commonText,
commonMail,
commonsText,
commonsMail,
commonsValidators,
mustache,
jsonJava,
refined,
Expand Down

0 comments on commit 8009ed5

Please sign in to comment.