Skip to content

Commit

Permalink
Merge pull request #82 from data-catering/number-rounding
Browse files Browse the repository at this point in the history
Add ability to round generated numbers, set metadata statistics and r…
  • Loading branch information
pflooky authored Dec 4, 2024
2 parents 4a5ff34 + 2375ac1 commit e301fc7
Show file tree
Hide file tree
Showing 8 changed files with 44 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -904,6 +904,15 @@ case class FieldBuilder(field: Field = Field()) {
def numericScale(scale: Int): FieldBuilder = {
  // Build the generator with the scale applied first, then swap it into the field.
  val scaledGenerator = getGenBuilder.numericScale(scale).generator
  this.modify(_.field.generator).setTo(Some(scaledGenerator))
}

/**
 * Configures rounding for this field by delegating to the generator builder's `round`
 * and replacing the field's generator with the result.
 *
 * @param round Number of decimal places to round to
 * @return the updated `FieldBuilder` instance
 */
def round(round: Int): FieldBuilder = {
  val roundedGenerator = getGenBuilder.round(round).generator
  this.modify(_.field.generator).setTo(Some(roundedGenerator))
}

/**
* Sets whether the field should be omitted from the generated output.
*
Expand Down Expand Up @@ -1225,6 +1234,15 @@ case class GeneratorBuilder(generator: Generator = Generator()) {
def numericScale(scale: Int): GeneratorBuilder = {
  // The scale is stored in its string form under the NUMERIC_SCALE option key.
  val scaleOption = Map(NUMERIC_SCALE -> scale.toString)
  this.modify(_.generator.options)(existing => existing ++ scaleOption)
}

/**
 * Rounding to decimal places for numeric data types.
 *
 * The value is stored in its string form under the `ROUND` key and merged into the
 * generator's existing options.
 *
 * @param round Number of decimal places to round to
 * @return GeneratorBuilder
 */
def round(round: Int): GeneratorBuilder = {
  val roundOption = Map(ROUND -> round.toString)
  this.modify(_.generator.options)(existing => existing ++ roundOption)
}

/**
* Enable/disable including the value in the final output to the data source. Allows you to define intermediate values
* that can be used to generate other columns
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ object Constants {
lazy val MAXIMUM = "max"
lazy val STANDARD_DEVIATION = "stddev"
lazy val MEAN = "mean"
// Generator option / field metadata key holding the number of decimal places to round to
lazy val ROUND = "round"
lazy val DISTRIBUTION = "distribution"
lazy val DISTRIBUTION_RATE_PARAMETER = "distributionRateParam"
lazy val DISTRIBUTION_UNIFORM = "uniform"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package io.github.datacatering.datacaterer.core.generator.provider

import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MAXIMUM_LENGTH, ARRAY_MINIMUM_LENGTH, DEFAULT_VALUE, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, EXPRESSION, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, ROW_COUNT, STANDARD_DEVIATION}
import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MAXIMUM_LENGTH, ARRAY_MINIMUM_LENGTH, DEFAULT_VALUE, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, EXPRESSION, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, ROUND, ROW_COUNT, STANDARD_DEVIATION}
import io.github.datacatering.datacaterer.core.exception.UnsupportedDataGeneratorType
import io.github.datacatering.datacaterer.core.model.Constants._
import io.github.datacatering.datacaterer.core.util.GeneratorUtil
Expand Down Expand Up @@ -386,10 +386,15 @@ object RandomDataGenerator {
s"$sqlRand * $diff + $min"
}

if (!baseFormula.contains(INDEX_INC_COL) && (typeName == "INT" || typeName == "SHORT" || typeName == "LONG")) {
s"CAST(ROUND($baseFormula, 0) AS $typeName)"
val rounded = if (metadata.contains(ROUND)) {
val roundValue = metadata.getString(ROUND)
s"ROUND($baseFormula, $roundValue)"
} else baseFormula

if (!rounded.contains(INDEX_INC_COL) && (typeName == "INT" || typeName == "SHORT" || typeName == "LONG")) {
s"CAST(ROUND($rounded, 0) AS $typeName)"
} else {
s"CAST($baseFormula AS $typeName)"
s"CAST($rounded AS $typeName)"
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class RecordTrackingProcessor(recordTrackingFolderPath: String) {

def trackRecords(df: DataFrame, dataSourceName: String, planName: String, step: Step): Unit = {
val subDataSourcePath = getSubDataSourcePath(dataSourceName, planName, step, recordTrackingFolderPath)
LOGGER.info(s"Generated record tracking is enabled, data-source-name=$dataSourceName, plan-name=$planName, save-path=$subDataSourcePath")
LOGGER.debug(s"Generated record tracking is enabled, data-source-name=$dataSourceName, plan-name=$planName, save-path=$subDataSourcePath")
if (df.isEmpty || df.schema.isEmpty) {
LOGGER.debug("Unable to save records for record tracking due to 0 records found or empty schema")
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ object MetadataUtil {
computeColumnStatistics(sourceData, dataSourceReadOptions, dataSourceMetadata.name, dataSourceMetadata.format)
val columnLevelStatistics = sparkSession.sharedState.cacheManager.lookupCachedData(sourceData).get.cachedRepresentation.stats
val rowCount = columnLevelStatistics.rowCount.getOrElse(BigInt(0))
LOGGER.info(s"Computed metadata statistics for data source, name=${dataSourceMetadata.name}, format=$dataSourceFormat, " +
LOGGER.debug(s"Computed metadata statistics for data source, name=${dataSourceMetadata.name}, format=$dataSourceFormat, " +
s"details=${ConfigUtil.cleanseOptions(dataSourceReadOptions)}, rows-analysed=$rowCount, size-in-bytes=${columnLevelStatistics.sizeInBytes}, " +
s"num-columns-analysed=${columnLevelStatistics.attributeStats.size}")

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package io.github.datacatering.datacaterer.core.generator.provider

import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MINIMUM_LENGTH, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, ENABLED_EDGE_CASE, ENABLED_NULL, EXPRESSION, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, PROBABILITY_OF_EDGE_CASE, PROBABILITY_OF_NULL, ROW_COUNT, STANDARD_DEVIATION}
import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MINIMUM_LENGTH, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, ENABLED_EDGE_CASE, ENABLED_NULL, EXPRESSION, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, PROBABILITY_OF_EDGE_CASE, PROBABILITY_OF_NULL, ROUND, ROW_COUNT, STANDARD_DEVIATION}
import io.github.datacatering.datacaterer.core.generator.provider.RandomDataGenerator._
import io.github.datacatering.datacaterer.core.model.Constants.INDEX_INC_COL
import org.apache.spark.sql.types._
Expand Down Expand Up @@ -189,6 +189,17 @@ class RandomDataGeneratorTest extends AnyFunSuite {
assert(doubleGenerator.generateSqlExpression == "CAST(RAND() * 5.0 + 5.0 AS DOUBLE)")
}

test("Can create random double generator with custom min, max and rounding") {
  // Field metadata: bounds of 5.0 and 10.0, plus a rounding of 2 decimal places.
  val fieldMetadata = new MetadataBuilder()
    .putString(MAXIMUM, "10.0")
    .putString(MINIMUM, "5.0")
    .putString(ROUND, "2")
    .build()
  val field = StructField("random_double", DoubleType, nullable = false, metadata = fieldMetadata)
  val generator = new RandomDoubleDataGenerator(field)
  val generated = generator.generate

  assert(generator.edgeCases.nonEmpty)
  assert(generated >= 5.0)
  assert(generated <= 10.0)
  // The SQL expression is expected to wrap the base formula in ROUND before the final cast.
  assert(generator.generateSqlExpression == "CAST(ROUND(RAND() * 5.0 + 5.0, 2) AS DOUBLE)")
}

test("Can create random float generator") {
val floatGenerator = new RandomFloatDataGenerator(StructField("random_float", FloatType, false))
val sampleData = floatGenerator.generate
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class PlanProcessorTest extends SparkSuite {
.schema(
field.name("account_id").regex("ACC[0-9]{8}"),
field.name("year").`type`(IntegerType).sql("YEAR(date)"),
field.name("balance").`type`(DoubleType).min(10).max(1000),
field.name("balance").`type`(DoubleType).min(10).max(1000).round(2),
field.name("date").`type`(DateType).min(Date.valueOf("2022-01-01")),
field.name("status").oneOf(accountStatus: _*),
field.name("update_history")
Expand Down
2 changes: 1 addition & 1 deletion gradle.properties
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
groupId=io.github.data-catering
version=0.12.2
version=0.12.3

scalaVersion=2.12
scalaSpecificVersion=2.12.19
Expand Down

0 comments on commit e301fc7

Please sign in to comment.