Skip to content

Commit

Permalink
feat: job configuration model enhancements (part 2)
Browse files Browse the repository at this point in the history
- documentation update
- added tests for SQL expression parser used to validate partition filter expressions.
- added script to fix heading levels in changelog file during release.
  • Loading branch information
gabb1er committed Mar 4, 2024
1 parent 397ea95 commit dd81135
Show file tree
Hide file tree
Showing 14 changed files with 336 additions and 30 deletions.
4 changes: 4 additions & 0 deletions .prepare_changelog.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
set -eu

echo "Updating changelog headers' levels ..."
sed -rie 's/^# \[/## \[/g' docs/changelog/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package ru.raiffeisen.checkita.config

import org.apache.spark.sql.Column
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import org.apache.spark.sql.functions.expr
import ru.raiffeisen.checkita.config.Parsers._


class ParsersSpec extends AnyWordSpec with Matchers {

"ExpressionParsingOps" must {
"yield valid list of columns for allowed simple SQL expressions" in {
val testExpressions: Seq[(Column, Set[String])] = Seq(
expr("partition_column like '2024-%'") -> Set("partition_column"),
expr("partition_column = 3.14") -> Set("partition_column"),
expr("partition_column in ('one', 'two', 'three')") -> Set("partition_column"),
expr("partition_column = last_part('some_schema.some_table')") -> Set("partition_column"),
expr("partition_column = date_format(dt_column, 'yyyy-MM-dd')") -> Set("partition_column", "dt_column"),
expr(
"partition_column in (last_part('some.table'), 'part_two', date_format(dt_column, 'yyyy-MM-dd'))"
) -> Set("partition_column", "dt_column"),
expr(
"partition_column >= '2024-01-01' and partition_column < current_date() and partition_column != null"
) -> Set("partition_column"),
expr(
"""
|partition_column = CASE
| WHEN dayofweek(current_date()) = 1 THEN date_add(current_date(), -2)
| WHEN dayofweek(current_date()) = 2 THEN date_add(current_date(), -3)
| ELSE date_add(current_date(), -1)
|END
|""".stripMargin) -> Set("partition_column"),
expr("partition_column = max(dt_column)") -> Set("partition_column", "dt_column"),
expr("partition_column >= '2024-01-01' and true") -> Set("partition_column"),
)

testExpressions.foreach {
case (expression, columns) => expression.dependentColumns.toSet shouldEqual columns
}
}

"throw exception when SQL expression includes sub-query" in {
an [IllegalArgumentException] should be thrownBy expr(
"partition_column in (select max(dt_column) from some.table)"
).dependentColumns
}

"return invalid list of columns when using parameterless functions without parentheses" in {
val testExpressions: Seq[(Column, Set[String])] = Seq(
expr(
"partition_column >= '2024-01-01' and partition_column < current_date and partition_column != null"
) -> Set("partition_column", "current_date"),
expr(
"""
|partition_column = CASE
| WHEN dayofweek(current_date) = 1 THEN date_add(current_date, -2)
| WHEN dayofweek(current_date) = 2 THEN date_add(current_date, -3)
| ELSE date_add(current_date, -1)
|END
|""".stripMargin) -> Set("partition_column", "current_date")
)

testExpressions.foreach {
case (expression, columns) => expression.dependentColumns.toSet shouldEqual columns
}
}
}
}
43 changes: 28 additions & 15 deletions docs/en/01-application-setup/03-ResultsStorage.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ only one set of results per Data Quality job and given reference date.

### Regular Metrics Results Schema

* Primary key: `(job_id, metric_id, reference_date)`
* Primary key: `(job_id, metric_id, reference_date)`;
* `source_id` & `column_names` contain string representation of lists in format `'[val1,val2,val3]'`.
* `params` is a JSON string.

Expand All @@ -73,6 +73,7 @@ only one set of results per Data Quality job and given reference date.
| metric_id | STRING | NOT NULL |
| metric_name | STRING | NOT NULL |
| description | STRING | |
| metadata | STRING | |
| source_id | STRING | NOT NULL |
| column_names | STRING | |
| params | STRING | |
Expand All @@ -83,7 +84,7 @@ only one set of results per Data Quality job and given reference date.

### Composed Metrics Results Schema

* Primary key: `(job_id, metric_id, reference_date)`
* Primary key: `(job_id, metric_id, reference_date)`;
* `source_id` contains string representation of lists in format `'[val1,val2,val3]'`.

| Column Name | Column Type | Constraint |
Expand All @@ -92,6 +93,7 @@ only one set of results per Data Quality job and given reference date.
| metric_id | STRING | NOT NULL |
| metric_name | STRING | NOT NULL |
| description | STRING | |
| metadata | STRING | |
| source_id | STRING | NOT NULL |
| formula | STRING | NOT NULL |
| result | DOUBLE | NOT NULL |
Expand All @@ -101,14 +103,16 @@ only one set of results per Data Quality job and given reference date.

### Load Checks Results Schema

* Primary key: `(job_id, check_id, reference_date)`
* Primary key: `(job_id, check_id, reference_date)`;
* `source_id` contains string representation of lists in format `'[val1,val2,val3]'`.

| Column Name | Column Type | Constraint |
|----------------|-------------|------------|
| job_id | STRING | NOT NULL |
| check_id | STRING | NOT NULL |
| check_name | STRING | NOT NULL |
| description | STRING | |
| metadata | STRING | |
| source_id | STRING | NOT NULL |
| expected | STRING | NOT NULL |
| status | STRING | NOT NULL |
Expand All @@ -118,7 +122,7 @@ only one set of results per Data Quality job and given reference date.

### Checks Results Schema

* Primary key: `(job_id, check_id, reference_date)`
* Primary key: `(job_id, check_id, reference_date)`;
* `source_id` contains string representation of lists in format `'[val1,val2,val3]'`.

| Column Name | Column Type | Constraint |
Expand All @@ -127,6 +131,7 @@ only one set of results per Data Quality job and given reference date.
| check_id | STRING | NOT NULL |
| check_name | STRING | NOT NULL |
| description | STRING | |
| metadata | STRING | |
| source_id | STRING | NOT NULL |
| base_metric | STRING | NOT NULL |
| compared_metric | STRING | |
Expand All @@ -140,15 +145,17 @@ only one set of results per Data Quality job and given reference date.

### Job State Schema

* Primary key: `(job_id, reference_date)`
* Primary key: `(job_id, reference_date)`;
* `version_info` is a JSON string;
* `config` is a JSON string.

| Column Name | Column Type | Constraint |
|--------------------|-------------|------------|
| job_id | STRING | NOT NULL |
| config | STRING | NOT NULL |
| reference_date | TIMESTAMP | NOT NULL |
| execution_date | TIMESTAMP | NOT NULL |
| Column Name | Column Type | Constraint |
|----------------|-------------|------------|
| job_id | STRING | NOT NULL |
| config | STRING | NOT NULL |
| version_info | STRING | |
| reference_date | TIMESTAMP | NOT NULL |
| execution_date | TIMESTAMP | NOT NULL |

## Hive Storage Setup Scripts

Expand All @@ -168,6 +175,7 @@ CREATE EXTERNAL TABLE ${schema_name}.results_metric_regular
metric_id STRING COMMENT '',
metric_name STRING COMMENT '',
description STRING COMMENT '',
metadata STRING COMMENT '',
source_id STRING COMMENT '',
column_names STRING COMMENT '',
params STRING COMMENT '',
Expand All @@ -188,6 +196,7 @@ CREATE EXTERNAL TABLE ${schema_name}.results_metric_composed
metric_id STRING COMMENT '',
metric_name STRING COMMENT '',
description STRING COMMENT '',
metadata STRING COMMENT '',
source_id STRING COMMENT '',
formula STRING COMMENT '',
result DOUBLE COMMENT '',
Expand All @@ -206,6 +215,8 @@ CREATE EXTERNAL TABLE ${schema_name}.results_check_load
job_id STRING COMMENT '',
check_id STRING COMMENT '',
check_name STRING COMMENT '',
description STRING COMMENT '',
metadata STRING COMMENT '',
source_id STRING COMMENT '',
expected STRING COMMENT '',
status STRING COMMENT '',
Expand All @@ -225,6 +236,7 @@ CREATE EXTERNAL TABLE ${schema_name}.results_check
check_id STRING COMMENT '',
check_name STRING COMMENT '',
description STRING COMMENT '',
metadata STRING COMMENT '',
source_id STRING COMMENT '',
base_metric STRING COMMENT '',
compared_metric STRING COMMENT '',
Expand All @@ -246,11 +258,12 @@ CREATE EXTERNAL TABLE ${schema_name}.job_state
(
job_id STRING COMMENT '',
config STRING COMMENT '',
version_info STRING COMMENT '',
reference_date TIMESTAMP COMMENT '',
execution_date TIMESTAMP COMMENT ''
)
COMMENT 'Data Quality Job State'
PARTITIONED BY (job_id STRING)
STORED AS PARQUET
LOCATION '${schema_dir}/job_state';
COMMENT 'Data Quality Job State'
PARTITIONED BY (job_id STRING)
STORED AS PARQUET
LOCATION '${schema_dir}/job_state';
```
Loading

0 comments on commit dd81135

Please sign in to comment.