Commit 07b2a12

Merge pull request #574 from boozallen/571-disable-relation-spark-validation

#571 Disable relations validations in Spark schema for records with relations
carter-cundiff authored Feb 7, 2025
2 parents c1840e3 + 67b449f commit 07b2a12
Showing 6 changed files with 30 additions and 12 deletions.
2 changes: 1 addition & 1 deletion DRAFT_RELEASE_NOTES.md
@@ -15,7 +15,7 @@ Spark and PySpark have been upgraded from version 3.5.2 to 3.5.4.
## Record Relation
To enable nested data records, we have added a new relation feature to the record metamodel. This allows records to reference other records. For more details, refer to the [Record Relation Options](https://boozallen.github.io/aissemble/aissemble/current-dev/record-metamodel.html#_record_relation_options).
Several features are still a work in progress:
-- PySpark and Spark based validation for records with a One to Many multiplicity. (Object validation is available.)
+- PySpark and Spark schema based validation for relations will only validate the record and not its relations. Object based validation for relations is available.

## Helm Charts Resource Specification
The following Helm charts have been updated to include the configuration options for specifying container resource requests/limits:
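The limitation described in the release note above can be illustrated with a small sketch (plain Python with invented names, not the aiSSEMBLE API): schema-style validation inspects only the record's own fields, while object-style validation also recurses into related records.

```python
def validate_city(city: dict) -> bool:
    """Record-level (schema-style) check: inspects the City's own fields only."""
    return isinstance(city.get("name"), str)

def validate_city_with_relations(city: dict) -> bool:
    """Object-style check: also validates each related Street record."""
    return validate_city(city) and all(
        isinstance(street.get("name"), str) for street in city.get("streets", [])
    )

city = {"name": "Springfield", "streets": [{"name": 123}]}
# Schema-style validation passes: the City record itself is valid...
assert validate_city(city) is True
# ...but object validation catches the invalid related Street.
assert validate_city_with_relations(city) is False
```

This mirrors the note's distinction: the record is checked, its relations are not, unless object-based validation is used.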
@@ -166,6 +166,8 @@ class ${record.capitalizedName}SchemaBase(ABC):
#end
#end

+## TODO revise validation for relations
+#if (false)
#foreach($relation in $record.relations)
#if($relation.isOneToManyRelation())
data_with_validations = data_with_validations.withColumn(self.${relation.upperSnakecaseName}_COLUMN + "_VALID", lit(self._validate_with_${relation.snakeCaseName}_schema(data_with_validations.select(col(self.${relation.upperSnakecaseName}_COLUMN)))))
@@ -174,6 +176,7 @@ class ${record.capitalizedName}SchemaBase(ABC):
data_with_validations = data_with_validations.withColumn(self.${relation.upperSnakecaseName}_COLUMN + "_VALID", lit(not ${relation.snakeCaseName}_schema.validate_dataset_with_prefix(data_with_validations.select(col(self.${relation.upperSnakecaseName}_COLUMN)), '${relation.columnName}.').isEmpty()))
#end
#end
+#end

validation_columns = [x for x in data_with_validations.columns if x not in ingest_dataset.columns]

@@ -192,11 +195,14 @@ class ${record.capitalizedName}SchemaBase(ABC):
valid_data = valid_data.drop(*validation_columns)
return valid_data

+## TODO revise validation for relations
+#if (false)
#foreach($relation in $record.relations)
#if($relation.isOneToManyRelation())
def _validate_with_${relation.snakeCaseName}_schema(self, dataset: DataFrame) -> bool:
raise NotImplementedError
#end
#end
+#end
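For context, the now-disabled generated code follows a common validation-column pattern: append a boolean `_VALID` column per check, keep only rows where every such column is true, then drop the helper columns. A pure-Python sketch of the idea (illustrative only; the real templates emit PySpark `withColumn`/`lit` calls, and the field names here are invented):

```python
# Input "dataset": two records, one valid, one not.
rows = [
    {"name": "Main St", "length": 12},
    {"name": None, "length": -1},
]

def add_validation_columns(row: dict) -> dict:
    # One boolean "<FIELD>_VALID" column per check, as the template does.
    validated = dict(row)
    validated["NAME_VALID"] = row["name"] is not None
    validated["LENGTH_VALID"] = (row["length"] or -1) >= 0
    return validated

with_validations = [add_validation_columns(r) for r in rows]
# Validation columns are exactly the columns not present in the input.
validation_columns = [c for c in with_validations[0] if c not in rows[0]]
# Keep rows where every validation column is True, then drop the helpers.
valid_rows = [
    {k: v for k, v in r.items() if k not in validation_columns}
    for r in with_validations
    if all(r[c] for c in validation_columns)
]
print(valid_rows)  # only the fully valid row survives
```

The commit's change is that the per-relation `_VALID` columns are no longer generated, so only the record's own checks contribute to this filter.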


@@ -164,6 +164,8 @@ public abstract class ${record.capitalizedName}SchemaBase extends SparkSchema {
#end
#end ;

+## TODO revise validation for relations
+#if (false)
#foreach($relation in $record.relations)
#if($relation.isOneToManyRelation())
dataWithValidations = dataWithValidations.withColumn(${relationVars[$relation.name]} + "_VALID", lit(validateWith${relation.capitalizedName}Schema(data.select(col(${relationVars[$relation.name]})))));
@@ -172,6 +174,7 @@ public abstract class ${record.capitalizedName}SchemaBase extends SparkSchema {
dataWithValidations = dataWithValidations.withColumn(${relationVars[$relation.name]} + "_VALID", lit(!${relation.uncapitalizedName}Schema.validateDataFrame(data.select(col(${relationVars[$relation.name]})), ${relationVars[$relation.name]} + ".").isEmpty()));
#end
#end
+#end

Column filterSchema = null;
List<String> validationColumns = new ArrayList<>();
@@ -277,6 +280,8 @@ public abstract class ${record.capitalizedName}SchemaBase extends SparkSchema {
}


+## TODO revise validation for relations
+#if (false)
#foreach ($relation in $record.relations)
#if ($relation.isOneToManyRelation())

@@ -295,4 +300,5 @@ public abstract class ${record.capitalizedName}SchemaBase extends SparkSchema {

#end
#end
+#end
}
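Both the PySpark and Java templates use the same disabling trick: wrapping a region in `#if (false)` ... `#end` makes Velocity skip it at generation time while keeping the code in the template source for later re-enablement. A hedged Python analogy of that generation-time toggle (all names here are invented for illustration):

```python
# Simulates the effect of Velocity's "#if (false) ... #end": the wrapped
# region never reaches the generated output, but remains in the template.
RELATION_VALIDATION_ENABLED = False  # flip to True when relation validation is revisited

def generate_schema_body(relations: list[str]) -> list[str]:
    lines = ["# validate record fields"]
    if RELATION_VALIDATION_ENABLED:  # plays the role of #if (false)
        for relation in relations:  # plays the role of #foreach
            lines.append(f"# validate relation: {relation}")
    return lines

print(generate_schema_body(["streets"]))  # relation checks are omitted
```

Guarding with a literal `false` (rather than deleting the block) matches the `## TODO revise validation for relations` comments: the generation logic is parked, not removed.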
@@ -23,6 +23,7 @@ Feature: Pyspark schema functionality works for relations
When a "City" object is mapped to a spark dataset using the record
Then the dataset has the correct values for the relational objects

+# TODO validation for invalid relations should fail
Scenario Outline: Records with a One to One relation can be validated using the spark schema
Given the spark schema is generated for the "PersonWithOneToOneRelation" record
And a "<validity>" "PersonWithOneToOneRelation" dataSet exists
@@ -31,8 +32,9 @@ Feature: Pyspark schema functionality works for relations
Examples:
| validity | success |
| valid | passes |
-| invalid | fails |
+| invalid | passes |

+# TODO validation for invalid relations should fail
Scenario Outline: Records with a Many to One relation can be validated using the spark schema
Given the spark schema is generated for the "PersonWithMToOneRelation" record
And a "<validity>" "PersonWithMToOneRelation" dataSet exists
@@ -41,13 +43,14 @@ Feature: Pyspark schema functionality works for relations
Examples:
| validity | success |
| valid | passes |
-| invalid | fails |
+| invalid | passes |

-Scenario Outline: Spark schemas generated fails to validate One to Many relations with not yet implemented exception
+# TODO validation for One to Many relations should include pass/fail testing
+Scenario Outline: Spark schemas generated validates One to Many relations
Given the spark schema is generated for the "City" record
And a "City" dataSet with "<valid_size>" valid "Street" and "<invalid_size>" invalid streets exists
When spark schema validation is performed on the "City" dataSet
-Then the dataSet validation raises a not implemented error
+Then the dataSet validation "passes"
Examples:
| valid_size | invalid_size |
| 1 | 0 |
@@ -162,8 +162,8 @@ public void sparkSchemaValidationIsPerformedOnThePersonWithMToOneRelationDataSet
}
}

-@When("spark schema validation is performed on the dataSet")
-public void sparkSchemaValidationIsPerformedOnTheDataSet() {
+@When("spark schema validation is performed on the \"City\" dataSet")
+public void sparkSchemaValidationIsPerformedOnTheCityDataSet() {
try {
this.validatedDataSet = this.citySchema.validateDataFrame(this.cityDataSet);
}catch (Exception e) {
@@ -22,6 +22,7 @@ Feature: Records with relations are generated correctly and function as expected
When a "City" POJO is mapped to a spark dataset using the schema
Then the dataset has the correct values for the relational objects

+# TODO validation for invalid relations should fail
Scenario Outline: Records with a One to One relation can be validated using the spark schema
Given the spark schema is generated for the "PersonWithOneToOneRelation" record
And a "<validity>" "PersonWithOneToOneRelation" dataSet exists
@@ -30,8 +31,9 @@ Feature: Records with relations are generated correctly and function as expected
Examples:
| validity | success |
| valid | passes |
-| invalid | fails |
+| invalid | passes |

+# TODO validation for invalid relations should fail
Scenario Outline: Records with a Many to One relation can be validated using the spark schema
Given the spark schema is generated for the "PersonWithMToOneRelation" record
And a "<validity>" "PersonWithMToOneRelation" dataSet exists
@@ -40,10 +42,11 @@ Feature: Records with relations are generated correctly and function as expected
Examples:
| validity | success |
| valid | passes |
-| invalid | fails |
+| invalid | passes |

-Scenario: Spark schemas generated fails to validate One to Many relations with not yet implemented exception
+# TODO validation for One to Many relations should include pass/fail testing
+Scenario: Spark schemas generated validates One to Many relations
Given the spark schema is generated for the "City" record
And a valid "City" dataSet exists
-When spark schema validation is performed on the dataSet
-Then the validation fails with NotYetImplementedException
+When spark schema validation is performed on the "City" dataSet
+Then the dataSet validation "passes"
