snowplow · colmsnowplow · Aug 11, 2022 · Aug 4, 2022 · colmsnowplow · Aug 8, 2022
diff --git a/config/test-fixtures/transform-mixed-filtered.hcl b/config/test-fixtures/transform-mixed-filtered.hcl
@@ -8,7 +8,7 @@ transform {
 
 transform {
   use "spEnrichedFilter" {
-    field = "app_id"
+    atomic_field = "app_id"
     regex = "wrong"
     regex_timeout = 10
   }

diff --git a/pkg/transform/snowplow_enriched_filter.go b/pkg/transform/snowplow_enriched_filter.go
@@ -8,8 +8,8 @@ package transform
 
 import (
 	"fmt"
-	"log"
 	"regexp"
+	"strconv"
 	"strings"
 	"time"
 
@@ -21,119 +21,55 @@ import (
 	"github.com/snowplow-devops/stream-replicator/pkg/models"
 )
 
-func findSpEnrichedFilterValue(queriedField, parsedEventName, eventVer, field string, parsedMessage analytics.ParsedEvent, path []interface{}) ([]interface{}, error) {
-	var vf interface{}
-	var valueFound []interface{}
-	var err error
-
-	switch {
-	case strings.HasPrefix(queriedField, `contexts_`):
-		vf, err = parsedMessage.GetContextValue(queriedField, path...)
-		valueFound = append(valueFound, vf.([]interface{})...)
-	case strings.HasPrefix(queriedField, `unstruct_event`):
-		eventNameFull := `unstruct_event_` + parsedEventName
-		if queriedField == eventNameFull || queriedField == eventNameFull+`_`+eventVer {
-			vf, err = parsedMessage.GetUnstructEventValue(path...)
-			valueFound = append(valueFound, vf)
-		}
-	default:
-		vf, err = parsedMessage.GetValue(field)
-		valueFound = append(valueFound, vf)
+func evaluateSpEnrichedFilter(re *regexp2.Regexp, valuesFound []interface{}) bool {
+	// A nil valuesFound means no value was found; it is replaced by a one-element slice holding nil.
+	// Negative-match regexes must still be evaluated in that case, so the nil element is matched as "" in the loop below.
+	if valuesFound == nil {
+		valuesFound = make([]interface{}, 1)
 	}
-	if err != nil {
-		// GetValue returns an error if the field requested is empty. Check for that particular error before returning error
-		if err.Error() == analytics.EmptyFieldErr {
-			return nil, nil
+	for _, v := range valuesFound {
+		if v == nil {
+			v = "" // match nil as the empty string: formatting nil with %v below would otherwise yield `<nil>`
 		}
-		return nil, err
-	}
-	return valueFound, nil
-}
 
-func evaluateSpEnrichedFilter(valuesFound []interface{}, regex string, regexTimeout int) bool {
-	re, err := regexp2.Compile(regex, 0)
-	re.MatchTimeout = time.Duration(regexTimeout) * time.Second
-	if err != nil {
-		log.Fatal(errors.Wrap(err, `error compiling regex for filter`))
-	}
-	for _, v := range valuesFound {
 		if ok, _ := re.MatchString(fmt.Sprintf("%v", v)); ok {
 			return true
 		}
 	}
 	return false
 }
 
-// createSpEnrichedFilterFunction returns a TransformationFunction which filters messages based on a field in the Snowplow enriched event
-// and a regex declared by the user.
-func createSpEnrichedFilterFunction(field, regex string, regexTimeout int, isUnstructEvent bool) (TransformationFunction, error) {
+func createSpEnrichedFilterFunction(regex string, regexTimeout int, getFunc valueGetter) (TransformationFunction, error) {
+	if regexTimeout == 0 {
+		// a regexTimeout of 0 means "not set": fall back to the default of 10 seconds
+		regexTimeout = 10
+	}
+
+	// regexToMatch evaluates the filter; it is compiled once here and err is checked before it is touched, so an invalid regex returns an error instead of dereferencing an unusable result.
+	regexToMatch, err := regexp2.Compile(regex, 0)
+	if err != nil {
+		return nil, errors.Wrap(err, `error compiling regex for filter`)
+	}
+	regexToMatch.MatchTimeout = time.Duration(regexTimeout) * time.Second
+
 	return func(message *models.Message, intermediateState interface{}) (*models.Message, *models.Message, *models.Message, interface{}) {
-		if regexTimeout == 0 {
-			// default timeout for regex is 10 seconds
-			regexTimeout = 10
-		}
+
 		// Evaluate intermediateState to parsedEvent
 		parsedMessage, parseErr := IntermediateAsSpEnrichedParsed(intermediateState, message)
 		if parseErr != nil {
 			message.SetError(parseErr)
 			return nil, nil, message, nil
 		}
 
-		// This regex retrieves the path fields
-		// (e.g. field1.field2[0].field3 -> [field1, field2, 0, field3])
-		regexWords := `\w+`
-		re := regexp.MustCompile(regexWords)
-
-		// separate the path string into words using regex
-		path := re.FindAllString(field, -1)
-		separatedPath := make([]string, len(path)-1)
-		for idx, pathField := range path[1:] {
-			separatedPath[idx] = pathField
-		}
-
-		var parsedEventName string
-		var eventMajorVer string
-		var err error
-
-		// only call SDK functions if an unstruct_event is being filtered
-		if isUnstructEvent {
-			// get event name
-			eventName, err := parsedMessage.GetValue(`event_name`)
-			if err != nil {
-				message.SetError(err)
-				return nil, nil, message, nil
-			}
-			parsedEventName = eventName.(string)
-			// get event version
-			fullEventVer, err := parsedMessage.GetValue(`event_version`)
-			if err != nil {
-				message.SetError(err)
-				return nil, nil, message, nil
-			}
-			// get the major event version
-			eventMajorVer = strings.Split(fullEventVer.(string), `-`)[0]
-			if eventMajorVer == `` {
-				message.SetError(fmt.Errorf(`invalid schema version format: %s`, fullEventVer))
-				return nil, nil, message, nil
-			}
-		}
-
-		// find the value in the event
-		valueFound, err := findSpEnrichedFilterValue(
-			path[0],
-			parsedEventName,
-			eventMajorVer,
-			field,
-			parsedMessage,
-			convertPathToInterfaces(separatedPath),
-		)
+		// get the value(s) to match through the injected getter; a getter error fails the message
+		valueFound, err := getFunc(parsedMessage)
 		if err != nil {
 			message.SetError(err)
 			return nil, nil, message, nil
 		}
 
 		// evaluate whether the found value passes the filter, determining if the message should be kept
-		shouldKeepMessage := evaluateSpEnrichedFilter(valueFound, regex, regexTimeout)
+		shouldKeepMessage := evaluateSpEnrichedFilter(regexToMatch, valueFound)
 
 		// if message is not to be kept, return it as a filtered message to be acked in the main function
 		if !shouldKeepMessage {
@@ -145,17 +81,158 @@ func createSpEnrichedFilterFunction(field, regex string, regexTimeout int, isUns
 	}, nil
 }
 
+// valueGetter is a function that can hold the logic for getting values in the case of base, context, and unstruct fields,
+// which respectively require different logic. It returns the values found in the parsed event (nil when none) or an error.
+type valueGetter func(analytics.ParsedEvent) ([]interface{}, error)
+
+// Because each type of value requires different arguments, we use these `make` functions to construct them.
+// This allows us to unit test each one, plug them into the createSpEnrichedFilterFunction constructor,
+// and to construct them so that field names/paths and regexes are handled only once, at startup.
+
+// makeBaseValueGetter returns a valueGetter for base-level values: it looks up field with GetValue and wraps the result in a one-element slice.
+func makeBaseValueGetter(field string) valueGetter {
+	return func(parsedMessage analytics.ParsedEvent) (value []interface{}, err error) {
+		// find the value in the event
+		valueFound, err := parsedMessage.GetValue(field)
+		// An empty field is not an error here - it just means the value is nil, so only other errors are returned.
+		if err != nil && err.Error() != analytics.EmptyFieldErr {
+			return nil, err
+		}
+		return []interface{}{valueFound}, nil
+	}
+}
+
 // NewSpEnrichedFilterFunction returns a TransformationFunction which filters messages based on a field in the Snowplow enriched event.
 func NewSpEnrichedFilterFunction(field, regex string, regexTimeout int) (TransformationFunction, error) {
-	return createSpEnrichedFilterFunction(field, regex, regexTimeout, false)
+	// field names the base-level field to read, regex is the pattern its value is matched against, and regexTimeout is in seconds.
+	// getBaseValueForMatch is responsible for retrieving data from the message for base fields
+	getBaseValueForMatch := makeBaseValueGetter(field)
+
+	return createSpEnrichedFilterFunction(regex, regexTimeout, getBaseValueForMatch)
+}
+
+// makeContextValueGetter creates a valueGetter for context data: it reads the value(s) at path from the context called name, returning nil when nothing is found.
+func makeContextValueGetter(name string, path []interface{}) valueGetter {
+	return func(parsedMessage analytics.ParsedEvent) ([]interface{}, error) {
+		value, err := parsedMessage.GetContextValue(name, path...)
+		// We don't return an error for empty field since this just means the value is nil.
+		if err != nil && err.Error() != analytics.EmptyFieldErr {
+			return nil, err
+		}
+		// bug in analytics sdk requires the type assertion below. https://github.com/snowplow/snowplow-golang-analytics-sdk/issues/36
+		// GetContextValue should always return []interface{} but instead it returns an interface{} which always contains type []interface{}
+
+		// if it's nil, return nil - we just didn't find any value.
+		if value == nil {
+			return nil, nil
+		}
+		// otherwise, type assertion; anything other than []interface{} is reported as an error rather than panicking.
+		valueFound, ok := value.([]interface{})
+		if !ok {
+			return nil, errors.New(fmt.Sprintf("Context filter encountered unexpected type in getting value for path %v", path))
+		}
+
+		return valueFound, nil
+	}
 }
 
 // NewSpEnrichedFilterFunctionContext returns a TransformationFunction for filtering a context
-func NewSpEnrichedFilterFunctionContext(field, regex string, regexTimeout int) (TransformationFunction, error) {
-	return createSpEnrichedFilterFunction(field, regex, regexTimeout, false)
+func NewSpEnrichedFilterFunctionContext(contextFullName, pathToField, regex string, regexTimeout int) (TransformationFunction, error) {
+	// pathToField is parsed once here, when the filter is constructed, so a malformed path is reported before any message is processed.
+	path, err := parsePathToArguments(pathToField)
+	if err != nil {
+		return nil, errors.Wrap(err, "error creating Context filter function")
+	}
+
+	// getContextValuesForMatch is responsible for retrieving data at path from the context named contextFullName
+	getContextValuesForMatch := makeContextValueGetter(contextFullName, path)
+
+	return createSpEnrichedFilterFunction(regex, regexTimeout, getContextValuesForMatch)
+}
+
+// makeUnstructValueGetter creates a valueGetter for unstruct data. It yields nil unless event_name equals eventName and event_version matches versionRegex.
+func makeUnstructValueGetter(eventName string, versionRegex *regexp.Regexp, path []interface{}) valueGetter {
+	return func(parsedMessage analytics.ParsedEvent) (value []interface{}, err error) {
+		eventNameFound, err := parsedMessage.GetValue(`event_name`)
+		if err != nil { // This field can't be empty for a valid event, so we return all errors here
+			return nil, err
+		}
+		if eventNameFound != eventName { // If we don't have an exact match on event name, we return nil value
+			return nil, nil
+		}
+		versionFound, err := parsedMessage.GetValue(`event_version`)
+		if err != nil { // This field can't be empty for a valid event, so we return all errors here
+			return nil, err
+		}
+		if !versionRegex.MatchString(versionFound.(string)) { // If we don't match the provided version regex, return nil value. NOTE(review): the unchecked .(string) assertion panics if event_version is ever not a string - confirm the SDK guarantees it.
+			return nil, nil
+		}
+
+		valueFound, err := parsedMessage.GetUnstructEventValue(path...)
+		// We don't return an error for empty field (or a "not found" path) since this just means the value is nil.
+		if err != nil && err.Error() != analytics.EmptyFieldErr && !strings.Contains(err.Error(), "not found") {
+			// This last clause exists because of this: https://github.com/snowplow/snowplow-golang-analytics-sdk/issues/37
+			// TODO: Fix that and remove it as soon as possible.
+			return nil, err
+		}
+
+		if valueFound == nil {
+			return nil, nil
+		}
+
+		return []interface{}{valueFound}, nil
+	}
 }
 
 // NewSpEnrichedFilterFunctionUnstructEvent returns a TransformationFunction for filtering an unstruct_event
-func NewSpEnrichedFilterFunctionUnstructEvent(field, regex string, regexTimeout int) (TransformationFunction, error) {
-	return createSpEnrichedFilterFunction(field, regex, regexTimeout, true)
+func NewSpEnrichedFilterFunctionUnstructEvent(eventNameToMatch, eventVersionToMatch, pathToField, regex string, regexTimeout int) (TransformationFunction, error) {
+	// eventNameToMatch is compared for exact equality; eventVersionToMatch is itself a regex, compiled once below.
+	path, err := parsePathToArguments(pathToField)
+	if err != nil {
+		return nil, errors.Wrap(err, "error creating Unstruct filter function")
+	}
+
+	versionRegex, err := regexp.Compile(eventVersionToMatch)
+	if err != nil {
+		return nil, errors.Wrap(err, fmt.Sprint("Failed to compile regex: ", eventVersionToMatch))
+	}
+
+	// getUnstructValuesForMatch is responsible for retrieving data from the message for unstruct event fields.
+	// It also checks that the event's name and version match the ones provided, and returns nil if not.
+	getUnstructValuesForMatch := makeUnstructValueGetter(eventNameToMatch, versionRegex, path)
+
+	return createSpEnrichedFilterFunction(regex, regexTimeout, getUnstructValuesForMatch)
+}
+
+// parsePathToArguments parses a string path to custom data (eg. `test1.test2[0].test3`)
+// into the slice of interfaces expected by the analytics SDK's Get() methods: string keys stay strings, `[n]` elements become ints.
+func parsePathToArguments(pathToField string) ([]interface{}, error) {
+	// validate that an edge case (unmatched brace) isn't present. NOTE(review): only the counts are compared, so a non-numeric index such as `a[x]` is not rejected and parses to ["a", "x]"].
+	if strings.Count(pathToField, "[") != strings.Count(pathToField, "]") {
+		return nil, errors.New(fmt.Sprint("unmatched brace in path: ", pathToField))
+	}
+
+	// regex to separate path into components: either a bracketed array index, or a run of characters containing no `.` or `[`
+	re := regexp.MustCompile(`\[\d+\]|[^\.\[]+`)
+	parts := re.FindAllString(pathToField, -1)
+
+	// regex to identify array index components such as `[0]`
+	arrayRegex := regexp.MustCompile(`\[\d+\]`)
+
+	convertedPath := make([]interface{}, 0)
+	for _, part := range parts {
+
+		if arrayRegex.MatchString(part) { // handle arrays first
+			intPart, err := strconv.Atoi(part[1 : len(part)-1]) // strip braces and convert to int
+			if err != nil {
+				return nil, errors.New(fmt.Sprint("error parsing path element: ", part))
+			}
+
+			convertedPath = append(convertedPath, intPart)
+		} else { // handle strings
+			convertedPath = append(convertedPath, part)
+		}
+
+	}
+	return convertedPath, nil
 }