From 571b1a893442a0ac721d68f1e99c23f0ac88f2df Mon Sep 17 00:00:00 2001 From: Matthew Pope <81593196+popematt@users.noreply.github.com> Date: Thu, 14 Dec 2023 18:45:30 -0800 Subject: [PATCH] Add option to provide alternate regex implementation (#298) --- ion-schema/build.gradle | 2 + .../ionschema/IonSchemaSystemBuilder.kt | 28 ++++++- .../internal/ConstraintFactoryDefault.kt | 2 +- .../ionschema/internal/IonSchemaSystemImpl.kt | 4 +- .../ionschema/internal/constraint/Regex.kt | 23 +++--- .../ionschema/util/RegexImplementation.kt | 39 ++++++++++ .../AlternateRegexImplementationTest.kt | 78 +++++++++++++++++++ .../amazon/ionschema/IonSchemaTestsRunner.kt | 8 +- 8 files changed, 166 insertions(+), 18 deletions(-) create mode 100644 ion-schema/src/main/kotlin/com/amazon/ionschema/util/RegexImplementation.kt create mode 100644 ion-schema/src/test/kotlin/com/amazon/ionschema/AlternateRegexImplementationTest.kt diff --git a/ion-schema/build.gradle b/ion-schema/build.gradle index 95685f74..9b1f9e95 100644 --- a/ion-schema/build.gradle +++ b/ion-schema/build.gradle @@ -34,6 +34,8 @@ dependencies { testImplementation 'org.junit.jupiter:junit-jupiter-params:5.6.2' testImplementation 'io.mockk:mockk:1.13.3' testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine:5.6.2' + // For AlternateRegexImplementationTest + testImplementation "com.google.re2j:re2j:1.7" } processResources { diff --git a/ion-schema/src/main/kotlin/com/amazon/ionschema/IonSchemaSystemBuilder.kt b/ion-schema/src/main/kotlin/com/amazon/ionschema/IonSchemaSystemBuilder.kt index 5786da94..369e1378 100644 --- a/ion-schema/src/main/kotlin/com/amazon/ionschema/IonSchemaSystemBuilder.kt +++ b/ion-schema/src/main/kotlin/com/amazon/ionschema/IonSchemaSystemBuilder.kt @@ -20,6 +20,8 @@ import com.amazon.ion.system.IonSystemBuilder import com.amazon.ionschema.internal.ConstraintFactoryDefault import com.amazon.ionschema.internal.IonSchemaSystemImpl import com.amazon.ionschema.internal.WarningType +import 
com.amazon.ionschema.util.DefaultRegexImplementation +import com.amazon.ionschema.util.RegexImplementation import java.util.function.Consumer /** @@ -43,6 +45,7 @@ class IonSchemaSystemBuilder private constructor() { private var schemaCache: SchemaCache? = null private var params = mutableMapOf, Any>() private var warningCallback: ((() -> String) -> Unit)? = null + private var regexImplementation: RegexImplementation = DefaultRegexImplementation /** * Adds the provided authority to the list of [Authority]s. @@ -215,6 +218,28 @@ class IonSchemaSystemBuilder private constructor() { return this } + /** + * Sets the regex implementation to be used by the [IonSchemaSystem]. + * + * This can be used to replace the regex implementation in the Java standard library with an implementation of your + * own choosing. You might want to provide your own [RegexImplementation] in order to be able to set a timeout for + * evaluating inputs against a pattern, or to use an algorithm with different time or space complexity. + * + * For example, if you are accepting input from untrusted sources, you may choose to use a linear time algorithm for + * finding matches in order to protect against potential ReDoS attacks using + * [catastrophic backtracking](https://www.regular-expressions.info/catastrophic.html). + * + * See [AlternateRegexImplementationTest.kt](https://github.com/amazon-ion/ion-schema-kotlin/blob/master/ion-schema/src/test/kotlin/com/amazon/ionschema/AlternateRegexImplementationTest.kt) + * for an example of how one might implement [RegexImplementation] using a linear-time regex library. + * + * **WARNING**—if you supply your own [RegexImplementation] that differs from the ECMA standard, it may result in + * unexpected behavior when validating Ion data. 
+ */ + fun withRegexImplementation(regexImplementation: RegexImplementation): IonSchemaSystemBuilder { + this.regexImplementation = regexImplementation + return this + } + /** * Instantiates an [IonSchemaSystem] using the provided [Authority](s) * and IonSystem. @@ -225,6 +250,7 @@ class IonSchemaSystemBuilder private constructor() { constraintFactory, schemaCache ?: SchemaCacheDefault(), params, - (warningCallback ?: { }) + (warningCallback ?: { }), + regexImplementation, ) } diff --git a/ion-schema/src/main/kotlin/com/amazon/ionschema/internal/ConstraintFactoryDefault.kt b/ion-schema/src/main/kotlin/com/amazon/ionschema/internal/ConstraintFactoryDefault.kt index 15c45b6d..b4f5a4d0 100644 --- a/ion-schema/src/main/kotlin/com/amazon/ionschema/internal/ConstraintFactoryDefault.kt +++ b/ion-schema/src/main/kotlin/com/amazon/ionschema/internal/ConstraintFactoryDefault.kt @@ -86,7 +86,7 @@ internal object ConstraintFactoryDefault : ConstraintFactory { ConstraintConstructor("one_of", v1_0..v2_0, ::OneOf), ConstraintConstructor("ordered_elements", v1_0..v2_0, ::OrderedElements), ConstraintConstructor("precision", v1_0..v2_0, ::Precision), - ConstraintConstructor("regex", v1_0..v2_0) { ion, schema -> Regex(ion, schema.ionSchemaLanguageVersion) }, + ConstraintConstructor("regex", v1_0..v2_0) { ion, schema -> Regex(ion, schema.ionSchemaLanguageVersion, schema.getSchemaSystem().regexImplementation) }, ConstraintConstructor("scale", v1_0, ::Scale), ConstraintConstructor("timestamp_offset", v1_0..v2_0, ::TimestampOffset), ConstraintConstructor("timestamp_precision", v1_0..v2_0, ::TimestampPrecision), diff --git a/ion-schema/src/main/kotlin/com/amazon/ionschema/internal/IonSchemaSystemImpl.kt b/ion-schema/src/main/kotlin/com/amazon/ionschema/internal/IonSchemaSystemImpl.kt index ee9d8dbc..8141c4a0 100644 --- a/ion-schema/src/main/kotlin/com/amazon/ionschema/internal/IonSchemaSystemImpl.kt +++ b/ion-schema/src/main/kotlin/com/amazon/ionschema/internal/IonSchemaSystemImpl.kt @@ 
-24,6 +24,7 @@ import com.amazon.ionschema.IonSchemaSystem import com.amazon.ionschema.IonSchemaVersion import com.amazon.ionschema.SchemaCache import com.amazon.ionschema.internal.util.islRequireNotNull +import com.amazon.ionschema.util.RegexImplementation /** * Implementation of [IonSchemaSystem]. @@ -34,7 +35,8 @@ internal class IonSchemaSystemImpl( private val constraintFactory: ConstraintFactory, private val schemaCache: SchemaCache, private val params: Map, Any>, - private val warnCallback: (() -> String) -> Unit + private val warnCallback: (() -> String) -> Unit, + internal val regexImplementation: RegexImplementation, ) : IonSchemaSystem { private val schemaContentCache = SchemaContentCache(this::loadSchemaContent) diff --git a/ion-schema/src/main/kotlin/com/amazon/ionschema/internal/constraint/Regex.kt b/ion-schema/src/main/kotlin/com/amazon/ionschema/internal/constraint/Regex.kt index db733097..6166dba2 100644 --- a/ion-schema/src/main/kotlin/com/amazon/ionschema/internal/constraint/Regex.kt +++ b/ion-schema/src/main/kotlin/com/amazon/ionschema/internal/constraint/Regex.kt @@ -24,7 +24,7 @@ import com.amazon.ionschema.Violation import com.amazon.ionschema.Violations import com.amazon.ionschema.internal.util.islRequire import com.amazon.ionschema.internal.util.validateRegexPattern -import java.util.regex.Pattern +import com.amazon.ionschema.util.RegexImplementation /** * Implements the regex constraint. 
This implementation translates @@ -39,38 +39,39 @@ import java.util.regex.Pattern */ internal class Regex( ion: IonValue, - private val islVersion: IonSchemaVersion + islVersion: IonSchemaVersion, + regexImplementation: RegexImplementation, ) : ConstraintBase(ion) { - private val pattern: Pattern + private val pattern: RegexImplementation.Pattern init { islRequire(ion is IonString && !ion.isNullValue && ion.stringValue().isNotEmpty()) { "Regex must be a non-empty string; but was: $ion" } - var flags = 0 + var multiline = false + var caseInsensitive = false ion.typeAnnotations.forEach { - val flag = when (it) { - "i" -> Pattern.CASE_INSENSITIVE - "m" -> Pattern.MULTILINE + when (it) { + "i" -> caseInsensitive = true + "m" -> multiline = true else -> throw InvalidSchemaException( "Unrecognized flags for regex ($ion)" ) } - flags = flags.or(flag) } val patternString = validateRegexPattern(ion.stringValue(), islVersion) - pattern = Pattern.compile(patternString, flags) + pattern = regexImplementation.compile(patternString, multiline, caseInsensitive) } override fun validate(value: IonValue, issues: Violations) { validateAs(value, issues) { v -> - if (!pattern.matcher(v.stringValue()).find()) { + if (!pattern.test(v.stringValue())) { issues.add( Violation( ion, "regex_mismatch", - "'${v.stringValue()}' doesn't match regex '${pattern.pattern()}'" + "'${v.stringValue()}' doesn't match regex '${pattern.pattern}'" ) ) } diff --git a/ion-schema/src/main/kotlin/com/amazon/ionschema/util/RegexImplementation.kt b/ion-schema/src/main/kotlin/com/amazon/ionschema/util/RegexImplementation.kt new file mode 100644 index 00000000..ba95e267 --- /dev/null +++ b/ion-schema/src/main/kotlin/com/amazon/ionschema/util/RegexImplementation.kt @@ -0,0 +1,39 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0
+
+package com.amazon.ionschema.util
+
+import java.util.function.Predicate
+import java.util.regex.Pattern as JPattern
+
+/**
+ * Interface that allows any regular expression implementation to be injected into an
+ * [`IonSchemaSystem`][com.amazon.ionschema.IonSchemaSystem].
+ *
+ * See [`IonSchemaSystemBuilder.withRegexImplementation`]
+ * [com.amazon.ionschema.IonSchemaSystemBuilder.withRegexImplementation] for details.
+ */
+interface RegexImplementation {
+
+    /** Compile a [pattern] string into a [Pattern] using the given [multiline] and [caseInsensitive] flags. */
+    fun compile(pattern: String, multiline: Boolean, caseInsensitive: Boolean): Pattern
+
+    /** An abstraction over a compiled regular expression regardless of the particular regex implementation. */
+    open class Pattern(
+        /** The regular expression from which this pattern was compiled */
+        val pattern: String,
+        /** A predicate which can be used for finding a match on a subsequence of a string. */
+        test: Predicate<String>
+    ) : Predicate<String> by test
+}
+
+/** Default [RegexImplementation] used by Ion Schema Kotlin. This is backed by the Java standard library. */
+object DefaultRegexImplementation : RegexImplementation {
+
+    override fun compile(pattern: String, multiline: Boolean, caseInsensitive: Boolean): RegexImplementation.Pattern {
+        val flags = (if (multiline) JPattern.MULTILINE else 0) or
+            (if (caseInsensitive) JPattern.CASE_INSENSITIVE else 0)
+        // asPredicate() tests with find(), so a match on any subsequence of the input passes.
+        return RegexImplementation.Pattern(pattern, JPattern.compile(pattern, flags).asPredicate())
+    }
+}
diff --git a/ion-schema/src/test/kotlin/com/amazon/ionschema/AlternateRegexImplementationTest.kt b/ion-schema/src/test/kotlin/com/amazon/ionschema/AlternateRegexImplementationTest.kt
new file mode 100644
index 00000000..c6d79385
--- /dev/null
+++ b/ion-schema/src/test/kotlin/com/amazon/ionschema/AlternateRegexImplementationTest.kt
@@ -0,0 +1,78 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package com.amazon.ionschema
+
+import com.amazon.ionschema.util.RegexImplementation
+import com.google.re2j.Pattern
+
+/**
+ * A [RegexImplementation] backed by `re2j`, which promises linear time evaluation of regular expressions.
+ *
+ * More information is available in the [`google/re2j`](https://github.com/google/re2j/) repository on GitHub.
+ *
+ * Be careful: `re2j` is not fully compliant with the Ion Schema Specification because it does not recognize
+ * `\r`, `\u2028`, or `\u2029` as newline characters. That is often acceptable, since *nix systems use `\n`,
+ * but data produced on Windows (where a newline is `\r\n`) may not validate as expected.
+ *
+ * Rewriting every `.` in the pattern as `[^\n\r\u2028\u2029]` before compiling looks like an easy fix, but it
+ * breaks down when the multiline flag is set: `^` and `$` would still treat `\n` as the only newline.
+ *
+ * The workaround this class offers (enabled with `consolidateNewlines = true`) is to replace every `\r`,
+ * `\u2028`, and `\u2029` with `\n` in both the pattern and the input before matching. That is _probably_
+ * sufficient for any use case that does not need to tell the different newline characters apart.
+ *
+ * The pattern string reported by the compiled [RegexImplementation.Pattern] is always the original, unmodified one.
+ */
+class Re2jRegexImplementation(private val consolidateNewlines: Boolean = false) : RegexImplementation {
+
+    override fun compile(pattern: String, multiline: Boolean, caseInsensitive: Boolean): RegexImplementation.Pattern {
+        val multilineFlag = if (multiline) Pattern.MULTILINE else 0
+        val caseFlag = if (caseInsensitive) Pattern.CASE_INSENSITIVE else 0
+        val compiled = Pattern.compile(normalized(pattern), multilineFlag + caseFlag)
+
+        // The pattern string handed to RegexImplementation.Pattern is the caller's original, not the normalized copy.
+        return RegexImplementation.Pattern(pattern) { input ->
+            compiled.matcher(normalized(input)).find()
+        }
+    }
+
+    /**
+     * Returns [text] with its newlines consolidated when this instance was constructed with
+     * `consolidateNewlines = true`; otherwise returns [text] unchanged.
+     */
+    private fun normalized(text: String): String = if (consolidateNewlines) text.consolidateNewlines() else text
+
+    /**
+     * Produces a copy of the receiver in which every `\r`, `\u2028`, and `\u2029` has been replaced with `\n`.
+     */
+    private fun String.consolidateNewlines(): String = buildString(length) {
+        for (ch in this@consolidateNewlines) append(consolidateNewlineChars(ch))
+    }
+
+    /** Maps `\r`, `\u2028`, and `\u2029` to `\n`; any other [c] is returned unchanged. */
+    private fun consolidateNewlineChars(c: Char): Char = when {
+        c == '\r' || c == '\u2028' || c == '\u2029' -> '\n'
+        else -> c
+    }
+}
+
+class IonSchemaTests_1_0_AlternateRegex : TestFactory by IonSchemaTestsRunner(
+    islVersion = IonSchemaVersion.v1_0,
+    systemBuilder = IonSchemaSystemBuilder.standard()
+        .allowTransitiveImports(false)
+        // Some cases in ion-schema-tests use \r, so we need to consolidate the newlines.
+        .withRegexImplementation(Re2jRegexImplementation(consolidateNewlines = true))
+)
+
+class IonSchemaTests_2_0_AlternateRegex : TestFactory by IonSchemaTestsRunner(
+    islVersion = IonSchemaVersion.v2_0,
+    systemBuilder = IonSchemaSystemBuilder.standard()
+        .allowTransitiveImports(false)
+        // Some cases in ion-schema-tests use \r, so we need to consolidate the newlines.
+ .withRegexImplementation(Re2jRegexImplementation(consolidateNewlines = true)), + // This one test fails because it's checking to make sure that '\n' and '\r' aren't interchangeable + testNameFilter = { + it != "[constraints/regex.isl] Type 'regex_unescaped_newline' should not match value: \"hello\\rworld\"" + } +) diff --git a/ion-schema/src/test/kotlin/com/amazon/ionschema/IonSchemaTestsRunner.kt b/ion-schema/src/test/kotlin/com/amazon/ionschema/IonSchemaTestsRunner.kt index e098fd75..3253f514 100644 --- a/ion-schema/src/test/kotlin/com/amazon/ionschema/IonSchemaTestsRunner.kt +++ b/ion-schema/src/test/kotlin/com/amazon/ionschema/IonSchemaTestsRunner.kt @@ -100,7 +100,7 @@ class IonSchemaTestsRunner( .map { createValueTestCase(schemaId, type, it, expectValid = true) } val shouldNotMatch = (ion["should_reject_as_invalid"] as IonList? ?: emptyList()) .map { createValueTestCase(schemaId, type, it, expectValid = false) } - dynamicContainer(schemaId, shouldMatch + shouldNotMatch) + dynamicContainer(schemaId, (shouldMatch + shouldNotMatch).filter { testNameFilter(it.displayName) }) } isInvalidSchemasTestCase(ion) -> createSchemasTestCases(schemaId, ion, expectValid = false) @@ -114,13 +114,13 @@ class IonSchemaTestsRunner( assertThrows { schema.newType(it as IonStruct) } } } - dynamicContainer("[$schemaId] $baseDescription", cases) + dynamicContainer("[$schemaId] $baseDescription", cases.filter { testNameFilter(it.displayName) }) } else -> dynamicTest(schemaId) { throw IllegalArgumentException("Malformed test input: $ion") } } } - return dynamicContainer(schemaId, f.toURI(), dynamicNodeTestCases.stream().filter { testNameFilter(it.displayName) }) + return dynamicContainer(schemaId, f.toURI(), dynamicNodeTestCases.stream()) } private fun createSchemasTestCases(schemaId: String, ion: IonStruct, expectValid: Boolean): DynamicNode { @@ -134,7 +134,7 @@ class IonSchemaTestsRunner( assertThrows { schemaSystem.newSchema(it.asDocument().iterator()) } } } - return 
dynamicContainer("[$schemaId] $baseDescription", cases) + return dynamicContainer("[$schemaId] $baseDescription", cases.filter { testNameFilter(it.displayName) }) } private fun createValueTestCase(schemaId: String, testType: Type, value: IonValue, expectValid: Boolean): DynamicNode {