Skip to content

Commit

Permalink
Add option to provide alternate regex implementation (#298)
Browse files Browse the repository at this point in the history
  • Loading branch information
popematt authored Dec 15, 2023
1 parent 92e0a25 commit 571b1a8
Show file tree
Hide file tree
Showing 8 changed files with 166 additions and 18 deletions.
2 changes: 2 additions & 0 deletions ion-schema/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ dependencies {
testImplementation 'org.junit.jupiter:junit-jupiter-params:5.6.2'
testImplementation 'io.mockk:mockk:1.13.3'
testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine:5.6.2'
// For AlternateRegexImplementationTest
testImplementation "com.google.re2j:re2j:1.7"
}

processResources {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import com.amazon.ion.system.IonSystemBuilder
import com.amazon.ionschema.internal.ConstraintFactoryDefault
import com.amazon.ionschema.internal.IonSchemaSystemImpl
import com.amazon.ionschema.internal.WarningType
import com.amazon.ionschema.util.DefaultRegexImplementation
import com.amazon.ionschema.util.RegexImplementation
import java.util.function.Consumer

/**
Expand All @@ -43,6 +45,7 @@ class IonSchemaSystemBuilder private constructor() {
private var schemaCache: SchemaCache? = null
private var params = mutableMapOf<IonSchemaSystemImpl.Param<*>, Any>()
private var warningCallback: ((() -> String) -> Unit)? = null
private var regexImplementation: RegexImplementation = DefaultRegexImplementation

/**
* Adds the provided authority to the list of [Authority]s.
Expand Down Expand Up @@ -215,6 +218,28 @@ class IonSchemaSystemBuilder private constructor() {
return this
}

/**
 * Chooses the [RegexImplementation] that the built [IonSchemaSystem] will use.
 *
 * When this method is never called, the builder keeps [DefaultRegexImplementation], which is backed by the
 * regex support in the Java standard library. Supplying a different implementation lets you, for instance,
 * enforce a timeout while evaluating an input against a pattern, or pick an algorithm with different time or
 * space complexity.
 *
 * A typical motivation is input from untrusted sources: a linear-time matching algorithm protects against
 * ReDoS attacks that exploit
 * [catastrophic backtracking](https://www.regular-expressions.info/catastrophic.html).
 *
 * [AlternateRegexImplementationTest.kt](https://github.com/amazon-ion/ion-schema-kotlin/blob/master/ion-schema/src/test/kotlin/com/amazon/ionschema/AlternateRegexImplementationTest.kt)
 * shows one way to implement [RegexImplementation] on top of a linear-time regex library.
 *
 * **WARNING**: a [RegexImplementation] whose behavior differs from the ECMA standard may lead to unexpected
 * results when validating Ion data.
 *
 * @param regexImplementation the implementation to use in place of the current one.
 * @return this builder, to allow call chaining.
 */
fun withRegexImplementation(regexImplementation: RegexImplementation): IonSchemaSystemBuilder = apply {
    // The unqualified name is the parameter; `this.` selects the builder's private property.
    this.regexImplementation = regexImplementation
}

/**
* Instantiates an [IonSchemaSystem] using the provided [Authority](s)
* and IonSystem.
Expand All @@ -225,6 +250,7 @@ class IonSchemaSystemBuilder private constructor() {
constraintFactory,
schemaCache ?: SchemaCacheDefault(),
params,
(warningCallback ?: { })
(warningCallback ?: { }),
regexImplementation,
)
}
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ internal object ConstraintFactoryDefault : ConstraintFactory {
ConstraintConstructor("one_of", v1_0..v2_0, ::OneOf),
ConstraintConstructor("ordered_elements", v1_0..v2_0, ::OrderedElements),
ConstraintConstructor("precision", v1_0..v2_0, ::Precision),
ConstraintConstructor("regex", v1_0..v2_0) { ion, schema -> Regex(ion, schema.ionSchemaLanguageVersion) },
ConstraintConstructor("regex", v1_0..v2_0) { ion, schema -> Regex(ion, schema.ionSchemaLanguageVersion, schema.getSchemaSystem().regexImplementation) },
ConstraintConstructor("scale", v1_0, ::Scale),
ConstraintConstructor("timestamp_offset", v1_0..v2_0, ::TimestampOffset),
ConstraintConstructor("timestamp_precision", v1_0..v2_0, ::TimestampPrecision),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import com.amazon.ionschema.IonSchemaSystem
import com.amazon.ionschema.IonSchemaVersion
import com.amazon.ionschema.SchemaCache
import com.amazon.ionschema.internal.util.islRequireNotNull
import com.amazon.ionschema.util.RegexImplementation

/**
* Implementation of [IonSchemaSystem].
Expand All @@ -34,7 +35,8 @@ internal class IonSchemaSystemImpl(
private val constraintFactory: ConstraintFactory,
private val schemaCache: SchemaCache,
private val params: Map<Param<out Any>, Any>,
private val warnCallback: (() -> String) -> Unit
private val warnCallback: (() -> String) -> Unit,
internal val regexImplementation: RegexImplementation,
) : IonSchemaSystem {

private val schemaContentCache = SchemaContentCache(this::loadSchemaContent)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import com.amazon.ionschema.Violation
import com.amazon.ionschema.Violations
import com.amazon.ionschema.internal.util.islRequire
import com.amazon.ionschema.internal.util.validateRegexPattern
import java.util.regex.Pattern
import com.amazon.ionschema.util.RegexImplementation

/**
* Implements the regex constraint. This implementation translates
Expand All @@ -39,38 +39,39 @@ import java.util.regex.Pattern
*/
internal class Regex(
ion: IonValue,
private val islVersion: IonSchemaVersion
islVersion: IonSchemaVersion,
regexImplementation: RegexImplementation,
) : ConstraintBase(ion) {

private val pattern: Pattern
private val pattern: RegexImplementation.Pattern

init {
islRequire(ion is IonString && !ion.isNullValue && ion.stringValue().isNotEmpty()) {
"Regex must be a non-empty string; but was: $ion"
}

var flags = 0
var multiline = false
var caseInsensitive = false
ion.typeAnnotations.forEach {
val flag = when (it) {
"i" -> Pattern.CASE_INSENSITIVE
"m" -> Pattern.MULTILINE
when (it) {
"i" -> caseInsensitive = true
"m" -> multiline = true
else -> throw InvalidSchemaException(
"Unrecognized flags for regex ($ion)"
)
}
flags = flags.or(flag)
}
val patternString = validateRegexPattern(ion.stringValue(), islVersion)
pattern = Pattern.compile(patternString, flags)
pattern = regexImplementation.compile(patternString, multiline, caseInsensitive)
}

override fun validate(value: IonValue, issues: Violations) {
validateAs<IonText>(value, issues) { v ->
if (!pattern.matcher(v.stringValue()).find()) {
if (!pattern.test(v.stringValue())) {
issues.add(
Violation(
ion, "regex_mismatch",
"'${v.stringValue()}' doesn't match regex '${pattern.pattern()}'"
"'${v.stringValue()}' doesn't match regex '${pattern.pattern}'"
)
)
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

package com.amazon.ionschema.util

import java.util.function.Predicate
import java.util.regex.Pattern as JPattern

/**
* Interface that allows any regular expression implementation to be injected into an
* [`IonSchemaSystem`][com.amazon.ionschema.IonSchemaSystem].
*
* An implementation is a factory: it turns a pattern string plus two flags into a [Pattern], which is then
* used as a predicate over input strings.
*
* See [`IonSchemaSystemBuilder.withRegexImplementation`]
* [com.amazon.ionschema.IonSchemaSystemBuilder.withRegexImplementation] for details.
*/
interface RegexImplementation {

/**
* Compile a [pattern] string into a [Pattern].
*
* @param pattern the regular expression source text.
* @param multiline whether the pattern should be compiled in multiline mode.
* @param caseInsensitive whether the pattern should be compiled in case-insensitive mode.
* @return a [Pattern] that retains [pattern] and tests input strings against it.
*/
fun compile(pattern: String, multiline: Boolean, caseInsensitive: Boolean): Pattern

/**
* An abstraction over a compiled regular expression regardless of the particular regex implementation.
*
* The class implements [Predicate] by delegating to the predicate passed to the constructor, so calling
* `test(input)` on a [Pattern] runs the supplied predicate.
*/
open class Pattern(
/** The regular expression from which this pattern was compiled */
val pattern: String,
/** A predicate which can be used for finding a match on a subsequence of a string. */
test: Predicate<String>
) : Predicate<String> by test
}

/**
 * The [RegexImplementation] that Ion Schema Kotlin uses unless another one is supplied.
 *
 * Patterns are compiled with `java.util.regex.Pattern` from the Java standard library.
 */
object DefaultRegexImplementation : RegexImplementation {

    /**
     * Compiles [pattern] with the Java standard library, translating [multiline] and [caseInsensitive] into
     * the corresponding `java.util.regex.Pattern` flags.
     *
     * The returned pattern's predicate is the one produced by `asPredicate()` on the compiled Java pattern.
     */
    override fun compile(pattern: String, multiline: Boolean, caseInsensitive: Boolean): RegexImplementation.Pattern {
        var flags = 0
        if (multiline) flags = flags or JPattern.MULTILINE
        if (caseInsensitive) flags = flags or JPattern.CASE_INSENSITIVE

        val matchPredicate = JPattern.compile(pattern, flags).asPredicate()
        return RegexImplementation.Pattern(pattern, matchPredicate)
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

package com.amazon.ionschema

import com.amazon.ionschema.util.RegexImplementation
import com.google.re2j.Pattern

/**
 * A [RegexImplementation] backed by [`google/re2j`](https://github.com/google/re2j/), a library that promises
 * linear-time evaluation.
 *
 * Use with caution! `re2j` is not fully compliant with the Ion Schema Specification: it does not treat `\r`,
 * `\u2028`, or `\u2029` as newline characters. Many applications are unaffected because *nix systems use `\n`,
 * but data originating on Windows (which uses `\r\n`) may validate unexpectedly.
 *
 * Rewriting every `.` in the pattern as `[^\n\r\u2028\u2029]` is a tempting workaround, but it is not enough:
 * with the multiline flag set, `^` and `$` still only recognize `\n`.
 *
 * The workaround this class optionally provides is to rewrite both the pattern and every input so that `\r`,
 * `\u2028`, and `\u2029` all become `\n`. That is _probably_ sufficient for any use case that does not need to
 * tell the different newline characters apart.
 *
 * @param consolidateNewlines when `true`, apply the newline rewriting described above to the pattern and to
 * each input before matching; when `false` (the default), pass both to `re2j` unchanged.
 */
class Re2jRegexImplementation(private val consolidateNewlines: Boolean = false) : RegexImplementation {

    /**
     * Compiles [pattern] with `re2j` and wraps it in a [RegexImplementation.Pattern] whose predicate reports
     * whether a match can be found anywhere in the input.
     *
     * The returned object always exposes the original, unmodified [pattern] string, even when the text that
     * was actually compiled had its newlines consolidated.
     */
    override fun compile(pattern: String, multiline: Boolean, caseInsensitive: Boolean): RegexImplementation.Pattern {
        var flags = 0
        if (multiline) flags = flags or Pattern.MULTILINE
        if (caseInsensitive) flags = flags or Pattern.CASE_INSENSITIVE

        val compiled = Pattern.compile(pattern.prepared(), flags)

        return RegexImplementation.Pattern(pattern) { input ->
            compiled.matcher(input.prepared()).find()
        }
    }

    /**
     * Returns the receiver unchanged unless [consolidateNewlines] is set, in which case every `\r`, `\u2028`,
     * and `\u2029` is replaced with `\n`.
     */
    private fun String.prepared(): String =
        if (consolidateNewlines) {
            replace('\r', '\n').replace('\u2028', '\n').replace('\u2029', '\n')
        } else {
            this
        }
}

/**
* Test factory that delegates to [IonSchemaTestsRunner] for [IonSchemaVersion.v1_0], using a schema system
* configured with [Re2jRegexImplementation] rather than the default regex implementation.
*/
class IonSchemaTests_1_0_AlternateRegex : TestFactory by IonSchemaTestsRunner(
islVersion = IonSchemaVersion.v1_0,
systemBuilder = IonSchemaSystemBuilder.standard()
.allowTransitiveImports(false)
// Some cases in ion-schema-tests use \r, so we need to consolidate the newlines.
.withRegexImplementation(Re2jRegexImplementation(consolidateNewlines = true))
)

/**
* Test factory that delegates to [IonSchemaTestsRunner] for [IonSchemaVersion.v2_0], using a schema system
* configured with [Re2jRegexImplementation] rather than the default regex implementation.
*
* One test case is excluded by name through `testNameFilter`; see the comment on that argument.
*/
class IonSchemaTests_2_0_AlternateRegex : TestFactory by IonSchemaTestsRunner(
islVersion = IonSchemaVersion.v2_0,
systemBuilder = IonSchemaSystemBuilder.standard()
.allowTransitiveImports(false)
// Some cases in ion-schema-tests use \r, so we need to consolidate the newlines.
.withRegexImplementation(Re2jRegexImplementation(consolidateNewlines = true)),
// This one test fails because it's checking to make sure that '\n' and '\r' aren't interchangeable,
// which is exactly the distinction that consolidateNewlines = true removes.
testNameFilter = {
it != "[constraints/regex.isl] Type 'regex_unescaped_newline' should not match value: \"hello\\rworld\""
}
)
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ class IonSchemaTestsRunner(
.map { createValueTestCase(schemaId, type, it, expectValid = true) }
val shouldNotMatch = (ion["should_reject_as_invalid"] as IonList? ?: emptyList<IonValue>())
.map { createValueTestCase(schemaId, type, it, expectValid = false) }
dynamicContainer(schemaId, shouldMatch + shouldNotMatch)
dynamicContainer(schemaId, (shouldMatch + shouldNotMatch).filter { testNameFilter(it.displayName) })
}

isInvalidSchemasTestCase(ion) -> createSchemasTestCases(schemaId, ion, expectValid = false)
Expand All @@ -114,13 +114,13 @@ class IonSchemaTestsRunner(
assertThrows<InvalidSchemaException> { schema.newType(it as IonStruct) }
}
}
dynamicContainer("[$schemaId] $baseDescription", cases)
dynamicContainer("[$schemaId] $baseDescription", cases.filter { testNameFilter(it.displayName) })
}

else -> dynamicTest(schemaId) { throw IllegalArgumentException("Malformed test input: $ion") }
}
}
return dynamicContainer(schemaId, f.toURI(), dynamicNodeTestCases.stream().filter { testNameFilter(it.displayName) })
return dynamicContainer(schemaId, f.toURI(), dynamicNodeTestCases.stream())
}

private fun createSchemasTestCases(schemaId: String, ion: IonStruct, expectValid: Boolean): DynamicNode {
Expand All @@ -134,7 +134,7 @@ class IonSchemaTestsRunner(
assertThrows<InvalidSchemaException> { schemaSystem.newSchema(it.asDocument().iterator()) }
}
}
return dynamicContainer("[$schemaId] $baseDescription", cases)
return dynamicContainer("[$schemaId] $baseDescription", cases.filter { testNameFilter(it.displayName) })
}

private fun createValueTestCase(schemaId: String, testType: Type, value: IonValue, expectValid: Boolean): DynamicNode {
Expand Down

0 comments on commit 571b1a8

Please sign in to comment.