Skip to content

Commit

Permalink
Pass regex flags into character class construction
Browse files Browse the repository at this point in the history
This makes it possible to start implementing flags related to case
sensitivity and Unicode support. Already fixed:

  - Without `DOTALL`, line terminators are properly excluded from `.`
  - With `CASE_INSENSITIVE`, code point sets are case insensitive

Unicode casing and extended Unicode classes aren't implemented yet.
  • Loading branch information
harpocrates committed Jun 30, 2022
1 parent 7251c5e commit 7576fe8
Show file tree
Hide file tree
Showing 10 changed files with 235 additions and 101 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ features) installed as well as [SBT][0]. Then,
sbt test # compile and run the tests
sbt bench/Jmh/run # compile and run all JMH benchmarks

There's also a test driver that consumes `.txt` test cases like those used in
the OpenJDK tests for `java.util.regex` (in `test/jdk/java/util/regex/*.txt`):

sbt 'tester/run TestCases.txt'

## References

This would not have been possible without the following papers and blogs:
Expand Down
126 changes: 81 additions & 45 deletions src/main/java/automata/parser/BuiltinClass.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package automata.parser;

import automata.util.IntRangeSet;
import java.util.Map;
import java.util.regex.Pattern;

/**
* Special character classes.
Expand All @@ -13,8 +15,13 @@ public enum BuiltinClass {
*/
DOT {
@Override
public <D> D desugar(CharClassVisitor<D> visitor) {
return visitor.visitRange(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
public <D> D desugar(CharClassVisitor<D> visitor, int flags) {
D dotAll = visitor.visitRange(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT, flags);
if ((flags & Pattern.DOTALL) == 0) {
final D lineTerminator = visitor.visitNegated(LINE_TERMINATOR.desugar(visitor, flags));
dotAll = visitor.visitIntersection(dotAll, lineTerminator);
}
return dotAll;
}
},

Expand All @@ -23,8 +30,16 @@ public <D> D desugar(CharClassVisitor<D> visitor) {
*/
DIGIT {
@Override
public <D> D desugar(CharClassVisitor<D> visitor) {
return visitor.visitRange('0', '9');
public <D> D desugar(CharClassVisitor<D> visitor, int flags) {
if ((flags & Pattern.UNICODE_CHARACTER_CLASS) != 0) {
final var unicodeDigits = IntRangeSet.matching(
CodePoints.UNICODE_RANGE,
Character::isDigit
);
return visitor.visitCodePointSet(unicodeDigits).get();
} else {
return visitor.visitRange('0', '9', flags);
}
}
},

Expand All @@ -33,8 +48,8 @@ public <D> D desugar(CharClassVisitor<D> visitor) {
*/
NON_DIGIT {
@Override
public <D> D desugar(CharClassVisitor<D> visitor) {
return visitor.visitNegated(DIGIT.desugar(visitor));
public <D> D desugar(CharClassVisitor<D> visitor, int flags) {
return visitor.visitNegated(DIGIT.desugar(visitor, flags));
}
},

Expand All @@ -43,16 +58,16 @@ public <D> D desugar(CharClassVisitor<D> visitor) {
*/
HORIZONTAL_WHITE_SPACE {
@Override
public <D> D desugar(CharClassVisitor<D> visitor) {
D space = visitor.visitCharacter(' ');
space = visitor.visitUnion(space, visitor.visitCharacter('\t'));
space = visitor.visitUnion(space, visitor.visitCharacter('\u00A0'));
space = visitor.visitUnion(space, visitor.visitCharacter('\u1680'));
space = visitor.visitUnion(space, visitor.visitCharacter('\u180e'));
space = visitor.visitUnion(space, visitor.visitRange('\u2000', '\u200a'));
space = visitor.visitUnion(space, visitor.visitCharacter('\u202f'));
space = visitor.visitUnion(space, visitor.visitCharacter('\u205f'));
space = visitor.visitUnion(space, visitor.visitCharacter('\u3000'));
public <D> D desugar(CharClassVisitor<D> visitor, int flags) {
D space = visitor.visitCharacter(' ', flags);
space = visitor.visitUnion(space, visitor.visitCharacter('\t', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\u00A0', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\u1680', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\u180e', flags));
space = visitor.visitUnion(space, visitor.visitRange('\u2000', '\u200a', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\u202f', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\u205f', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\u3000', flags));
return space;
}
},
Expand All @@ -62,8 +77,8 @@ public <D> D desugar(CharClassVisitor<D> visitor) {
*/
NON_HORIZONTAL_WHITE_SPACE {
@Override
public <D> D desugar(CharClassVisitor<D> visitor) {
return visitor.visitNegated(HORIZONTAL_WHITE_SPACE.desugar(visitor));
public <D> D desugar(CharClassVisitor<D> visitor, int flags) {
return visitor.visitNegated(HORIZONTAL_WHITE_SPACE.desugar(visitor, flags));
}
},

Expand All @@ -72,13 +87,13 @@ public <D> D desugar(CharClassVisitor<D> visitor) {
*/
WHITE_SPACE {
@Override
public <D> D desugar(CharClassVisitor<D> visitor) {
D space = visitor.visitCharacter(' ');
space = visitor.visitUnion(space, visitor.visitCharacter('\t'));
space = visitor.visitUnion(space, visitor.visitCharacter('\n'));
space = visitor.visitUnion(space, visitor.visitCharacter('\u000B'));
space = visitor.visitUnion(space, visitor.visitCharacter('\f'));
space = visitor.visitUnion(space, visitor.visitCharacter('\r'));
public <D> D desugar(CharClassVisitor<D> visitor, int flags) {
D space = visitor.visitCharacter(' ', flags);
space = visitor.visitUnion(space, visitor.visitCharacter('\t', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\n', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\u000B', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\f', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\r', flags));
return space;
}
},
Expand All @@ -88,8 +103,8 @@ public <D> D desugar(CharClassVisitor<D> visitor) {
*/
NON_WHITE_SPACE {
@Override
public <D> D desugar(CharClassVisitor<D> visitor) {
return visitor.visitNegated(WHITE_SPACE.desugar(visitor));
public <D> D desugar(CharClassVisitor<D> visitor, int flags) {
return visitor.visitNegated(WHITE_SPACE.desugar(visitor, flags));
}
},

Expand All @@ -98,14 +113,14 @@ public <D> D desugar(CharClassVisitor<D> visitor) {
*/
VERTICAL_WHITE_SPACE {
@Override
public <D> D desugar(CharClassVisitor<D> visitor) {
D space = visitor.visitCharacter('\n');
space = visitor.visitUnion(space, visitor.visitCharacter('\u000B'));
space = visitor.visitUnion(space, visitor.visitCharacter('\f'));
space = visitor.visitUnion(space, visitor.visitCharacter('\r'));
space = visitor.visitUnion(space, visitor.visitCharacter('\u0085'));
space = visitor.visitUnion(space, visitor.visitCharacter('\u2028'));
space = visitor.visitUnion(space, visitor.visitCharacter('\u2029'));
public <D> D desugar(CharClassVisitor<D> visitor, int flags) {
D space = visitor.visitCharacter('\n', flags);
space = visitor.visitUnion(space, visitor.visitCharacter('\u000B', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\f', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\r', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\u0085', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\u2028', flags));
space = visitor.visitUnion(space, visitor.visitCharacter('\u2029', flags));
return space;
}
},
Expand All @@ -115,8 +130,8 @@ public <D> D desugar(CharClassVisitor<D> visitor) {
*/
NON_VERTICAL_WHITE_SPACE {
@Override
public <D> D desugar(CharClassVisitor<D> visitor) {
return visitor.visitNegated(VERTICAL_WHITE_SPACE.desugar(visitor));
public <D> D desugar(CharClassVisitor<D> visitor, int flags) {
return visitor.visitNegated(VERTICAL_WHITE_SPACE.desugar(visitor, flags));
}
},

Expand All @@ -125,11 +140,11 @@ public <D> D desugar(CharClassVisitor<D> visitor) {
*/
WORD {
@Override
public <D> D desugar(CharClassVisitor<D> visitor) {
D word = visitor.visitCharacter('_');
word = visitor.visitUnion(word, visitor.visitRange('a', 'z'));
word = visitor.visitUnion(word, visitor.visitRange('A', 'Z'));
word = visitor.visitUnion(word, visitor.visitRange('0', '9'));
public <D> D desugar(CharClassVisitor<D> visitor, int flags) {
D word = visitor.visitCharacter('_', flags);
word = visitor.visitUnion(word, visitor.visitRange('a', 'z', flags));
word = visitor.visitUnion(word, visitor.visitRange('A', 'Z', flags));
word = visitor.visitUnion(word, visitor.visitRange('0', '9', flags));
return word;
}
},
Expand All @@ -139,12 +154,33 @@ public <D> D desugar(CharClassVisitor<D> visitor) {
*/
NON_WORD {
@Override
public <D> D desugar(CharClassVisitor<D> visitor) {
return visitor.visitNegated(WORD.desugar(visitor));
public <D> D desugar(CharClassVisitor<D> visitor, int flags) {
return visitor.visitNegated(WORD.desugar(visitor, flags));
}
},

/**
* Line terminator.
*
* Note: this is not a user-writable character class. It is also not _exactly_
* what is understood by a "line terminator" in the context of `^` or `$`
* under multiline mode: it is missing the two-code-point sequence `\r\n`.
*/
LINE_TERMINATOR {
  @Override
  public <D> D desugar(CharClassVisitor<D> visitor, int flags) {
    // '\n' is a line terminator regardless of the flags.
    final D newline = visitor.visitCharacter('\n', flags);

    // Under UNIX_LINES, '\n' is the only terminator: nothing more to add.
    if ((flags & Pattern.UNIX_LINES) != 0) {
      return newline;
    }

    // Otherwise, union in the remaining single-code-point terminators, in order.
    D terminators = newline;
    for (final char extra : new char[] { '\r', '\u0085', '\u2028', '\u2029' }) {
      terminators = visitor.visitUnion(terminators, visitor.visitCharacter(extra, flags));
    }
    return terminators;
  }
};

public abstract <D> D desugar(CharClassVisitor<D> visitor);
public abstract <D> D desugar(CharClassVisitor<D> visitor, int flags);

/**
* Mapping from the character used to represent the class to the class.
Expand Down
32 changes: 27 additions & 5 deletions src/main/java/automata/parser/CharClassVisitor.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
package automata.parser;

import automata.util.IntRangeSet;
import java.util.Optional;

/**
* Bottom-up traversal of the character class AST.
*
Expand All @@ -16,16 +19,18 @@ public interface CharClassVisitor<C> {
* code units).
*
* @param codePoint unicode code point to match
* @param flags bitmask of regular expression flags
*/
C visitCharacter(int codePoint);
C visitCharacter(int codePoint, int flags);

/**
* Matches a range of abstract characters.
*
* @param startCodePoint first code point in the range (inclusive)
* @param endCodePoint last code point in the range (inclusive)
* @param flags bitmask of regular expression flags
*/
C visitRange(int startCodePoint, int endCodePoint);
C visitRange(int startCodePoint, int endCodePoint, int flags);

/**
* Matches all characters that don't match another pattern.
Expand Down Expand Up @@ -58,23 +63,40 @@ public interface CharClassVisitor<C> {
* Matches characters inside a builtin character class.
*
* @param cls builtin class
* @param flags bitmask of regular expression flags
*/
C visitBuiltinClass(BuiltinClass cls);
C visitBuiltinClass(BuiltinClass cls, int flags);

/**
* Matches characters inside a specified unicode block.
*
* @param block unicode block
* @param negated match characters outside the block
* @param flags bitmask of regular expression flags
*/
C visitUnicodeBlock(Character.UnicodeBlock block, boolean negated);
C visitUnicodeBlock(Character.UnicodeBlock block, boolean negated, int flags);

/**
* Matches characters inside a specified unicode script.
*
* @param script unicode script
* @param negated match characters outside the script
* @param flags bitmask of regular expression flags
*/
C visitUnicodeScript(Character.UnicodeScript script, boolean negated, int flags);

/**
* Matches any character inside the set of code points.
*
* @param codePointSet set of accepted code points
* @return the combined match, or an empty {@code Optional} if the code point set is empty
*/
C visitUnicodeScript(Character.UnicodeScript script, boolean negated);
default Optional<C> visitCodePointSet(IntRangeSet codePoints) {
  // Visit every range of the set on its own, with no flags set (0).
  final var perRange = codePoints
    .ranges()
    .stream()
    .<C>map(bounds -> visitRange(bounds.lowerBound(), bounds.upperBound(), 0));

  // Fold the per-range results together with unions; no ranges yields an empty Optional.
  return perRange.reduce(this::visitUnion);
}
}

29 changes: 21 additions & 8 deletions src/main/java/automata/parser/CodePointSetVisitor.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import automata.util.IntRange;
import automata.util.IntRangeSet;
import java.util.Optional;
import java.util.regex.Pattern;

/**
* Interpret character classes into range sets of unicode code points.
Expand All @@ -16,13 +18,19 @@
public class CodePointSetVisitor implements CharClassVisitor<IntRangeSet> {

@Override
public IntRangeSet visitCharacter(int codePoint) {
return IntRangeSet.of(IntRange.single(codePoint));
public IntRangeSet visitCharacter(int codePoint, int flags) {
final var single = IntRangeSet.of(IntRange.single(codePoint));
return ((flags & Pattern.CASE_INSENSITIVE) == 0)
? single
: CodePoints.asciiCaseInsensitive(single);
}

@Override
public IntRangeSet visitRange(int startCodePoint, int endCodePoint) {
return IntRangeSet.of(IntRange.between(startCodePoint, endCodePoint));
public IntRangeSet visitRange(int startCodePoint, int endCodePoint, int flags) {
final var range = IntRangeSet.of(IntRange.between(startCodePoint, endCodePoint));
return ((flags & Pattern.CASE_INSENSITIVE) == 0)
? range
: CodePoints.asciiCaseInsensitive(range);
}

@Override
Expand All @@ -41,20 +49,25 @@ public IntRangeSet visitIntersection(IntRangeSet lhs, IntRangeSet rhs) {
}

@Override
public IntRangeSet visitBuiltinClass(BuiltinClass cls) {
return cls.desugar(this);
public IntRangeSet visitBuiltinClass(BuiltinClass cls, int flags) {
return cls.desugar(this, flags);
}

@Override
public IntRangeSet visitUnicodeScript(Character.UnicodeScript script, boolean negated) {
public IntRangeSet visitUnicodeScript(Character.UnicodeScript script, boolean negated, int flags) {
final var scriptCodePoints = CodePoints.scriptCodePoints(script);
return negated ? visitNegated(scriptCodePoints) : scriptCodePoints;
}

@Override
public IntRangeSet visitUnicodeBlock(Character.UnicodeBlock block, boolean negated) {
public IntRangeSet visitUnicodeBlock(Character.UnicodeBlock block, boolean negated, int flags) {
final var blockCodePoints = CodePoints.blockCodePoints(block);
return negated ? visitNegated(blockCodePoints) : blockCodePoints;
}

@Override
public Optional<IntRangeSet> visitCodePointSet(IntRangeSet codePoints) {
  // This visitor's result type is itself a code point set, so the input is
  // already the answer: wrap it as-is instead of rebuilding it range by range.
  final Optional<IntRangeSet> wrapped = Optional.of(codePoints);
  return wrapped;
}
}

Loading

0 comments on commit 7576fe8

Please sign in to comment.