Skip to content

Commit

Permalink
#220 switch to lookup tables in hotspots isNameChar()/isNameStartChar…
Browse files Browse the repository at this point in the history
…() for better performance
  • Loading branch information
winfriedgerlach committed Nov 27, 2024
1 parent c72367f commit 1c1293c
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 20 deletions.
45 changes: 25 additions & 20 deletions src/main/java/com/ctc/wstx/io/WstxInputData.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

import com.ctc.wstx.util.XmlChars;

import java.util.stream.IntStream;

/**
* Base class used by readers (specifically, by
* {@link com.ctc.wstx.sr.StreamScanner}, and its sub-classes)
Expand Down Expand Up @@ -50,6 +52,23 @@ public class WstxInputData
*/
public final static int MAX_UNICODE_CHAR = 0x10FFFF;

/**
 * Lookup table covering the 7-bit ASCII range (indexes 0-127): an entry is
 * {@code true} when the character with that code is one of 'a'-'z',
 * 'A'-'Z' or '_'. Consulted by {@code isNameStartChar()} for chars below 128.
 */
private static final boolean[] asciiNameStartChars = new boolean[128];
static {
    // Walk the upper- and lower-case alphabets in lock step.
    for (char upper = 'A', lower = 'a'; upper <= 'Z'; ++upper, ++lower) {
        asciiNameStartChars[upper] = true;
        asciiNameStartChars[lower] = true;
    }
    asciiNameStartChars['_'] = true;
}

/**
 * Lookup table covering the 7-bit ASCII range (indexes 0-127): an entry is
 * {@code true} when the character with that code is one of 'a'-'z',
 * 'A'-'Z', '0'-'9', '.', '-' or '_'. Consulted by {@code isNameChar()} for
 * chars below 128.
 */
private static final boolean[] asciiNameChars = new boolean[128];
static {
    for (int code = 0; code < asciiNameChars.length; ++code) {
        final boolean letter = (code >= 'a' && code <= 'z') || (code >= 'A' && code <= 'Z');
        final boolean digit = (code >= '0' && code <= '9');
        final boolean punctuation = (code == '.') || (code == '-') || (code == '_');
        asciiNameChars[code] = letter || digit || punctuation;
    }
}

/*
////////////////////////////////////////////////////
// Configuration
Expand Down Expand Up @@ -153,14 +172,9 @@ protected final boolean isNameStartChar(char c)
/* First, let's handle 7-bit ascii range (identical between xml
* 1.0 and 1.1)
*/
if (c <= 0x7A) { // 'z' or earlier
if (c >= 0x61) { // 'a' - 'z' are ok
return true;
}
if (c < 0x41) { // before 'A' just white space
return false;
}
return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
if (c < 128) {
// this is performance critical, so we use a lookup table instead of if-branches
return asciiNameStartChars[c];
}
/* Ok, otherwise we need to use big honking bit sets... which
* differ between 1.0 and 1.1
Expand All @@ -178,18 +192,9 @@ protected final boolean isNameStartChar(char c)
protected final boolean isNameChar(char c)
{
// First, let's handle 7-bit ascii range
if (c <= 0x7A) { // 'z' or earlier
if (c >= 0x61) { // 'a' - 'z' are ok
return true;
}
if (c <= 0x5A) {
if (c >= 0x41) { // 'A' - 'Z' ok too
return true;
}
// As are 0-9, '.' and '-'
return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-');
}
return (c == 0x5F); // '_' is ok too
if (c < 128) {
// this is performance critical, so we use a lookup table instead of if-branches
return asciiNameChars[c];
}
return mXml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
}
Expand Down
77 changes: 77 additions & 0 deletions src/test/java/com/ctc/wstx/io/WstxInputDataTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package com.ctc.wstx.io;

import com.ctc.wstx.util.XmlChars;
import junit.framework.TestCase;
import org.junit.Test;

import java.util.stream.IntStream;

/**
 * Checks that the lookup-table based {@code isNameStartChar()} and
 * {@code isNameChar()} of {@link WstxInputData} return the same results as the
 * previous branch-based implementations, which are kept in this class as
 * reference oracles.
 */
public class WstxInputDataTest extends TestCase {

    @Test
    public void testIsNameStartCharBehavesSameAsBranchyVersion() {
        WstxInputData wstxInputDataXml10 = new WstxInputData();
        WstxInputData wstxInputDataXml11 = new WstxInputData();
        wstxInputDataXml11.mXml11 = true;

        // All 7-bit ASCII characters plus a few neighbours on either side:
        // 128..137 above, and -10..-1, which the cast to char wraps to 0xFFF6..0xFFFF.
        IntStream.range(-10, 138).forEach(i -> {
            char c = (char) i;
            assertEquals(describe("isNameStartChar", c, false),
                    isNameStartCharBranchy(c, false), wstxInputDataXml10.isNameStartChar(c));
            assertEquals(describe("isNameStartChar", c, true),
                    isNameStartCharBranchy(c, true), wstxInputDataXml11.isNameStartChar(c));
        });
    }

    /**
     * Previous implementation with branches, used as the expected-value oracle.
     *
     * @param c character to classify
     * @param mXml11 {@code true} to apply XML 1.1 rules outside the branchy range, {@code false} for XML 1.0
     * @return whether {@code c} was accepted as a name start character by the old code
     */
    private static boolean isNameStartCharBranchy(char c, boolean mXml11) {
        /* First, let's handle 7-bit ascii range (identical between xml
         * 1.0 and 1.1)
         */
        if (c <= 0x7A) { // 'z' or earlier
            if (c >= 0x61) { // 'a' - 'z' are ok
                return true;
            }
            if (c < 0x41) { // before 'A' just white space
                return false;
            }
            return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
        }
        /* Ok, otherwise we need to use big honking bit sets... which
         * differ between 1.0 and 1.1
         */
        return mXml11 ? XmlChars.is11NameStartChar(c) : XmlChars.is10NameStartChar(c);
    }

    @Test
    public void testIsNameCharBehavesSameAsBranchyVersion() {
        WstxInputData wstxInputDataXml10 = new WstxInputData();
        WstxInputData wstxInputDataXml11 = new WstxInputData();
        wstxInputDataXml11.mXml11 = true;

        // All 7-bit ASCII characters plus a few neighbours on either side:
        // 128..137 above, and -10..-1, which the cast to char wraps to 0xFFF6..0xFFFF.
        IntStream.range(-10, 138).forEach(i -> {
            char c = (char) i;
            assertEquals(describe("isNameChar", c, false),
                    isNameCharBranchy(c, false), wstxInputDataXml10.isNameChar(c));
            assertEquals(describe("isNameChar", c, true),
                    isNameCharBranchy(c, true), wstxInputDataXml11.isNameChar(c));
        });
    }

    /**
     * Previous implementation with branches, used as the expected-value oracle.
     *
     * @param c character to classify
     * @param mXml11 {@code true} to apply XML 1.1 rules outside the branchy range, {@code false} for XML 1.0
     * @return whether {@code c} was accepted as a name character by the old code
     */
    private static boolean isNameCharBranchy(char c, boolean mXml11) {
        // First, let's handle 7-bit ascii range
        if (c <= 0x7A) { // 'z' or earlier
            if (c >= 0x61) { // 'a' - 'z' are ok
                return true;
            }
            if (c <= 0x5A) {
                if (c >= 0x41) { // 'A' - 'Z' ok too
                    return true;
                }
                // As are 0-9, '.' and '-'
                return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-');
            }
            return (c == 0x5F); // '_' is ok too
        }
        return mXml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
    }

    /**
     * Builds the assertion message so that a failure names the method, the
     * offending character code and the XML version under test.
     */
    private static String describe(String method, char c, boolean xml11) {
        return String.format("%s mismatch for char 0x%04X (XML %s)",
                method, (int) c, xml11 ? "1.1" : "1.0");
    }
}

0 comments on commit 1c1293c

Please sign in to comment.