Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support to optionally allow surrogate pair entities (#165) #174

Merged
merged 13 commits into from
Jan 16, 2024
Merged
5 changes: 5 additions & 0 deletions release-notes/CREDITS
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,8 @@ Guillaume Nodet (@gnodet)
* Contributed #176: Fix parser when not replacing entities and treating char references
as entities
(6.6.0)

Kamil Gołębiewski (@Magmaruss)

* Contributed #165: Add support to optionally allow surrogate pair entities
(6.6.0)
2 changes: 2 additions & 0 deletions release-notes/VERSION
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ Project: woodstox

6.6.0 (not yet released)

#165: Add support to optionally allow surrogate pair entities
(contributed by Kamil G)
#176: Fix parser when not replacing entities and treating char references
as entities
(contributed by Guillaume N)
Expand Down
31 changes: 30 additions & 1 deletion src/main/java/com/ctc/wstx/api/ReaderConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,11 @@ public final class ReaderConfig

final static int PROP_MAX_DTD_DEPTH = 69;

/**
* @since 6.6
*/
final static int PROP_ALLOW_SURROGATE_PAIR_ENTITIES = 70;

/*
////////////////////////////////////////////////
// Limits for numeric properties
Expand Down Expand Up @@ -361,6 +366,8 @@ public final class ReaderConfig
PROP_UNDECLARED_ENTITY_RESOLVER);
sProperties.put(WstxInputProperties.P_BASE_URL,
PROP_BASE_URL);
sProperties.put(WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES,
PROP_ALLOW_SURROGATE_PAIR_ENTITIES);
sProperties.put(WstxInputProperties.P_INPUT_PARSING_MODE,
PROP_INPUT_PARSING_MODE);
}
Expand Down Expand Up @@ -412,13 +419,20 @@ public final class ReaderConfig

// since 5.4/6.4
protected int mMaxDtdDepth = DEFAULT_MAX_DTD_DEPTH;

/**
* Base URL to use as the resolution context for relative entity
* references
*/
protected URL mBaseURL;

/**
* Whether to allow surrogate pairs as entities (2 code-points as one target character).
*
* @since 6.6
*/
protected boolean mAllowSurrogatePairEntities = false;

/**
* Parsing mode can be changed from the default xml compliant
* behavior to one of alternate modes (fragment processing,
Expand Down Expand Up @@ -583,6 +597,7 @@ public ReaderConfig createNonShared(SymbolTable sym)
rc.mMaxEntityDepth = mMaxEntityDepth;
rc.mMaxEntityCount = mMaxEntityCount;
rc.mMaxDtdDepth = mMaxDtdDepth;
rc.mAllowSurrogatePairEntities = mAllowSurrogatePairEntities;
if (mSpecialProperties != null) {
int len = mSpecialProperties.length;
Object[] specProps = new Object[len];
Expand Down Expand Up @@ -792,6 +807,10 @@ public XMLResolver getUndeclaredEntityResolver() {

/**
 * Accessor for the configured base URL.
 *
 * @return current value of {@code mBaseURL}; may be {@code null} if none has been set
 */
public URL getBaseURL() {
    return this.mBaseURL;
}

/**
 * Accessor for the surrogate pair entity setting.
 *
 * @return current value of {@code mAllowSurrogatePairEntities}
 */
public boolean allowsSurrogatePairEntities()
{
    return this.mAllowSurrogatePairEntities;
}

/**
 * Accessor for the configured input parsing mode.
 *
 * @return current value of {@code mParsingMode}
 */
public WstxInputProperties.ParsingMode getInputParsingMode()
{
    return this.mParsingMode;
}
Expand Down Expand Up @@ -1074,6 +1093,10 @@ public void setUndeclaredEntityResolver(XMLResolver r) {
}

/**
 * Mutator for the configured base URL.
 *
 * @param baseURL value to store in {@code mBaseURL}
 */
public void setBaseURL(URL baseURL) {
    this.mBaseURL = baseURL;
}

/**
 * Mutator for the surrogate pair entity setting.
 *
 * @param state new value to store in {@code mAllowSurrogatePairEntities}
 */
public void doAllowSurrogatePairEntities(boolean state)
{
    this.mAllowSurrogatePairEntities = state;
}

public void setInputParsingMode(WstxInputProperties.ParsingMode mode) {
mParsingMode = mode;
Expand Down Expand Up @@ -1533,6 +1556,8 @@ public Object getProperty(int id)
return getUndeclaredEntityResolver();
case PROP_BASE_URL:
return getBaseURL();
case PROP_ALLOW_SURROGATE_PAIR_ENTITIES:
return allowsSurrogatePairEntities();
case PROP_INPUT_PARSING_MODE:
return getInputParsingMode();

Expand Down Expand Up @@ -1757,6 +1782,10 @@ public boolean setProperty(String propName, int id, Object value)
setBaseURL(u);
}
break;

case PROP_ALLOW_SURROGATE_PAIR_ENTITIES:
doAllowSurrogatePairEntities(ArgUtil.convertToBoolean(propName, value));
break;

case PROP_INPUT_PARSING_MODE:
setInputParsingMode((WstxInputProperties.ParsingMode) value);
Expand Down
9 changes: 9 additions & 0 deletions src/main/java/com/ctc/wstx/api/WstxInputProperties.java
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,15 @@ public final class WstxInputProperties
* DTD subset).
*/
public final static String P_BASE_URL = "com.ctc.wstx.baseURL";

/**
* Property of type {@link java.lang.Boolean} that, when enabled, allows
* parsing of high Unicode characters written as surrogate pairs (2 code points).
* Defaults to {@code Boolean.FALSE}, because this is not standard behavior.
*
* @since 6.6
*/
cowtowncoder marked this conversation as resolved.
Show resolved Hide resolved
public final static String P_ALLOW_SURROGATE_PAIR_ENTITIES = "com.ctc.wstx.allowSurrogatePairEntities";

// // // Alternate parsing modes

Expand Down
19 changes: 7 additions & 12 deletions src/main/java/com/ctc/wstx/sr/BasicStreamReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -3671,9 +3671,7 @@ private int skipCoalescedText(int i)
private int skipTokenText(int i)
throws XMLStreamException
{
/* Fairly easy; except for potential to have entities
* expand to some crap?
*/
// Fairly easy; except for potential to have entities expand to some crap?
int count = 0;

main_loop:
Expand All @@ -3690,18 +3688,15 @@ && resolveSimpleEntity(true) != 0) {
;
} else {
i = fullyResolveEntity(true);
/* Either way, it's just fine; we don't care about
* returned single-char value.
*/
// Either way, it's just fine; we don't care about
// returned single-char value.
}
} else {
/* Can only skip character entities; others need to
* be returned separately.
*/
// Can only skip character entities; others need to
// be returned separately.
if (resolveCharOnlyEntity(true) == 0) {
/* Now points to the char after ampersand, and we need
* to return the ampersand itself
*/
// Now points to the char after ampersand, and we need
// to return the ampersand itself
return i;
}
}
Expand Down
121 changes: 73 additions & 48 deletions src/main/java/com/ctc/wstx/sr/StreamScanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -1186,57 +1186,69 @@ protected int resolveSimpleEntity(boolean checkStd)

// Numeric reference?
if (c == '#') {
c = buf[ptr++];
int value = 0;
int pairValue = 0;
int inputLen = mInputEnd;
if (c == 'x') { // hex
while (ptr < inputLen) {
final StringBuffer buffer = new StringBuffer(new String(buf));
cowtowncoder marked this conversation as resolved.
Show resolved Hide resolved

mInputPtr = ptr;
value = resolveCharEnt(buffer, false);
ptr = mInputPtr;
c = buf[ptr - 1];

final boolean isValueHighSurrogate = value >= 0xD800 && value <= 0xDBFF;

/* If resolving entity surrogate pairs enabled and if current entity
* is in range of high surrogate value, try to find surrogate pair
*/
if (isValueHighSurrogate && mConfig.allowsSurrogatePairEntities()
&& c == ';' && ptr + 1 < inputLen) {
c = buf[ptr++];

if (c == '&' && ptr + 1 < inputLen) {
c = buf[ptr++];
if (c == ';') {
break;
}
value = value << 4;
if (c <= '9' && c >= '0') {
value += (c - '0');
} else if (c >= 'a' && c <= 'f') {
value += (10 + (c - 'a'));
} else if (c >= 'A' && c <= 'F') {
value += (10 + (c - 'A'));
} else {
mInputPtr = ptr; // so error points to correct char
throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F).");
}
/* Need to check for overflow; easiest to do right as
* it happens...
*/
if (value > MAX_UNICODE_CHAR) {
reportUnicodeOverflow();
}
}
} else { // numeric (decimal)
while (c != ';') {
if (c <= '9' && c >= '0') {
value = (value * 10) + (c - '0');
// Overflow?
if (value > MAX_UNICODE_CHAR) {
reportUnicodeOverflow();

if (c == '#' && ptr + 1 < inputLen) {
try {
mInputPtr = ptr;
pairValue = resolveCharEnt(buffer, false);
ptr = mInputPtr;
c = buf[ptr -1];
} catch(WstxUnexpectedCharException wuce) {
reportNoSurrogatePair(value);
}
} else {
mInputPtr = ptr; // so error points to correct char
throwUnexpectedChar(c, "; expected a decimal number.");
}
if (ptr >= inputLen) {
break;
reportNoSurrogatePair(value);
}
c = buf[ptr++];
} else {
reportNoSurrogatePair(value);
}
} else if (isValueHighSurrogate
&& mConfig.allowsSurrogatePairEntities()
&& ptr + 1 >= inputLen) {
reportNoSurrogatePair(value);
}

/* We get here either if we got it all, OR if we ran out of
* input in current buffer.
*/
if (c == ';') { // got the full thing
mInputPtr = ptr;
validateChar(value);

if (mConfig.allowsSurrogatePairEntities() && pairValue > 0) {
/*
cowtowncoder marked this conversation as resolved.
Show resolved Hide resolved
* [woodstox-core#165]
* If pair value is not in range of low surrogate values, then throw an error
*/
if (pairValue < 0xDC00 || pairValue > 0xDFFF) {
reportInvalidSurrogatePair(value, pairValue);
}

value = 0x10000 + (value - 0xD800) * 0x400 + (pairValue - 0xDC00);
} else {
validateChar(value);
}

return value;
}

Expand Down Expand Up @@ -1355,7 +1367,7 @@ protected int resolveCharOnlyEntity(boolean checkStd)
// A char reference?
if (c == '#') { // yup
++mInputPtr;
return resolveCharEnt(null);
return resolveCharEnt(null, true);
}

// nope... except may be a pre-def?
Expand Down Expand Up @@ -1523,7 +1535,7 @@ protected int fullyResolveEntity(boolean allowExt)
// Do we have a (numeric) character entity reference?
if (c == '#') { // numeric
final StringBuffer originalSurface = new StringBuffer("#");
int ch = resolveCharEnt(originalSurface);
int ch = resolveCharEnt(originalSurface, true);
if (mCfgTreatCharRefsAsEntities) {
final char[] originalChars = new char[originalSurface.length()];
originalSurface.getChars(0, originalSurface.length(), originalChars, 0);
Expand Down Expand Up @@ -2320,16 +2332,16 @@ protected final void parseUntil(TextBuffer tb, char endChar, boolean convertLFs,
///////////////////////////////////////////////////////////////////////
*/

private int resolveCharEnt(StringBuffer originalCharacters)
private int resolveCharEnt(StringBuffer originalCharacters, boolean validateChar)
throws XMLStreamException
{
int value = 0;
char c = getNextChar(SUFFIX_IN_ENTITY_REF);

if (originalCharacters != null) {
originalCharacters.append(c);
}

if (c == 'x') { // hex
while (true) {
c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
Expand Down Expand Up @@ -2375,7 +2387,9 @@ private int resolveCharEnt(StringBuffer originalCharacters)
}
}
}
validateChar(value);
if (validateChar) {
validateChar(value);
}
return value;
}

Expand All @@ -2386,9 +2400,8 @@ private int resolveCharEnt(StringBuffer originalCharacters)
private final void validateChar(int value)
throws XMLStreamException
{
/* 24-Jan-2006, TSa: Ok, "high" Unicode chars are problematic,
* need to be reported by a surrogate pair..
*/
// 24-Jan-2006, TSa: Ok, "high" Unicode chars are problematic,
// need to be reported by a surrogate pair..
if (value >= 0xD800) {
if (value < 0xE000) { // no surrogates via entity expansion
reportIllegalChar(value);
Expand Down Expand Up @@ -2462,7 +2475,19 @@ private void reportUnicodeOverflow()
/**
 * Reports a character entity whose expansion is an illegal character,
 * by delegating to {@code throwParseError}.
 *
 * @param value code of the offending expansion character; included in the
 *    error message in hexadecimal form
 *
 * @throws XMLStreamException as declared; raised through {@code throwParseError}
 */
private void reportIllegalChar(int value)
    throws XMLStreamException
{
    // Single report only, using the message with a balanced "(code 0x...)" part;
    // the earlier variant lacked the closing parenthesis.
    throwParseError("Illegal character entity: expansion character (code 0x{0})", Integer.toHexString(value), null);
}

/**
 * Reports that no surrogate pair could be found for a high surrogate
 * character, by delegating to {@code throwParseError}.
 *
 * @param highSurrogate code of the high surrogate character; included in
 *    the error message in hexadecimal form
 *
 * @throws XMLStreamException as declared; raised through {@code throwParseError}
 */
private void reportNoSurrogatePair(int highSurrogate)
    throws XMLStreamException
{
    final String highHex = Integer.toHexString(highSurrogate);
    throwParseError("Cannot find surrogate pair: high surrogate character (code 0x{0})", highHex, null);
}

/**
 * Reports an invalid surrogate pair, by delegating to {@code throwParseError}
 * with both character codes rendered in hexadecimal form.
 *
 * @param firstSurrogate code of the first surrogate character
 * @param secondSurrogate code of the second surrogate character
 *
 * @throws XMLStreamException as declared; raised through {@code throwParseError}
 */
private void reportInvalidSurrogatePair(int firstSurrogate, int secondSurrogate)
    throws XMLStreamException
{
    final String firstHex = Integer.toHexString(firstSurrogate);
    final String secondHex = Integer.toHexString(secondSurrogate);
    throwParseError("Invalid surrogate pair: first surrogate character (code 0x{0}), second surrogate character (code 0x{1})", firstHex, secondHex);
}

protected void verifyLimit(String type, long maxValue, long currentValue)
Expand Down
10 changes: 10 additions & 0 deletions src/test/java/org/codehaus/stax/test/BaseStaxTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import javax.xml.stream.*;
import javax.xml.stream.events.XMLEvent;

import com.ctc.wstx.api.WstxInputProperties;

/* Latest updates:
*
* - 07-Sep-2007, TSa: Updating based on latest understanding of
Expand Down Expand Up @@ -275,6 +277,14 @@ protected static boolean setSupportExternalEntities(XMLInputFactory f, boolean s
return false;
}
}

/**
 * Test helper that sets
 * {@link WstxInputProperties#P_ALLOW_SURROGATE_PAIR_ENTITIES} on the given
 * factory and asserts that the factory reports the same value back.
 *
 * @param f input factory to configure
 * @param state value to set for the property
 */
protected static void setResolveEntitySurrogatePairs(XMLInputFactory f, boolean state)
    throws XMLStreamException
{
    final String propName = WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES;
    final Boolean expected = Boolean.valueOf(state);

    f.setProperty(propName, expected);
    // Read the property back to verify the factory accepted it
    assertEquals(expected, f.getProperty(propName));
}

protected static void setResolver(XMLInputFactory f, XMLResolver resolver)
throws XMLStreamException
Expand Down
Loading