Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Conform encoding handling to Encoding spec #48

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions src/nu/validator/htmlparser/extra/ChardetSniffer.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ public Encoding sniff() throws IOException {
detector.Init(this);
detector.DoIt(source, length, false);
detector.DataEnd();
if (returnValue != null && returnValue != Encoding.WINDOWS1252 && returnValue.isAsciiSuperset()) {
if (returnValue != null && returnValue != Encoding.WINDOWS1252
&& returnValue != Encoding.UTF16BE
&& returnValue != Encoding.UTF16LE) {
return returnValue;
} else {
return null;
Expand All @@ -72,10 +74,6 @@ public static void main(String[] args) {
public void Notify(String charsetName) {
try {
Encoding enc = Encoding.forName(charsetName);
Encoding actual = enc.getActualHtmlEncoding();
if (actual != null) {
enc = actual;
}
returnValue = enc;
} catch (UnsupportedCharsetException e) {
returnValue = null;
Expand Down
7 changes: 2 additions & 5 deletions src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,8 @@ public Encoding sniff() throws IOException {
detector.setText(this);
CharsetMatch match = detector.detect();
Encoding enc = Encoding.forName(match.getName());
Encoding actual = enc.getActualHtmlEncoding();
if (actual != null) {
enc = actual;
}
if (enc != Encoding.WINDOWS1252 && enc.isAsciiSuperset()) {
if (enc != Encoding.WINDOWS1252 //
&& enc != Encoding.UTF16BE && enc != Encoding.UTF16LE) {
return enc;
} else {
return null;
Expand Down
171 changes: 90 additions & 81 deletions src/nu/validator/htmlparser/io/Driver.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@
* DEALINGS IN THE SOFTWARE.
*/

/*
* The comments following this one that use the same comment syntax as this
* comment are quotes from the HTML Standard at https://html.spec.whatwg.org/
* as of 10 September 2020. That document came with this statement:
* Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). This work is
* licensed under a Creative Commons Attribution 4.0 International License.
*/

package nu.validator.htmlparser.io;

import java.io.IOException;
Expand Down Expand Up @@ -214,9 +222,8 @@ public void tokenize(InputSource is, int bufferSize)
tokenizer.getErrorHandler(), tokenizer, this, heuristics);
} else {
if (this.characterEncoding != Encoding.UTF8) {
errorWithoutLocation("Legacy encoding \u201C"
+ this.characterEncoding.getCanonName()
+ "\u201D used. Documents must use UTF-8.");
errorWithoutLocation(Encoding.msgLegacyEncoding(
this.characterEncoding.getCanonName()));
}
becomeConfident();
this.reader = new HtmlInputStreamReader(inputStream,
Expand Down Expand Up @@ -350,57 +357,92 @@ public void setEncoding(Encoding encoding, Confidence confidence) {
}
}

/**
 * Reports a tree-builder error when the encoding named by an internal
 * encoding declaration is not the same string as the name of the encoding
 * actually in use.
 *
 * <p>The two names are compared with an exact, case-sensitive
 * {@link String#equals(Object)}; when they match, nothing is reported.
 *
 * @param internalCharset the encoding name given by the internal declaration
 * @param actual the name of the encoding the document is actually using
 * @throws SAXException declared by this method; presumably propagated from
 *         the tokenizer's error reporting (confirm against the tokenizer)
 */
private void errInternalActualDiffer(String internalCharset, String actual)
        throws SAXException {
    // Names agree: the declaration is harmless, so stay silent.
    if (internalCharset.equals(actual)) {
        return;
    }
    String message = "Ignoring internal encoding declaration \u201C"
            + internalCharset + "\u201D, which disagrees with"
            + " the actual encoding of the document (\u201C"
            + actual + "\u201D).";
    tokenizer.errTreeBuilder(message);
}

public boolean internalEncodingDeclaration(String internalCharset)
throws SAXException {
String actual = characterEncoding.getCanonName();
if (confidence == Confidence.CERTAIN) {
errInternalActualDiffer(internalCharset, actual);
return true;
}
/* https://html.spec.whatwg.org/#changing-the-encoding-while-parsing */
try {
internalCharset = Encoding.toAsciiLowerCase(internalCharset);
Encoding cs;
if ("utf-16".equals(internalCharset)
|| "utf-16be".equals(internalCharset)
if ("utf-16be".equals(actual) || "utf-16le".equals(actual)) {
errInternalActualDiffer(internalCharset, actual);
/*
* 1. If the encoding that is already being used to interpret
* the input stream is a UTF-16 encoding, then set the
* confidence to certain and return. The new encoding is ignored.
* (Not spec text: the becomeConfident() call is left commented out here.)
*/
return true;
}
internalCharset = internalCharset.toLowerCase();
Encoding cs = Encoding.forName(internalCharset);
if ("utf-16be".equals(internalCharset)
|| "utf-16le".equals(internalCharset)) {
tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
+ internalCharset
+ "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201Cutf-8\u201D.");
/*
* 2. If the new encoding is a UTF-16 encoding, then change it
* to UTF-8.
*/
tokenizer.errTreeBuilder(
Encoding.msgIgnoredCharset(internalCharset, "utf-8"));
cs = Encoding.UTF8;
internalCharset = "utf-8";
} else {
cs = Encoding.forName(internalCharset);
}
Encoding actual = cs.getActualHtmlEncoding();
if (actual == null) {
actual = cs;
}
if (!actual.isAsciiSuperset()) {
tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
+ internalCharset
+ "\u201D which is not an ASCII superset. Not changing the encoding.");
return false;
} else if ("x-user-defined".equals(internalCharset)) {
/*
* 3. If the new encoding is x-user-defined, then change it to
* windows-1252.
*/
tokenizer.errTreeBuilder(Encoding.msgIgnoredCharset(
"x-user-defined", "windows-1252"));
cs = Encoding.WINDOWS1252;
internalCharset = "windows-1252";
}
if (characterEncoding == null) {
// Reader case
return true;
}
if (characterEncoding == actual) {
if (characterEncoding == cs) {
/*
* 4. If the new encoding is identical or equivalent to the
* encoding that is already being used to interpret the input
* stream, then set the confidence to certain and return.
*/
becomeConfident();
return true;
}
if (confidence == Confidence.CERTAIN && actual != characterEncoding) {
tokenizer.errTreeBuilder("Internal encoding declaration \u201C"
+ internalCharset
+ "\u201D disagrees with the actual encoding of the document (\u201C"
+ characterEncoding.getCanonName() + "\u201D).");
} else {
Encoding newEnc = whineAboutEncodingAndReturnActual(
internalCharset, cs);
tokenizer.errTreeBuilder("Changing character encoding \u201C"
+ internalCharset + "\u201D and reparsing.");
characterEncoding = newEnc;
throw new ReparseException();
}
return true;
/*
* 6. Otherwise, navigate to the document again, with
* historyHandling set to "replace", and using the same source
* browsing context, but this time skip the encoding sniffing
* algorithm and instead just set the encoding to the new encoding
* and the confidence to certain.
*/
Encoding newEnc = whineAboutEncodingAndReturnCanonical(
internalCharset, cs);
tokenizer.errTreeBuilder("Changing character encoding to \u201C"
+ internalCharset + "\u201D and reparsing.");
characterEncoding = newEnc;
// Note: We intentionally don’t call becomeConfident() at this
// point. If we did, it would end up causing the exception
// java.lang.IllegalStateException: rewind() after willNotRewind()
// to be thrown later. So we are departing here from strictly
// following the ordering in the corresponding spec language, which
// specifies setting the confidence to "certain" at this point.
throw new ReparseException();
} catch (UnsupportedCharsetException e) {
tokenizer.errTreeBuilder("Internal encoding declaration named an unsupported chararacter encoding \u201C"
+ internalCharset + "\u201D.");
tokenizer.errTreeBuilder(
Encoding.msgBadInternalCharset(internalCharset));
return false;
}
}
Expand Down Expand Up @@ -451,17 +493,16 @@ protected Encoding encodingFromExternalDeclaration(String encoding)
if (encoding == null) {
return null;
}
encoding = Encoding.toAsciiLowerCase(encoding);
encoding = encoding.toLowerCase();
try {
Encoding cs = Encoding.forName(encoding);
if ("utf-16".equals(cs.getCanonName())
|| "utf-32".equals(cs.getCanonName())) {
if ("utf-16be".equals(cs.getCanonName())
|| "utf-16le".equals(cs.getCanonName())) {
swallowBom = false;
}
return whineAboutEncodingAndReturnActual(encoding, cs);
return whineAboutEncodingAndReturnCanonical(encoding, cs);
} catch (UnsupportedCharsetException e) {
tokenizer.err("Unsupported character encoding name: \u201C" + encoding
+ "\u201D. Will sniff.");
tokenizer.err(Encoding.msgBadEncoding(encoding) + " Will sniff.");
swallowBom = true;
}
return null; // keep the compiler happy
Expand All @@ -473,45 +514,13 @@ protected Encoding encodingFromExternalDeclaration(String encoding)
* @return
* @throws SAXException
*/
protected Encoding whineAboutEncodingAndReturnActual(String encoding,
protected Encoding whineAboutEncodingAndReturnCanonical(String encoding,
Encoding cs) throws SAXException {
String canonName = cs.getCanonName();
if (!cs.isRegistered()) {
if (encoding.startsWith("x-")) {
tokenizer.err("The encoding \u201C"
+ encoding
+ "\u201D is not an IANA-registered encoding. (Charmod C022)");
} else {
tokenizer.err("The encoding \u201C"
+ encoding
+ "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)");
}
} else if (!canonName.equals(encoding)) {
tokenizer.err("The encoding \u201C"
+ encoding
+ "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
+ canonName + "\u201D. (Charmod C024)");
}
if (cs.isShouldNot()) {
tokenizer.warn("Authors should not use the character encoding \u201C"
+ encoding
+ "\u201D. It is recommended to use \u201CUTF-8\u201D.");
} else if (cs.isLikelyEbcdic()) {
tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D.");
} else if (cs.isObscure()) {
tokenizer.warn("The character encoding \u201C"
+ encoding
+ "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
}
Encoding actual = cs.getActualHtmlEncoding();
if (actual == null) {
return cs;
} else {
tokenizer.warn("Using \u201C" + actual.getCanonName()
+ "\u201D instead of the declared encoding \u201C"
+ encoding + "\u201D.");
return actual;
if (!canonName.equals(encoding)) {
tokenizer.err(Encoding.msgNotCanonicalName(encoding, canonName));
}
return cs;
}

private class ReparseException extends SAXException {
Expand Down
Loading