validator · sideshowbarker · Sep 12, 2020 · Sep 12, 2020 · Sep 12, 2020 · Sep 12, 2020
diff --git a/src/nu/validator/htmlparser/extra/ChardetSniffer.java b/src/nu/validator/htmlparser/extra/ChardetSniffer.java
@@ -54,7 +54,9 @@ public Encoding sniff() throws IOException {
         detector.Init(this);
         detector.DoIt(source, length, false);
         detector.DataEnd();
-        if (returnValue != null && returnValue != Encoding.WINDOWS1252 && returnValue.isAsciiSuperset()) {
+        if (returnValue != null && returnValue != Encoding.WINDOWS1252
+                && returnValue != Encoding.UTF16BE
+                && returnValue != Encoding.UTF16LE) {
             return returnValue;
         } else {
             return null;
@@ -72,10 +74,6 @@ public static void main(String[] args) {
     public void Notify(String charsetName) {
         try {
             Encoding enc = Encoding.forName(charsetName);
-            Encoding actual = enc.getActualHtmlEncoding();
-            if (actual != null) {
-                enc = actual;
-            }
             returnValue = enc;
         } catch (UnsupportedCharsetException e) {
             returnValue = null;

diff --git a/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java b/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java
@@ -53,11 +53,8 @@ public Encoding sniff() throws IOException {
             detector.setText(this);
             CharsetMatch match = detector.detect();
             Encoding enc = Encoding.forName(match.getName());
-            Encoding actual = enc.getActualHtmlEncoding();
-            if (actual != null) {
-                enc = actual;
-            }
-            if (enc != Encoding.WINDOWS1252 && enc.isAsciiSuperset()) {
+            if (enc != Encoding.WINDOWS1252 //
+                    && enc != Encoding.UTF16BE && enc != Encoding.UTF16LE) {
                 return enc;
             } else {
                 return null;

diff --git a/src/nu/validator/htmlparser/io/Driver.java b/src/nu/validator/htmlparser/io/Driver.java
@@ -21,6 +21,14 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
+/*
+ * The comments following this one that use the same comment syntax as this
+ * comment are quotes from the HTML Standard at https://html.spec.whatwg.org/
+ * as of 10 September 2020. That document came with this statement:
+ * Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). This work is
+ * licensed under a Creative Commons Attribution 4.0 International License.
+ */
+
 package nu.validator.htmlparser.io;
 
 import java.io.IOException;
@@ -214,9 +222,8 @@ public void tokenize(InputSource is, int bufferSize)
                         tokenizer.getErrorHandler(), tokenizer, this, heuristics);
             } else {
                 if (this.characterEncoding != Encoding.UTF8) {
-                    errorWithoutLocation("Legacy encoding \u201C"
-                            + this.characterEncoding.getCanonName()
-                            + "\u201D used. Documents must use UTF-8.");
+                    errorWithoutLocation(Encoding.msgLegacyEncoding(
+                            this.characterEncoding.getCanonName()));
                 }
                 becomeConfident();
                 this.reader = new HtmlInputStreamReader(inputStream,
@@ -350,57 +357,92 @@ public void setEncoding(Encoding encoding, Confidence confidence) {
         }
     }
 
+    private void errInternalActualDiffer(String internalCharset, String actual)
+            throws SAXException {
+        if (!internalCharset.equals(actual)) {
+            tokenizer.errTreeBuilder(
+                    "Ignoring internal encoding declaration \u201C"
+                            + internalCharset + "\u201D, which disagrees with"
+                            + " the actual encoding of the document (\u201C"
+                            + actual + "\u201D).");
+        }
+    }
+
     public boolean internalEncodingDeclaration(String internalCharset)
             throws SAXException {
+        String actual = characterEncoding.getCanonName();
+        if (confidence == Confidence.CERTAIN) {
+            errInternalActualDiffer(internalCharset, actual);
+            return true;
+        }
+        /* https://html.spec.whatwg.org/#changing-the-encoding-while-parsing */
         try {
-            internalCharset = Encoding.toAsciiLowerCase(internalCharset);
-            Encoding cs;
-            if ("utf-16".equals(internalCharset)
-                    || "utf-16be".equals(internalCharset)
+            if ("utf-16be".equals(actual) || "utf-16le".equals(actual)) {
+                errInternalActualDiffer(internalCharset, actual);
+                /*
+                 * 1. If the encoding that is already being used to interpret
+                 * the input stream is a UTF-16 encoding, then set the
+                 * confidence to certain and return. The new encoding is ignored
+                 * becomeConfident();
+                 */
+                return true;
+            }
+            internalCharset = internalCharset.toLowerCase();
+            Encoding cs = Encoding.forName(internalCharset);
+            if ("utf-16be".equals(internalCharset)
                     || "utf-16le".equals(internalCharset)) {
-                tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
-                        + internalCharset
-                        + "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201Cutf-8\u201D.");
+                /*
+                 * 2. If the new encoding is a UTF-16 encoding, then change it
+                 * to UTF-8.
+                 */
+                tokenizer.errTreeBuilder(
+                        Encoding.msgIgnoredCharset(internalCharset, "utf-8"));
                 cs = Encoding.UTF8;
                 internalCharset = "utf-8";
-            } else {
-                cs = Encoding.forName(internalCharset);
-            }
-            Encoding actual = cs.getActualHtmlEncoding();
-            if (actual == null) {
-                actual = cs;
-            }
-            if (!actual.isAsciiSuperset()) {
-                tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
-                        + internalCharset
-                        + "\u201D which is not an ASCII superset. Not changing the encoding.");
-                return false;
+            } else if ("x-user-defined".equals(internalCharset)) {
+                /*
+                 * 3. If the new encoding is x-user-defined, then change it to
+                 * windows-1252.
+                 */
+                tokenizer.errTreeBuilder(Encoding.msgIgnoredCharset(
+                        "x-user-defined", "windows-1252"));
+                cs = Encoding.WINDOWS1252;
+                internalCharset = "windows-1252";
             }
             if (characterEncoding == null) {
                 // Reader case
                 return true;
             }
-            if (characterEncoding == actual) {
+            if (characterEncoding == cs) {
+                /*
+                 * 4. If the new encoding is identical or equivalent to the
+                 * encoding that is already being used to interpret the input
+                 * stream, then set the confidence to certain and return.
+                 */
                 becomeConfident();
                 return true;
             }
-            if (confidence == Confidence.CERTAIN && actual != characterEncoding) {
-                tokenizer.errTreeBuilder("Internal encoding declaration \u201C"
-                        + internalCharset
-                        + "\u201D disagrees with the actual encoding of the document (\u201C"
-                        + characterEncoding.getCanonName() + "\u201D).");
-            } else {
-                Encoding newEnc = whineAboutEncodingAndReturnActual(
-                        internalCharset, cs);
-                tokenizer.errTreeBuilder("Changing character encoding \u201C"
-                        + internalCharset + "\u201D and reparsing.");
-                characterEncoding = newEnc;
-                throw new ReparseException();
-            }
-            return true;
+            /*
+             * 6. Otherwise, navigate to the document again, with
+             * historyHandling set to "replace", and using the same source
+             * browsing context, but this time skip the encoding sniffing
+             * algorithm and instead just set the encoding to the new encoding
+             */
+            Encoding newEnc = whineAboutEncodingAndReturnCanonical(
+                    internalCharset, cs);
+            tokenizer.errTreeBuilder("Changing character encoding to \u201C"
+                    + internalCharset + "\u201D and reparsing.");
+            characterEncoding = newEnc;
+            // Note: We intentionally don’t call becomeConfident() at this
+            // point. If we did, it would end up causing the exception
+            // java.lang.IllegalStateException: rewind() after willNotRewind()
+            // to be thrown later. So we are departing here from strictly
+            // following the ordering in the corresponding spec language, which
+            // specifies setting the confidence to "certain" at this point.
+            throw new ReparseException();
         } catch (UnsupportedCharsetException e) {
-            tokenizer.errTreeBuilder("Internal encoding declaration named an unsupported chararacter encoding \u201C"
-                    + internalCharset + "\u201D.");
+            tokenizer.errTreeBuilder(
+                    Encoding.msgBadInternalCharset(internalCharset));
             return false;
         }
     }
@@ -451,17 +493,16 @@ protected Encoding encodingFromExternalDeclaration(String encoding)
         if (encoding == null) {
             return null;
         }
-        encoding = Encoding.toAsciiLowerCase(encoding);
+        encoding = encoding.toLowerCase();
         try {
             Encoding cs = Encoding.forName(encoding);
-            if ("utf-16".equals(cs.getCanonName())
-                    || "utf-32".equals(cs.getCanonName())) {
+            if ("utf-16be".equals(cs.getCanonName())
+                    || "utf-16le".equals(cs.getCanonName())) {
                 swallowBom = false;
             }
-            return whineAboutEncodingAndReturnActual(encoding, cs);
+            return whineAboutEncodingAndReturnCanonical(encoding, cs);
         } catch (UnsupportedCharsetException e) {
-            tokenizer.err("Unsupported character encoding name: \u201C" + encoding
-                    + "\u201D. Will sniff.");
+            tokenizer.err(Encoding.msgBadEncoding(encoding) + " Will sniff.");
             swallowBom = true;
         }
         return null; // keep the compiler happy
@@ -473,45 +514,13 @@ protected Encoding encodingFromExternalDeclaration(String encoding)
      * @return
      * @throws SAXException
      */
-    protected Encoding whineAboutEncodingAndReturnActual(String encoding,
+    protected Encoding whineAboutEncodingAndReturnCanonical(String encoding,
             Encoding cs) throws SAXException {
         String canonName = cs.getCanonName();
-        if (!cs.isRegistered()) {
-            if (encoding.startsWith("x-")) {
-                tokenizer.err("The encoding \u201C"
-                        + encoding
-                        + "\u201D is not an IANA-registered encoding. (Charmod C022)");
-            } else {
-                tokenizer.err("The encoding \u201C"
-                        + encoding
-                        + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)");
-            }
-        } else if (!canonName.equals(encoding)) {
-            tokenizer.err("The encoding \u201C"
-                    + encoding
-                    + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
-                    + canonName + "\u201D. (Charmod C024)");
-        }
-        if (cs.isShouldNot()) {
-            tokenizer.warn("Authors should not use the character encoding \u201C"
-                    + encoding
-                    + "\u201D. It is recommended to use \u201CUTF-8\u201D.");
-        } else if (cs.isLikelyEbcdic()) {
-            tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D.");
-        } else if (cs.isObscure()) {
-            tokenizer.warn("The character encoding \u201C"
-                    + encoding
-                    + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
-        }
-        Encoding actual = cs.getActualHtmlEncoding();
-        if (actual == null) {
-            return cs;
-        } else {
-            tokenizer.warn("Using \u201C" + actual.getCanonName()
-                    + "\u201D instead of the declared encoding \u201C"
-                    + encoding + "\u201D.");
-            return actual;
+        if (!canonName.equals(encoding)) {
+            tokenizer.err(Encoding.msgNotCanonicalName(encoding, canonName));
         }
+        return cs;
     }
 
     private class ReparseException extends SAXException {