mozilla · inhumantsar · May 20, 2024 · Jun 11, 2024 · Jun 11, 2024 · Jun 11, 2024
diff --git a/.eslintrc.js b/.eslintrc.js
@@ -3,7 +3,7 @@
 
 module.exports = {
   "parserOptions": {
-    "ecmaVersion": 6,
+    "ecmaVersion": 2017,
   },
   "env": {
     "es6": true,

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,7 @@ reasonable).
 - [Add Parsely tags as a fallback metadata source](https://github.com/mozilla/readability/pull/865)
 - [Fix the case that jsonld parse process is ignored when context url include the trailing slash](https://github.com/mozilla/readability/pull/833)
 - [Fixed situations where short paragraphs of legitimate content would be excluded](https://github.com/mozilla/readability/pull/867)
+- [Add `citation`, `prism`, and more `dc` metadata](https://github.com/mozilla/readability/pull/871)
 
 ## [0.5.0] - 2023-12-15
 

diff --git a/Readability.js b/Readability.js
@@ -495,7 +495,7 @@ Readability.prototype = {
       // could assume it's the full title.
       var headings = this._concatNodeLists(
         doc.getElementsByTagName("h1"),
-        doc.getElementsByTagName("h2")
+        doc.getElementsByTagName("h2"),
       );
       var trimmedTitle = curTitle.trim();
       var match = this._someNode(headings, function(heading) {
@@ -1367,6 +1367,58 @@ Readability.prototype = {
     });
   },
 
+  /**
+   * Maps a parsed JSON-LD object onto the metadata fields Readability uses.
+   * A field is only set when its source value is present and is a string.
+   *
+   * @param {Object} parsed JSON-LD object to read from
+   * @returns {Object} any of title, byline, excerpt, siteName, datePublished
+   */
+  _extractJSONLDMetadata: function (parsed) {
+    var isStr = (value) => typeof value === "string";
+    var found = {};
+    var hasName = isStr(parsed.name);
+    var hasHeadline = isStr(parsed.headline);
+
+    if (hasName && hasHeadline && parsed.name !== parsed.headline) {
+      // "name" and "headline" should agree, but some websites like aktualne.cz
+      // put their own name into "name" and the article title into "headline".
+      // Take "headline" only when it alone closely matches the html title;
+      // in every other case fall back to "name".
+      var htmlTitle = this._getArticleTitle();
+      var nameIsClose = this._textSimilarity(parsed.name, htmlTitle) > 0.75;
+      var headlineIsClose = this._textSimilarity(parsed.headline, htmlTitle) > 0.75;
+      found.title = headlineIsClose && !nameIsClose ? parsed.headline : parsed.name;
+    } else if (hasName) {
+      found.title = parsed.name.trim();
+    } else if (hasHeadline) {
+      found.title = parsed.headline.trim();
+    }
+
+    var author = parsed.author;
+    if (author && isStr(author.name)) {
+      found.byline = author.name.trim();
+    } else if (author && Array.isArray(author) && author[0] && isStr(author[0].name)) {
+      // several authors: join the names of every entry that has a string name
+      found.byline = author
+        .filter((entry) => entry && isStr(entry.name))
+        .map((entry) => entry.name.trim())
+        .join(", ");
+    }
+
+    if (isStr(parsed.description)) {
+      found.excerpt = parsed.description.trim();
+    }
+    if (parsed.publisher && isStr(parsed.publisher.name)) {
+      found.siteName = parsed.publisher.name.trim();
+    }
+    if (isStr(parsed.datePublished)) {
+      found.datePublished = parsed.datePublished.trim();
+    }
+
+    return found;
+  },
+
   /**
    * Try to extract metadata from JSON-LD object.
    * For now, only Schema.org objects of type Article or its subtypes are supported.
@@ -1383,6 +1435,7 @@ Readability.prototype = {
           // Strip CDATA markers if present
           var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, "");
           var parsed = JSON.parse(content);
+
           if (
             !parsed["@context"] ||
             !parsed["@context"].match(/^https?\:\/\/schema\.org\/?$/)
@@ -1393,7 +1446,7 @@ Readability.prototype = {
           if (!parsed["@type"] && Array.isArray(parsed["@graph"])) {
             parsed = parsed["@graph"].find(function(it) {
               return (it["@type"] || "").match(
-                this.REGEXPS.jsonLdArticleTypes
+                this.REGEXPS.jsonLdArticleTypes,
               );
             });
           }
@@ -1406,54 +1459,15 @@ Readability.prototype = {
             return;
           }
 
-          metadata = {};
+          metadata = this._extractJSONLDMetadata(parsed);
 
-          if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) {
-            // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
-            // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
-            // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
-
-            var title = this._getArticleTitle();
-            var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
-            var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75;
-
-            if (headlineMatches && !nameMatches) {
-              metadata.title = parsed.headline;
-            } else {
-              metadata.title = parsed.name;
-            }
-          } else if (typeof parsed.name === "string") {
-            metadata.title = parsed.name.trim();
-          } else if (typeof parsed.headline === "string") {
-            metadata.title = parsed.headline.trim();
-          }
-          if (parsed.author) {
-            if (typeof parsed.author.name === "string") {
-              metadata.byline = parsed.author.name.trim();
-            } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") {
-              metadata.byline = parsed.author
-                .filter(function(author) {
-                  return author && typeof author.name === "string";
-                })
-                .map(function(author) {
-                  return author.name.trim();
-                })
-                .join(", ");
-            }
-          }
-          if (typeof parsed.description === "string") {
-            metadata.excerpt = parsed.description.trim();
-          }
-          if (
-            parsed.publisher &&
-            typeof parsed.publisher.name === "string"
-          ) {
-            metadata.siteName = parsed.publisher.name.trim();
+          // some sites, like ones for academic journals, separate metadata for a journal article or paper from the
+          // site's own metadata. eg: nature has only @context, @type (WebPage), and mainEntity so *all* relevant metadata
+          // would be invisible unless we retry using mainEntity.
+          if (parsed["mainEntity"] && Object.keys(metadata).length === 0) {
+            metadata = this._extractJSONLDMetadata(parsed["mainEntity"]);
           }
-          if (typeof parsed.datePublished === "string") {
-            metadata.datePublished = parsed.datePublished.trim();
-          }
-          return;
+
         } catch (err) {
           this.log(err.message);
         }
@@ -1462,6 +1476,29 @@ Readability.prototype = {
     return metadata ? metadata : {};
   },
 
+  /**
+   * Cleans up a byline: drops the first "word:" prefix (with up to two
+   * trailing slashes) and runs the result through _unescapeHtmlEntities.
+   *
+   * @param {string|string[]} name one byline, or a list cleaned item by item
+   * @returns {string|string[]} cleaned value(s); falsy input is returned as is
+   */
+  _normalizeByline: function(name) {
+    if (!name) {
+      return name;
+    }
+
+    if (Array.isArray(name)) {
+      return name.map((n) => this._normalizeByline(n));
+    }
+
+    // remove things like "By:" and "http://"; only the first match is removed,
+    // and it may sit mid-string, eg: "Jane (http://a.b)" becomes "Jane (a.b)"
+    var stripped = name.replace(/\w+:\/{0,2}/, "");
+
+    return this._unescapeHtmlEntities(stripped);
+  },
+
   /**
    * Attempts to get excerpt and byline metadata for the article.
    *
@@ -1479,7 +1516,12 @@ Readability.prototype = {
     var propertyPattern = /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;
 
     // name is a single value
-    var namePattern = /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i;
+    var namePattern = /^\s*(?:(prism|citation|dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-_\.:]\s*)?(author|creator|pub-date|publicationDate|publication|description|title|site_name)\s*$/i;
+
+    // fields which are permitted to have multiple distinct values, eg: byline
+    var byline_properties = [ "dc:creator", "dcterm:creator", "author", "parsely-author", "citation_author"];
+    var multi_props = byline_properties; // concat others here. somewhat pointless atm, but there will be more...
+
 
     // Find description tags.
     this._forEachNode(metaElements, function(element) {
@@ -1491,6 +1533,7 @@ Readability.prototype = {
       }
       var matches = null;
       var name = null;
+      var result = null;
 
       if (elementProperty) {
         matches = elementProperty.match(propertyPattern);
@@ -1499,7 +1542,7 @@ Readability.prototype = {
           // so we can match below.
           name = matches[0].toLowerCase().replace(/\s/g, "");
           // multiple authors
-          values[name] = content.trim();
+          result = content.trim();
         }
       }
       if (!matches && elementName && namePattern.test(elementName)) {
@@ -1508,8 +1551,24 @@ Readability.prototype = {
           // Convert to lowercase, remove any whitespace, and convert dots
           // to colons so we can match below.
           name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
-          values[name] = content.trim();
+          result = content.trim();
+        }
+      }
+
+      if (result) {
+        // handle properties which might have multiple distinct values
+        if (values[name] && multi_props.includes(name)) {
+          if (Array.isArray(values[name]) && typeof result == "string") {
+            values[name].push(result);
+          }
+          if (typeof values[name] == "string" && values[name] !== result) {
+            values[name] = [values[name], result];
+          }
+        } else {
+          values[name] = result;
         }
+
+        this.log(`found metadata: ${name}=${values[name]}`);
       }
     });
 
@@ -1529,11 +1588,12 @@ Readability.prototype = {
     }
 
     // get author
-    metadata.byline = jsonld.byline ||
-                      values["dc:creator"] ||
-                      values["dcterm:creator"] ||
-                      values["author"] ||
-                      values["parsely-author"];
+    metadata.byline = jsonld.byline;
+    for (const n of byline_properties) {
+      if (metadata.byline)
+        break;
+      metadata.byline = values[n];
+    }
 
     // get description
     metadata.excerpt = jsonld.excerpt ||
@@ -1553,15 +1613,18 @@ Readability.prototype = {
     metadata.publishedTime = jsonld.datePublished ||
                              values["article:published_time"] ||
                              values["parsely-pub-date"] ||
+                             values["citation_publication_date"] ||
+                             values["prism:publicationDate"] ||
                              null;
 
     // in many sites the meta value is escaped with HTML entities,
     // so here we need to unescape it
     metadata.title = this._unescapeHtmlEntities(metadata.title);
-    metadata.byline = this._unescapeHtmlEntities(metadata.byline);
+    metadata.byline = this._normalizeByline(metadata.byline);
     metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
     metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
     metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
+    this.log(`getArticleMetadata complete: ${JSON.stringify(metadata)}`);
 
     return metadata;
   },

diff --git a/test/generate-testcase.js b/test/generate-testcase.js
@@ -1,12 +1,13 @@
 /* eslint-env node, mocha */
 
-var debug = false;
+var debug = true;
 
 var path = require("path");
 var fs = require("fs");
 var JSDOM = require("jsdom").JSDOM;
 var prettyPrint = require("./utils").prettyPrint;
 var http = require("http");
+var https = require("https");
 var urlparse = require("url").parse;
 var htmltidy = require("htmltidy2").tidy;
 
@@ -49,38 +50,66 @@ function generateTestcase(slug) {
   });
 }
 
-function fetchSource(url, callbackFn) {
-  if (!url) {
-    console.error("You should pass a URL if the source doesn't exist yet!");
-    process.exit(1);
-    return;
-  }
-  var client = http;
-  if (url.indexOf("https") == 0) {
-    client = require("https");
-  }
+/**
+ * GETs `url` and hands the response body to `cb`, following HTTP redirects.
+ * A redirect's Location may be relative; it is resolved against `url`.
+ *
+ * @param {string} url absolute http(s) URL to fetch
+ * @param {function(string)} cb called once, with the final response body
+ */
+function getWithRedirects(url, cb) {
+  var client = (url.indexOf("https") == 0) ? https : http;
+
   var options = urlparse(url);
   options.headers = {"User-Agent": FFX_UA};
 
-  client.get(options, function(response) {
+  client.get(options, (response) => {
     if (debug) {
       console.log("STATUS:", response.statusCode);
       console.log("HEADERS:", JSON.stringify(response.headers));
     }
+
+    var location = response.headers.location;
+    if (location && [301, 302, 303, 307, 308].includes(response.statusCode)) {
+      var target = new URL(location, url).href;
+      if (debug)
+        console.log("following redirect", target);
+      // Drain the redirect's own body and stop here: falling through would
+      // read that body as well and invoke cb a second time.
+      response.resume();
+      getWithRedirects(target, cb);
+      return;
+    }
+
     response.setEncoding("utf-8");
     var rv = "";
-    response.on("data", function(chunk) {
-      rv += chunk;
-    });
-    response.on("end", function() {
-      if (debug) {
+
+    response.on("data", (chunk) => rv += chunk);
+
+    response.on("end", () => {
+      if (debug)
         console.log("End received");
-      }
-      sanitizeSource(rv, callbackFn);
+      cb(rv);
     });
   });
 }
 
+/**
+ * Fetches `url` and passes the body through sanitizeSource before it reaches
+ * callbackFn. Exits the process with status 1 when no URL is given.
+ *
+ * @param {string} url
+ * @param {function} callbackFn forwarded to sanitizeSource
+ */
+function fetchSource(url, callbackFn) {
+  if (!url) {
+    console.error("You should pass a URL if the source doesn't exist yet!");
+    process.exit(1);
+  }
+
+  getWithRedirects(url, (rv) => sanitizeSource(rv, callbackFn));
+}
+
 function sanitizeSource(html, callbackFn) {
   htmltidy(new JSDOM(html).serialize(), {
     "indent": true,

diff --git a/test/test-pages/003-metadata-preferred/expected-metadata.json b/test/test-pages/003-metadata-preferred/expected-metadata.json
@@ -1,6 +1,6 @@
 {
   "title": "Dublin Core property title",
-  "byline": "Dublin Core property author",
+  "byline": "Dublin Core author",
   "dir": null,
   "excerpt": "Dublin Core property description",
   "siteName": null,

diff --git a/test/test-pages/003-metadata-preferred/source.html b/test/test-pages/003-metadata-preferred/source.html
@@ -11,9 +11,11 @@
     <meta property="twitter:title" content="Twitter property title"/>
     <meta property="og:title" content="Open Graph property title"/>
     <meta name="author" content="Meta name author"/>
-    <meta name="DC.creator" content="Dublin Core name author"/>
-    <meta property="dc:creator" content="Dublin Core property author"/>
-     <meta name="description" content="Meta name description"/>
+    <!-- now that multiple authors are supported, these have to be identical to prevent them from showing up
+         as two separate authors -->
+    <meta name="DC.creator" content="Dublin Core author"/>
+    <meta property="dc:creator" content="Dublin Core author"/>
+    <meta name="description" content="Meta name description"/>
     <meta name="og:description" content="Open Graph name description"/>
     <meta name="twitter:description" content="Twitter name description"/>
     <meta name="DC.description" content="Dublin Core name description"/>

diff --git a/test/test-pages/ebb-org/expected-metadata.json b/test/test-pages/ebb-org/expected-metadata.json
@@ -1,6 +1,6 @@
 {
   "title": "On Recent Controversial Events - Bradley M. Kuhn ( Brad ) ( bkuhn )",
-  "byline": "Bradley M. Kuhn (http://ebb.org/bkuhn/)",
+  "byline": "Bradley M. Kuhn (ebb.org/bkuhn/)",
   "dir": null,
   "lang": "en-US",
   "excerpt": "The website of Bradley M. Kuhn, aka Brad, aka bkuhn. This site includes his GPG keys, resume, blog, projects list, software, interviews, speeches and writing.",