Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add citation, prism, and dc metadata #871

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .eslintrc.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

module.exports = {
"parserOptions": {
"ecmaVersion": 6,
"ecmaVersion": 2017,
},
"env": {
"es6": true,
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ reasonable).
- [Add Parsely tags as a fallback metadata source](https://github.com/mozilla/readability/pull/865)
- [Fix JSON-LD parsing being skipped when the context URL includes a trailing slash](https://github.com/mozilla/readability/pull/833)
- [Fixed situations where short paragraphs of legitimate content would be excluded](https://github.com/mozilla/readability/pull/867)
- [Add `citation`, `prism`, and more `dc` metadata](https://github.com/mozilla/readability/pull/871)

## [0.5.0] - 2023-12-15

Expand Down
177 changes: 120 additions & 57 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,7 @@ Readability.prototype = {
// could assume it's the full title.
var headings = this._concatNodeLists(
doc.getElementsByTagName("h1"),
doc.getElementsByTagName("h2")
doc.getElementsByTagName("h2"),
);
var trimmedTitle = curTitle.trim();
var match = this._someNode(headings, function(heading) {
Expand Down Expand Up @@ -1367,6 +1367,58 @@ Readability.prototype = {
});
},

_extractJSONLDMetadata: function (parsed) {
var metadata = {};

if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) {
// we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
// put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
// "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.

var title = this._getArticleTitle();
var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75;

if (headlineMatches && !nameMatches) {
metadata.title = parsed.headline;
} else {
metadata.title = parsed.name;
}
} else if (typeof parsed.name === "string") {
metadata.title = parsed.name.trim();
} else if (typeof parsed.headline === "string") {
metadata.title = parsed.headline.trim();
}
if (parsed.author) {
if (typeof parsed.author.name === "string") {
metadata.byline = parsed.author.name.trim();
} else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") {
metadata.byline = parsed.author
.filter(function(author) {
return author && typeof author.name === "string";
})
.map(function(author) {
return author.name.trim();
})
.join(", ");
}
}
if (typeof parsed.description === "string") {
metadata.excerpt = parsed.description.trim();
}
if (
parsed.publisher &&
typeof parsed.publisher.name === "string"
) {
metadata.siteName = parsed.publisher.name.trim();
}
if (typeof parsed.datePublished === "string") {
metadata.datePublished = parsed.datePublished.trim();
}

return metadata;
},

/**
* Try to extract metadata from JSON-LD object.
* For now, only Schema.org objects of type Article or its subtypes are supported.
Expand All @@ -1383,6 +1435,7 @@ Readability.prototype = {
// Strip CDATA markers if present
var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, "");
var parsed = JSON.parse(content);

if (
!parsed["@context"] ||
!parsed["@context"].match(/^https?\:\/\/schema\.org\/?$/)
Expand All @@ -1393,7 +1446,7 @@ Readability.prototype = {
if (!parsed["@type"] && Array.isArray(parsed["@graph"])) {
parsed = parsed["@graph"].find(function(it) {
return (it["@type"] || "").match(
this.REGEXPS.jsonLdArticleTypes
this.REGEXPS.jsonLdArticleTypes,
);
});
}
Expand All @@ -1406,54 +1459,15 @@ Readability.prototype = {
return;
}

metadata = {};
metadata = this._extractJSONLDMetadata(parsed);

if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) {
// we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
// put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
// "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.

var title = this._getArticleTitle();
var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75;

if (headlineMatches && !nameMatches) {
metadata.title = parsed.headline;
} else {
metadata.title = parsed.name;
}
} else if (typeof parsed.name === "string") {
metadata.title = parsed.name.trim();
} else if (typeof parsed.headline === "string") {
metadata.title = parsed.headline.trim();
}
if (parsed.author) {
if (typeof parsed.author.name === "string") {
metadata.byline = parsed.author.name.trim();
} else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") {
metadata.byline = parsed.author
.filter(function(author) {
return author && typeof author.name === "string";
})
.map(function(author) {
return author.name.trim();
})
.join(", ");
}
}
if (typeof parsed.description === "string") {
metadata.excerpt = parsed.description.trim();
}
if (
parsed.publisher &&
typeof parsed.publisher.name === "string"
) {
metadata.siteName = parsed.publisher.name.trim();
// some sites, like ones for academic journals, separate metadata for a journal article or paper from the
// site's own metadata. eg: nature has only @context, @type (WebPage), and mainEntity so *all* relevant metadata
// would be invisible unless we retry using mainEntity.
if (parsed["mainEntity"] && Object.keys(metadata).length === 0) {
metadata = this._extractJSONLDMetadata(parsed["mainEntity"]);
}
if (typeof parsed.datePublished === "string") {
metadata.datePublished = parsed.datePublished.trim();
}
return;

} catch (err) {
this.log(err.message);
}
Expand All @@ -1462,6 +1476,29 @@ Readability.prototype = {
return metadata ? metadata : {};
},

/**
* Swaps the "Surname, GivenName" formatted bylines to "GivenName Surname".
*
* @param {string|string[]} name
* @returns Name or names in "GivenName Surname" format
*/
_normalizeByline: function(name) {
if (!name) {
return name;
}

var result = name;

if (Array.isArray(name)) {
return name.map((n) => this._normalizeByline(n));
}

// remove things like "By:" and "http://"
result = result.replace(/\w+:\/{0,2}/, "");

return this._unescapeHtmlEntities(result);
},

/**
* Attempts to get excerpt and byline metadata for the article.
*
Expand All @@ -1479,7 +1516,12 @@ Readability.prototype = {
var propertyPattern = /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;

// name is a single value
var namePattern = /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i;
var namePattern = /^\s*(?:(prism|citation|dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-_\.:]\s*)?(author|creator|pub-date|publicationDate|publication|description|title|site_name)\s*$/i;

// fields which are permitted to have multiple distinct values, eg: byline
var byline_properties = [ "dc:creator", "dcterm:creator", "author", "parsely-author", "citation_author"];
var multi_props = byline_properties; // concat others here. somewhat pointless atm, but there will be more...


// Find description tags.
this._forEachNode(metaElements, function(element) {
Expand All @@ -1491,6 +1533,7 @@ Readability.prototype = {
}
var matches = null;
var name = null;
var result = null;

if (elementProperty) {
matches = elementProperty.match(propertyPattern);
Expand All @@ -1499,7 +1542,7 @@ Readability.prototype = {
// so we can match below.
name = matches[0].toLowerCase().replace(/\s/g, "");
// multiple authors
values[name] = content.trim();
result = content.trim();
}
}
if (!matches && elementName && namePattern.test(elementName)) {
Expand All @@ -1508,8 +1551,24 @@ Readability.prototype = {
// Convert to lowercase, remove any whitespace, and convert dots
// to colons so we can match below.
name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
values[name] = content.trim();
result = content.trim();
}
}

if (result) {
// handle properties which might have multiple distinct values
if (values[name] && multi_props.includes(name)) {
if (Array.isArray(values[name]) && typeof result == "string") {
values[name].push(result);
}
if (typeof values[name] == "string" && values[name] !== result) {
values[name] = [values[name], result];
}
} else {
values[name] = result;
}

this.log(`found metadata: ${name}=${values[name]}`);
}
});

Expand All @@ -1529,11 +1588,12 @@ Readability.prototype = {
}

// get author
metadata.byline = jsonld.byline ||
values["dc:creator"] ||
values["dcterm:creator"] ||
values["author"] ||
values["parsely-author"];
metadata.byline = jsonld.byline;
for (const n of byline_properties) {
if (metadata.byline)
break;
metadata.byline = values[n];
}

// get description
metadata.excerpt = jsonld.excerpt ||
Expand All @@ -1553,15 +1613,18 @@ Readability.prototype = {
metadata.publishedTime = jsonld.datePublished ||
values["article:published_time"] ||
values["parsely-pub-date"] ||
values["citation_publication_date"] ||
values["prism:publicationDate"] ||
null;

// in many sites the meta value is escaped with HTML entities,
// so here we need to unescape it
metadata.title = this._unescapeHtmlEntities(metadata.title);
metadata.byline = this._unescapeHtmlEntities(metadata.byline);
metadata.byline = this._normalizeByline(metadata.byline);
metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
this.log(`getArticleMetadata complete: ${JSON.stringify(metadata)}`);

return metadata;
},
Expand Down
47 changes: 28 additions & 19 deletions test/generate-testcase.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
/* eslint-env node, mocha */

var debug = false;
var debug = true;

var path = require("path");
var fs = require("fs");
var JSDOM = require("jsdom").JSDOM;
var prettyPrint = require("./utils").prettyPrint;
var http = require("http");
var https = require("https");
var urlparse = require("url").parse;
var htmltidy = require("htmltidy2").tidy;

Expand Down Expand Up @@ -49,38 +50,46 @@ function generateTestcase(slug) {
});
}

function fetchSource(url, callbackFn) {
if (!url) {
console.error("You should pass a URL if the source doesn't exist yet!");
process.exit(1);
return;
}
var client = http;
if (url.indexOf("https") == 0) {
client = require("https");
}
function getWithRedirects(url, cb) {
var client = (url.indexOf("https") == 0) ? https : http;

var options = urlparse(url);
options.headers = {"User-Agent": FFX_UA};

client.get(options, function(response) {
client.get(options, async (response) => {
if (debug) {
console.log("STATUS:", response.statusCode);
console.log("HEADERS:", JSON.stringify(response.headers));
}

if (response.statusCode > 300 && response.statusCode <= 303) {
if (debug)
console.log("following redirect", response.headers.location);
await getWithRedirects(response.headers.location, cb);
}

response.setEncoding("utf-8");
var rv = "";
response.on("data", function(chunk) {
rv += chunk;
});
response.on("end", function() {
if (debug) {

response.on("data", (chunk) => rv += chunk);

response.on("end", () => {
if (debug)
console.log("End received");
}
sanitizeSource(rv, callbackFn);
cb(rv);
});
});
}

/**
 * Downloads the page at `url` (following redirects via getWithRedirects) and
 * passes the received markup on to sanitizeSource.
 *
 * When no URL is given, an error is printed and the process exits with
 * status 1.
 *
 * @param {string} url Address of the page to fetch; required.
 * @param {Function} callbackFn Forwarded to sanitizeSource together with the
 *        downloaded markup.
 */
function fetchSource(url, callbackFn) {
  if (!url) {
    console.error("You should pass a URL if the source doesn't exist yet!");
    process.exit(1);
    // Explicit bail-out: never fall through to the network call with a
    // missing URL, even if process.exit() has been stubbed out.
    return;
  }

  getWithRedirects(url, (rv) => sanitizeSource(rv, callbackFn));
}

function sanitizeSource(html, callbackFn) {
htmltidy(new JSDOM(html).serialize(), {
"indent": true,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"title": "Dublin Core property title",
"byline": "Dublin Core property author",
"byline": "Dublin Core author",
"dir": null,
"excerpt": "Dublin Core property description",
"siteName": null,
Expand Down
8 changes: 5 additions & 3 deletions test/test-pages/003-metadata-preferred/source.html
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@
<meta property="twitter:title" content="Twitter property title"/>
<meta property="og:title" content="Open Graph property title"/>
<meta name="author" content="Meta name author"/>
<meta name="DC.creator" content="Dublin Core name author"/>
<meta property="dc:creator" content="Dublin Core property author"/>
<meta name="description" content="Meta name description"/>
<!-- now that multiple authors are supported, these have to be identical to prevent them from showing up
as two separate authors -->
<meta name="DC.creator" content="Dublin Core author"/>
<meta property="dc:creator" content="Dublin Core author"/>
<meta name="description" content="Meta name description"/>
<meta name="og:description" content="Open Graph name description"/>
<meta name="twitter:description" content="Twitter name description"/>
<meta name="DC.description" content="Dublin Core name description"/>
Expand Down
2 changes: 1 addition & 1 deletion test/test-pages/ebb-org/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"title": "On Recent Controversial Events - Bradley M. Kuhn ( Brad ) ( bkuhn )",
"byline": "Bradley M. Kuhn (http://ebb.org/bkuhn/)",
"byline": "Bradley M. Kuhn (ebb.org/bkuhn/)",
"dir": null,
"lang": "en-US",
"excerpt": "The website of Bradley M. Kuhn, aka Brad, aka bkuhn. This site includes his GPG keys, resume, blog, projects list, software, interviews, speeches and writing.",
Expand Down
Loading