Skip to content

Commit

Permalink
WIP: add citation, prism, and dc metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
inhumantsar committed Jun 11, 2024
1 parent 5abeedd commit 740ddd3
Show file tree
Hide file tree
Showing 8 changed files with 4,160 additions and 23 deletions.
15 changes: 13 additions & 2 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -1383,6 +1383,14 @@ Readability.prototype = {
// Strip CDATA markers if present
var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, "");
var parsed = JSON.parse(content);

// Some sites (academic journals, for example) keep the metadata for the article or paper itself separate from
// the site's own metadata by nesting it under `mainEntity`. E.g. Nature's top-level object has only @context,
// @type (WebPage), and mainEntity, so *all* relevant metadata would be invisible without this.
if (parsed["mainEntity"]) {
parsed = parsed["mainEntity"];
}

if (
!parsed["@context"] ||
!parsed["@context"].match(/^https?\:\/\/schema\.org\/?$/)
Expand Down Expand Up @@ -1479,7 +1487,7 @@ Readability.prototype = {
var propertyPattern = /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;

// name is a single value
var namePattern = /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i;
var namePattern = /^\s*(?:(prism|citation|dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-_\.:]\s*)?(author|creator|pub-date|publicationDate|publication|description|title|site_name)\s*$/i;

// Find description tags.
this._forEachNode(metaElements, function(element) {
Expand Down Expand Up @@ -1533,7 +1541,8 @@ Readability.prototype = {
values["dc:creator"] ||
values["dcterm:creator"] ||
values["author"] ||
values["parsely-author"];
values["parsely-author"] ||
values["citation_author"];

// get description
metadata.excerpt = jsonld.excerpt ||
Expand All @@ -1553,6 +1562,8 @@ Readability.prototype = {
metadata.publishedTime = jsonld.datePublished ||
values["article:published_time"] ||
values["parsely-pub-date"] ||
values["citation_publication_date"] ||
values["prism:publicationDate"] ||
null;

// in many sites the meta value is escaped with HTML entities,
Expand Down
49 changes: 28 additions & 21 deletions test/generate-testcase.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
/* eslint-env node, mocha */

// Enables verbose logging of the fetch (status code, response headers,
// redirects, end-of-stream). Keep this off in committed code; flip it to
// true locally when debugging a test-case download.
var debug = false;

var path = require("path");
var fs = require("fs");
var JSDOM = require("jsdom").JSDOM;
var prettyPrint = require("./utils").prettyPrint;
var http = require("http");
var https = require("https");
var urlparse = require("url").parse;
var htmltidy = require("htmltidy2").tidy;

Expand Down Expand Up @@ -49,38 +50,44 @@ function generateTestcase(slug) {
});
}

function fetchSource(url, callbackFn) {
if (!url) {
console.error("You should pass a URL if the source doesn't exist yet!");
process.exit(1);
return;
}
var client = http;
if (url.indexOf("https") == 0) {
client = require("https");
}
/**
 * Fetch `url` over HTTP or HTTPS, following redirects, and hand the final
 * response body to `cb` as a single string.
 *
 * @param {string} url - Absolute http(s) URL to download.
 * @param {function(string): void} cb - Called exactly once with the body of
 *   the final (non-redirect) response.
 * @param {number} [redirectsLeft=10] - How many more redirects may be
 *   followed before giving up; guards against redirect loops.
 */
function getWithRedirects(url, cb, redirectsLeft) {
  if (redirectsLeft === undefined) {
    redirectsLeft = 10;
  }

  var client = (url.indexOf("https") === 0) ? https : http;

  var options = urlparse(url);
  options.headers = {"User-Agent": FFX_UA};

  var request = client.get(options, function(response) {
    if (debug) {
      console.log("STATUS:", response.statusCode);
      console.log("HEADERS:", JSON.stringify(response.headers));
    }

    var location = response.headers.location;
    var isRedirect = [301, 302, 303, 307, 308].indexOf(response.statusCode) !== -1;

    if (isRedirect && location) {
      // Discard the redirect response's body so the socket is released.
      response.resume();

      if (redirectsLeft <= 0) {
        console.error("Too many redirects while fetching", url);
        process.exit(1);
      }

      // Location may be relative; resolve it against the URL we just requested.
      var target = new URL(location, url).toString();
      if (debug) {
        console.log("following redirect", target);
      }
      getWithRedirects(target, cb, redirectsLeft - 1);

      // Stop here: without this return the redirect response's own body would
      // also be collected and `cb` would fire a second time.
      return;
    }

    response.setEncoding("utf-8");
    var rv = "";

    response.on("data", function(chunk) {
      rv += chunk;
    });

    response.on("end", function() {
      if (debug) {
        console.log("End received");
      }
      cb(rv);
    });
  });

  // Without a listener a failed request (DNS, refused connection, ...) surfaces
  // as an uncaught 'error' event; report it the same way fetchSource reports
  // a missing URL.
  request.on("error", function(err) {
    console.error("Failed to fetch", url + ":", err.message);
    process.exit(1);
  });
}

/**
 * Download the page at `url` and pass its HTML through sanitizeSource, which
 * receives `callbackFn` as its second argument.
 *
 * Exits the process with status 1 when no URL was supplied.
 *
 * @param {string} url - Address of the page to download.
 * @param {Function} callbackFn - Forwarded unchanged to sanitizeSource.
 */
function fetchSource(url, callbackFn) {
  var onBodyReceived = function(body) {
    return sanitizeSource(body, callbackFn);
  };

  if (!url) {
    console.error("You should pass a URL if the source doesn't exist yet!");
    process.exit(1);
  }

  getWithRedirects(url, onBodyReceived);
}

function sanitizeSource(html, callbackFn) {
htmltidy(new JSDOM(html).serialize(), {
"indent": true,
Expand Down Expand Up @@ -185,4 +192,4 @@ if (process.argv[2] === "all") {
});
} else {
generateTestcase(process.argv[2]);
}
}
10 changes: 10 additions & 0 deletions test/test-pages/nature/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"title": "Worldwide divergence of values",
"byline": "Medvedev, Danila",
"dir": null,
"lang": "en",
"excerpt": "Social scientists have long debated the nature of cultural change in a modernizing and globalizing world. Some scholars predicted that national cultures would converge by adopting social values typical of Western democracies. Others predicted that cultural differences in values would persist or even increase over time. We test these competing predictions by analyzing survey data from 1981 to 2022 (n = 406,185) from 76 national cultures. We find evidence of global value divergence. Values emphasizing tolerance and self-expression have diverged most sharply, especially between high-income Western countries and the rest of the world. We also find that countries with similar per-capita GDP levels have held similar values over the last 40 years. Over time, however, geographic proximity has emerged as an increasingly strong correlate of value similarity, indicating that values have diverged globally but converged regionally. The authors test whether social values have become converged or diverged across national cultures over the last 40 years using a 76-country analysis of the World Values Survey. They show that values have diverged, especially between high-income Western countries and the rest of the world.",
"siteName": "Nature",
"publishedTime": null,
"readerable": true
}
702 changes: 702 additions & 0 deletions test/test-pages/nature/expected.html

Large diffs are not rendered by default.

2,622 changes: 2,622 additions & 0 deletions test/test-pages/nature/source.html

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions test/test-pages/ourworldindata/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"title": "Why do we need to know about progress if we are concerned about the world's largest problems?",
"byline": "By: Max Roser",
"dir": null,
"excerpt": "Why have we made it our mission to publish “research and data to make progress against the world’s largest problems”?",
"siteName": "Our World in Data",
"publishedTime": null,
"readerable": true
}
Loading

0 comments on commit 740ddd3

Please sign in to comment.