From 110c4ad1d20a9617c47c9aa85bfc2e27bf8e673a Mon Sep 17 00:00:00 2001 From: "J. King" Date: Wed, 21 Aug 2024 21:02:46 -0400 Subject: [PATCH 1/2] Clean up byline validation - Consolidate _checkByline and _isValidByline into one function - Avoid side effects and short circuiting - Move pre-existence checks and side effects out to _grabArticle - Add basic documentation --- Readability.js | 57 +++++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 35 deletions(-) diff --git a/Readability.js b/Readability.js index 535afffd..dfb46407 100644 --- a/Readability.js +++ b/Readability.js @@ -978,27 +978,25 @@ Readability.prototype = { return 1 - distanceB; }, - _checkByline(node, matchString) { - if (this._articleByline || this._metadata.byline) { - return false; - } - - if (node.getAttribute !== undefined) { - var rel = node.getAttribute("rel"); - var itemprop = node.getAttribute("itemprop"); - } + /** + * Checks whether an element node contains a valid byline + * + * @param node {Element} + * @param matchString {string} + * @return boolean + */ + _isValidByline(node, matchString) { + var rel = node.getAttribute("rel"); + var itemprop = node.getAttribute("itemprop"); + var byline = node.textContent.trim(); - if ( + return ( (rel === "author" || (itemprop && itemprop.includes("author")) || this.REGEXPS.byline.test(matchString)) && - this._isValidByline(node.textContent) - ) { - this._articleByline = node.textContent.trim(); - return true; - } - - return false; + !!byline.length && + byline.length < 100 + ); }, _getNodeAncestors(node, maxDepth) { @@ -1073,8 +1071,13 @@ Readability.prototype = { continue; } - // Check to see if this node is a byline, and remove it if it is. - if (this._checkByline(node, matchString)) { + // If we don't have a byline yet check to see if this node is a byline; if it is store the byline and remove the node. + if ( + !this._articleByline && + !this._metadata.byline && + this._isValidByline(node, matchString) + ) { + this._articleByline = node.textContent.trim(); node = this._removeAndGetNext(node); continue; } @@ -1573,22 +1576,6 @@ Readability.prototype = { } }, - /** - * Check whether the input string could be a byline. - * This verifies that the input is a string, and that the length - * is less than 100 chars. - * - * @param possibleByline {string} - a string to check whether its a byline. - * @return Boolean - whether the input string is a byline. - */ - _isValidByline(byline) { - if (typeof byline == "string" || byline instanceof String) { - byline = byline.trim(); - return !!byline.length && byline.length < 100; - } - return false; - }, - /** * Converts some of the common HTML entities in string to their corresponding characters. * From 41282873b6a88626b970308681f6b423e872218e Mon Sep 17 00:00:00 2001 From: "J. King" Date: Thu, 22 Aug 2024 08:35:03 -0400 Subject: [PATCH 2/2] Make clearer that byline validity only cares about text length --- Readability.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Readability.js b/Readability.js index dfb46407..9684e41d 100644 --- a/Readability.js +++ b/Readability.js @@ -988,14 +988,14 @@ Readability.prototype = { _isValidByline(node, matchString) { var rel = node.getAttribute("rel"); var itemprop = node.getAttribute("itemprop"); - var byline = node.textContent.trim(); + var bylineLength = node.textContent.trim().length; return ( (rel === "author" || (itemprop && itemprop.includes("author")) || this.REGEXPS.byline.test(matchString)) && - !!byline.length && - byline.length < 100 + !!bylineLength && + bylineLength < 100 ); },