From dfc7a694bfabf67274f07391a1239dc205e41299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Sat, 31 Aug 2013 18:36:38 -0400 Subject: [PATCH] Improve content grabber and add new rules --- lib/PicoFeed/Grabber.php | 15 +++++++------ lib/PicoFeed/Rules/.wikipedia.org.php | 25 ++++++++++++++++++++++ lib/PicoFeed/Rules/techcrunch.com.php | 12 +++++++++++ lib/PicoFeed/Rules/www.cnn.com.php | 18 ++++++++++++++-- lib/PicoFeed/Rules/www.theguardian.com.php | 9 ++++++++ 5 files changed, 71 insertions(+), 8 deletions(-) create mode 100644 lib/PicoFeed/Rules/.wikipedia.org.php create mode 100644 lib/PicoFeed/Rules/techcrunch.com.php create mode 100644 lib/PicoFeed/Rules/www.theguardian.com.php diff --git a/lib/PicoFeed/Grabber.php b/lib/PicoFeed/Grabber.php index 355e55d..cbfc264 100644 --- a/lib/PicoFeed/Grabber.php +++ b/lib/PicoFeed/Grabber.php @@ -21,6 +21,7 @@ class Grabber 'articlecontent', 'articlePage', 'post-content', + 'entry-content', 'content', 'main', ); @@ -37,6 +38,7 @@ class Grabber 'nav', 'header', 'social', + 'entry-utility', ); public $stripTags = array( @@ -168,17 +170,18 @@ public function parseContentWithCandidates() if ($nodes !== false && $nodes->length > 0) { $this->content = $dom->saveXML($nodes->item(0)); - return; } // Try to lookup in each
- foreach ($this->candidatesAttributes as $candidate) { + if (! $this->content) { - $nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]'); + foreach ($this->candidatesAttributes as $candidate) { - if ($nodes !== false && $nodes->length > 0) { - $this->content = $dom->saveXML($nodes->item(0)); - return; + $nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]'); + + if ($nodes !== false && $nodes->length > 0) { + $this->content = $dom->saveXML($nodes->item(0)); + } } } diff --git a/lib/PicoFeed/Rules/.wikipedia.org.php b/lib/PicoFeed/Rules/.wikipedia.org.php new file mode 100644 index 0000000..ea99ab6 --- /dev/null +++ b/lib/PicoFeed/Rules/.wikipedia.org.php @@ -0,0 +1,25 @@ + 'https://en.wikipedia.org/wiki/Grace_Hopper', + 'body' => array( + '//div[@id="bodyContent"]', + ), + 'strip' => array( + "//div[@id='toc']", + "//div[@id='catlinks']", + "//div[@id='jump-to-nav']", + "//div[@class='thumbcaption']//div[@class='magnify']", + "//table[@class='navbox']", + "//table[contains(@class, 'infobox')]", + "//div[@class='dablink']", + "//div[@id='contentSub']", + "//div[@id='siteSub']", + "//table[@id='persondata']", + "//table[contains(@class, 'metadata')]", + "//*[contains(@class, 'noprint')]", + "//*[contains(@class, 'printfooter')]", + "//*[contains(@class, 'editsection')]", + "//*[contains(@class, 'error')]", + "//span[@title='pronunciation:']", + ), +); diff --git a/lib/PicoFeed/Rules/techcrunch.com.php b/lib/PicoFeed/Rules/techcrunch.com.php new file mode 100644 index 0000000..5ad42ad --- /dev/null +++ b/lib/PicoFeed/Rules/techcrunch.com.php @@ -0,0 +1,12 @@ + 'http://techcrunch.com/2013/08/31/indias-visa-maze/', + 'body' => array( + '//div[contains(@class, "media-container")]', + '//div[@class="body-copy"]', + ), + 'strip' => array( + '//script', + '//style', + ) +); diff --git a/lib/PicoFeed/Rules/www.cnn.com.php b/lib/PicoFeed/Rules/www.cnn.com.php index 2207b7c..472832f 100644 --- a/lib/PicoFeed/Rules/www.cnn.com.php +++ b/lib/PicoFeed/Rules/www.cnn.com.php @@ -2,7 +2,21 @@ return array( 'test_url' => 'http://www.cnn.com/2013/08/31/world/meast/syria-civil-war/index.html?hpt=hp_t1', 'body' => array( - '//*[contains(@class, "cnn_storypgraphtxt")]', - '//*[contains(@class, "cnnvideo_wrapper")]', + '//div[@class="cnn_strycntntlft"]', ), + 'strip' => array( + '//script', + '//style', + '//div[@class="cnn_stryshrwdgtbtm"]', + '//div[@class="cnn_strybtmcntnt"]', + '//div[@class="cnn_strylftcntnt"]', + '//div[contains(@class, "cnnGalleryContainer")]', + '//div[contains(@class, "cnn_strylftcexpbx")]', + '//div[contains(@class, "articleGalleryNavContainer")]', + '//div[contains(@class, "cnnArticleGalleryCaptionControl")]', + '//div[contains(@class, "cnnArticleGalleryNavPrevNextDisabled")]', + '//div[contains(@class, "cnnArticleGalleryNavPrevNext")]', + '//div[contains(@class, "cnn_html_media_title_new")]', + '//div[contains(@id, "disqus")]', + ) ); diff --git a/lib/PicoFeed/Rules/www.theguardian.com.php b/lib/PicoFeed/Rules/www.theguardian.com.php new file mode 100644 index 0000000..ddb0b0a --- /dev/null +++ b/lib/PicoFeed/Rules/www.theguardian.com.php @@ -0,0 +1,9 @@ + 'http://www.theguardian.com/law/2013/aug/31/microsoft-google-sue-us-fisa', + 'body' => array( + '//div[@id="article-wrapper"]', + ), + 'strip' => array( + ), +);