Skip to content

Commit

Permalink
Improve content grabber and add new rules
Browse files Browse the repository at this point in the history
  • Loading branch information
Frédéric Guillot committed Aug 31, 2013
1 parent c1124d2 commit dfc7a69
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 8 deletions.
15 changes: 9 additions & 6 deletions lib/PicoFeed/Grabber.php
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class Grabber
'articlecontent',
'articlePage',
'post-content',
'entry-content',
'content',
'main',
);
Expand All @@ -37,6 +38,7 @@ class Grabber
'nav',
'header',
'social',
'entry-utility',
);

public $stripTags = array(
Expand Down Expand Up @@ -168,17 +170,18 @@ public function parseContentWithCandidates()

if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
return;
}

// Try to lookup in each <div/>
foreach ($this->candidatesAttributes as $candidate) {
if (! $this->content) {

$nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
foreach ($this->candidatesAttributes as $candidate) {

if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
return;
$nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
}
}
}

Expand Down
25 changes: 25 additions & 0 deletions lib/PicoFeed/Rules/.wikipedia.org.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<?php
// Site rule file: returns an array with a sample URL ('test_url') and two lists
// of XPath expressions ('body' and 'strip').
// NOTE(review): presumably loaded by the content grabber, with 'body' selecting
// the nodes to keep and 'strip' selecting nodes to remove from them — confirm
// against the code that includes this file.
return array(
// Sample article URL; presumably used to try this rule out — TODO confirm.
'test_url' => 'https://en.wikipedia.org/wiki/Grace_Hopper',
// Matches the <div> whose id attribute is "bodyContent".
'body' => array(
'//div[@id="bodyContent"]',
),
// XPath note: @class='x' / @id='x' only match when the whole attribute value
// equals 'x', while contains(@class, 'x') matches any class attribute that has
// 'x' as a substring (so it also covers elements carrying several classes).
'strip' => array(
"//div[@id='toc']",
"//div[@id='catlinks']",
"//div[@id='jump-to-nav']",
// Only the 'magnify' div nested somewhere inside a 'thumbcaption' div.
"//div[@class='thumbcaption']//div[@class='magnify']",
"//table[@class='navbox']",
"//table[contains(@class, 'infobox')]",
"//div[@class='dablink']",
"//div[@id='contentSub']",
"//div[@id='siteSub']",
"//table[@id='persondata']",
"//table[contains(@class, 'metadata')]",
// The following expressions apply to any element name, not just div/table.
"//*[contains(@class, 'noprint')]",
"//*[contains(@class, 'printfooter')]",
"//*[contains(@class, 'editsection')]",
"//*[contains(@class, 'error')]",
// <span> elements whose title attribute is exactly "pronunciation:".
"//span[@title='pronunciation:']",
),
);
12 changes: 12 additions & 0 deletions lib/PicoFeed/Rules/techcrunch.com.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<?php
// Site rule file: returns an array with a sample URL ('test_url') and two lists
// of XPath expressions ('body' and 'strip').
// NOTE(review): presumably loaded by the content grabber, with 'body' selecting
// the nodes to keep and 'strip' selecting nodes to remove — confirm against the
// code that includes this file.
return array(
// Sample article URL; presumably used to try this rule out — TODO confirm.
'test_url' => 'http://techcrunch.com/2013/08/31/indias-visa-maze/',
// Two expressions: any <div> whose class attribute contains "media-container",
// and any <div> whose class attribute is exactly "body-copy".
'body' => array(
'//div[contains(@class, "media-container")]',
'//div[@class="body-copy"]',
),
// Matches every <script> and <style> element anywhere in the document.
'strip' => array(
'//script',
'//style',
)
);
18 changes: 16 additions & 2 deletions lib/PicoFeed/Rules/www.cnn.com.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,21 @@
return array(
'test_url' => 'http://www.cnn.com/2013/08/31/world/meast/syria-civil-war/index.html?hpt=hp_t1',
'body' => array(
'//*[contains(@class, "cnn_storypgraphtxt")]',
'//*[contains(@class, "cnnvideo_wrapper")]',
'//div[@class="cnn_strycntntlft"]',
),
'strip' => array(
'//script',
'//style',
'//div[@class="cnn_stryshrwdgtbtm"]',
'//div[@class="cnn_strybtmcntnt"]',
'//div[@class="cnn_strylftcntnt"]',
'//div[contains(@class, "cnnGalleryContainer")]',
'//div[contains(@class, "cnn_strylftcexpbx")]',
'//div[contains(@class, "articleGalleryNavContainer")]',
'//div[contains(@class, "cnnArticleGalleryCaptionControl")]',
'//div[contains(@class, "cnnArticleGalleryNavPrevNextDisabled")]',
'//div[contains(@class, "cnnArticleGalleryNavPrevNext")]',
'//div[contains(@class, "cnn_html_media_title_new")]',
'//div[contains(@id, "disqus")]',
)
);
9 changes: 9 additions & 0 deletions lib/PicoFeed/Rules/www.theguardian.com.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?php
// Site rule file: returns an array with a sample URL ('test_url') and two lists
// of XPath expressions ('body' and 'strip').
// NOTE(review): presumably loaded by the content grabber, with 'body' selecting
// the nodes to keep and 'strip' selecting nodes to remove — confirm against the
// code that includes this file.
return array(
// Sample article URL; presumably used to try this rule out — TODO confirm.
'test_url' => 'http://www.theguardian.com/law/2013/aug/31/microsoft-google-sue-us-fisa',
// Matches the <div> whose id attribute is "article-wrapper".
'body' => array(
'//div[@id="article-wrapper"]',
),
// Intentionally empty list: no strip expressions are defined for this site.
'strip' => array(
),
);

0 comments on commit dfc7a69

Please sign in to comment.