Skip to content

Commit

Permalink
Improve content grabber
Browse files Browse the repository at this point in the history
  • Loading branch information
Frédéric Guillot committed Aug 31, 2013
1 parent b1c0c47 commit 0eec166
Show file tree
Hide file tree
Showing 14 changed files with 161 additions and 31 deletions.
39 changes: 28 additions & 11 deletions lib/PicoFeed/Grabber.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

require_once __DIR__.'/Client.php';
require_once __DIR__.'/Encoding.php';
require_once __DIR__.'/Logging.php';

class Grabber
{
Expand All @@ -17,6 +18,7 @@ class Grabber
'articlebody',
'articleContent',
'articlecontent',
'articlePage',
'post-content',
'content',
'main',
Expand Down Expand Up @@ -52,45 +54,60 @@ public function __construct($url)
}


public function download($timeout = 5, $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)')
public function parse()
{
$client = Client::create();
$client->url = $this->url;
$client->timeout = $timeout;
$client->user_agent = $user_agent;
$client->execute();

$this->html = $client->getContent();
$this->url = $client->getUrl();

if ($this->html) {

$this->html = Encoding::toUTF8($this->html);
Logging::log(\get_called_class().' HTML fetched');

$rules = $this->getRules();

\libxml_use_internal_errors(true);
$dom = new \DOMDocument;
$dom->loadHTML($this->html);

if (is_array($rules)) {
Logging::log(\get_called_class().' Parse content with rules');
$this->parseContentWithRules($dom, $rules);
}
else {

Logging::log(\get_called_class().' Parse content with candidates');
$this->parseContentWithCandidates($dom);

if (strlen($this->content) < 50) {
Logging::log(\get_called_class().' No enought content fetched, get the full body');
$this->content = $dom->saveXML($dom->firstChild);
}

Logging::log(\get_called_class().' Strip garbage');
$this->stripGarbage();
}
}
else {

Logging::log(\get_called_class().' No content fetched');
}

Logging::log(\get_called_class().' Grabber done');

return $this->content !== '';
}


/**
 * Download the page at $this->url and keep its raw HTML in $this->html.
 *
 * A null argument is treated as "use the default". Callers such as
 * Parser::filterHtml() forward optional settings that are null unless
 * configured, and an explicit null would otherwise bypass the parameter
 * defaults and reach the HTTP client as a null timeout / user agent.
 *
 * @param  int|null     $timeout     Timeout handed to the HTTP client (default 5)
 * @param  string|null  $user_agent  User agent handed to the HTTP client (default: a desktop browser string)
 * @return mixed        Whatever Client::getContent() returned (also stored in $this->html)
 */
public function download($timeout = 5, $user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36')
{
    // Explicit nulls skip PHP's parameter defaults, so restore them by hand.
    if ($timeout === null) {
        $timeout = 5;
    }

    if ($user_agent === null) {
        $user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36';
    }

    $client = Client::create();
    $client->url = $this->url;
    $client->timeout = $timeout;
    $client->user_agent = $user_agent;
    $client->execute();

    // $this->url is deliberately left untouched here (no redirect follow-up).
    $this->html = $client->getContent();

    return $this->html;
}


public function getRules()
{
$hostname = parse_url($this->url, PHP_URL_HOST);
Expand Down
10 changes: 7 additions & 3 deletions lib/PicoFeed/Parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
require_once __DIR__.'/Logging.php';
require_once __DIR__.'/Filter.php';
require_once __DIR__.'/Encoding.php';
require_once __DIR__.'/Grabber.php';

abstract class Parser
{
Expand All @@ -16,6 +17,9 @@ abstract class Parser
public $updated = '';
public $items = array();
public $grabber = false;
public $grabber_ignore_urls = array();
public $grabber_timeout = null;
public $grabber_user_agent = null;


abstract public function execute();
Expand All @@ -38,10 +42,10 @@ public function filterHtml($item_content, $item_url)
{
$content = '';

if ($this->grabber) {
if ($this->grabber && ! in_array($item_url, $this->grabber_ignore_urls)) {
$grabber = new Grabber($item_url);
$grabber->download();
if ($grabber->content) $item_content = $grabber->content;
$grabber->download($this->grabber_timeout, $this->grabber_user_agent);
if ($grabber->parse()) $item_content = $grabber->content;
}

if ($item_content) {
Expand Down
10 changes: 10 additions & 0 deletions lib/PicoFeed/Rules/.blog.lemonde.fr.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?php
// Grabber rule set stored as lib/PicoFeed/Rules/.blog.lemonde.fr.php.
// NOTE(review): the leading dot in the file name presumably makes it apply to
// any *.blog.lemonde.fr host; confirm against Grabber::getRules().
//
// 'test_url' : sample article; the test suite downloads and parses every
//              rule file's test_url and asserts that content was extracted.
// 'body'     : XPath expression(s) selecting the article content.
// 'strip'    : XPath expression(s) for unwanted nodes (here: Facebook-like
//              and social widgets, matched by class substring).
return array(
'test_url' => 'http://combat.blog.lemonde.fr/2013/08/31/teddy-riner-le-rookie-devenu-rambo/#xtor=RSS-3208',
'body' => array(
'//div[@class="entry-content"]',
),
'strip' => array(
'//*[contains(@class, "fb-like") or contains(@class, "social")]'
)
);
8 changes: 0 additions & 8 deletions lib/PicoFeed/Rules/.ctv.ca.php

This file was deleted.

2 changes: 1 addition & 1 deletion lib/PicoFeed/Rules/.nytimes.com.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
'test_url' => 'http://www.nytimes.com/2011/05/15/world/middleeast/15prince.html',
'title' => '//h1[@class="articleHeadline"]',
'body' => array(
'//div[@class="articleBody"]',
'//div[@class="articleBody"]',
),
);
16 changes: 16 additions & 0 deletions lib/PicoFeed/Rules/.slate.com.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?php
// Grabber rule set stored as lib/PicoFeed/Rules/.slate.com.php.
//
// 'test_url' : sample article; the test suite downloads and parses every
//              rule file's test_url and asserts that content was extracted.
// 'body'     : XPath expression(s) selecting the article content.
// 'strip'    : XPath expression(s) for unwanted nodes: social/comment/tool
//              widgets and pagination (class substring matches), MySlate
//              elements (by id), and viral/credits/ad containers (exact class).
return array(
'test_url' => 'http://www.slate.com/articles/business/moneybox/2013/08/microsoft_ceo_steve_ballmer_retires_a_firsthand_account_of_the_company_s.html',
'body' => array(
'//div[@class="sl-art-body"]',
),
'strip' => array(
'//*[contains(@class, "social") or contains(@class, "comments") or contains(@class, "sl-article-floatin-tools") or contains(@class, "sl-art-pag")]',
'//*[@id="mys_slate_logged_in"]',
'//*[@id="sl_article_tools_myslate_bottom"]',
'//*[@id="mys_myslate"]',
'//*[@class="sl-viral-container"]',
'//*[@class="sl-art-creds-cntr"]',
'//*[@class="sl-art-ad-midflex"]',
)
);
11 changes: 11 additions & 0 deletions lib/PicoFeed/Rules/.wsj.com.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?php
// Grabber rule set stored as lib/PicoFeed/Rules/.wsj.com.php.
//
// 'test_url' : sample article; the test suite downloads and parses every
//              rule file's test_url and asserts that content was extracted.
// 'body'     : XPath expression(s) selecting the article content.
// 'strip'    : XPath expression(s) for unwanted nodes (an article thumbnail
//              by id and the social byline by class).
return array(
'test_url' => 'http://online.wsj.com/article/SB10001424127887324108204579023143974408428.html',
'body' => array(
'//div[@class="articlePage"]',
),
'strip' => array(
'//*[@id="articleThumbnail_2"]',
'//*[@class="socialByline"]',
)
);
9 changes: 9 additions & 0 deletions lib/PicoFeed/Rules/rue89.feedsportal.com.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?php
// Grabber rule set stored as lib/PicoFeed/Rules/rue89.feedsportal.com.php.
// The rule is filed under the feed proxy host (feedsportal), which is also
// the host of the test_url below.
//
// 'test_url' : sample article; the test suite downloads and parses every
//              rule file's test_url and asserts that content was extracted.
// 'body'     : XPath expression(s) selecting the article content.
// 'strip'    : XPath expression(s) for unwanted nodes; intentionally empty here.
return array(
'test_url' => 'http://rue89.feedsportal.com/c/33822/f/608948/s/30999fa0/sc/24/l/0L0Srue890N0C20A130C0A80C30A0Cfaisait0Eboris0Eboillon0Eex0Esarko0Eboy0E350A0E0A0A0A0Eeuros0Egare0Enord0E245315/story01.htm',
'body' => array(
'//*[@id="article"]/div[contains(@class, "content")]',
),
'strip' => array(
)
);
20 changes: 20 additions & 0 deletions lib/PicoFeed/Rules/www.bbc.co.uk.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?php
// Grabber rule set stored as lib/PicoFeed/Rules/www.bbc.co.uk.php.
//
// 'test_url' : sample article; the test suite downloads and parses every
//              rule file's test_url and asserts that content was extracted.
// 'body'     : XPath expression(s) selecting the article content.
// 'strip'    : XPath expression(s) for unwanted nodes: scripts, forms and
//              styles, story date/header/related blocks, bylines, feature
//              boxes, the video carousel, related links, and anything whose
//              class contains "share", "hidden" or "hyper".
return array(
'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833',
'body' => array(
'//div[@class="story-body"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//*[@class="story-date"]',
'//*[@class="story-header"]',
'//*[@class="story-related"]',
'//*[contains(@class, "byline")]',
'//*[contains(@class, "story-feature")]',
'//*[@id="video-carousel-container"]',
'//*[@id="also-related-links"]',
'//*[contains(@class, "share") or contains(@class, "hidden") or contains(@class, "hyper")]',
)
);
8 changes: 8 additions & 0 deletions lib/PicoFeed/Rules/www.cnn.com.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?php
// Grabber rule set stored as lib/PicoFeed/Rules/www.cnn.com.php.
//
// 'test_url' : sample article; the test suite downloads and parses every
//              rule file's test_url and asserts that content was extracted.
// 'body'     : XPath expression(s) selecting the article content (story
//              paragraphs and the video wrapper, matched by class substring).
//
// Each predicate must be closed by exactly one "]": a trailing "]]" makes the
// XPath expression malformed, so the query fails instead of selecting nodes.
return array(
    'test_url' => 'http://www.cnn.com/2013/08/31/world/meast/syria-civil-war/index.html?hpt=hp_t1',
    'body' => array(
        '//*[contains(@class, "cnn_storypgraphtxt")]',
        '//*[contains(@class, "cnnvideo_wrapper")]',
    ),
);
9 changes: 9 additions & 0 deletions lib/PicoFeed/Rules/www.lemonde.fr.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?php
// Grabber rule set stored as lib/PicoFeed/Rules/www.lemonde.fr.php.
//
// 'test_url' : sample article; the test suite downloads and parses every
//              rule file's test_url and asserts that content was extracted.
// 'body'     : XPath expression(s) selecting the article content.
// 'strip'    : XPath expression(s) for unwanted nodes; intentionally empty here.
return array(
'test_url' => 'http://www.lemonde.fr/societe/article/2013/08/30/boris-boillon-ancien-ambassadeur-de-sarkozy-arrete-avec-350-000-euros-en-liquide_3469109_3224.html',
'body' => array(
'//div[@id="articleBody"]',
),
'strip' => array(
),
);
10 changes: 10 additions & 0 deletions lib/PicoFeed/Rules/www.numerama.com.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?php
// Grabber rule set stored as lib/PicoFeed/Rules/www.numerama.com.php.
//
// 'test_url' : sample article; the test suite downloads and parses every
//              rule file's test_url and asserts that content was extracted.
// 'body'     : XPath expression(s) selecting the article content: a
//              positional path to an <h2> plus the "newstext" div.
// 'strip'    : XPath expression(s) for unwanted nodes; intentionally empty here.
//
// NOTE(review): the first 'body' path contains a /tbody/ step and a fixed
// div[6] index. Browsers add <tbody> to their DOM even when the markup omits
// it, while DOMDocument::loadHTML() may not, so a path copied from browser
// dev tools can silently fail to match. Verify it against the raw HTML.
return array(
'test_url' => 'http://www.numerama.com/magazine/26857-bientot-des-robots-dans-les-cuisines-de-mcdo.html',
'body' => array(
'//*[@id="general_content"]/table/tbody/tr/td[1]/div/div/div[6]/h2',
'//div[@id="newstext"]',
),
'strip' => array(
)
);
17 changes: 17 additions & 0 deletions lib/PicoFeed/Rules/www.slate.fr.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?php
// Grabber rule set stored as lib/PicoFeed/Rules/www.slate.fr.php.
//
// 'test_url' : sample article; the test suite downloads and parses every
//              rule file's test_url and asserts that content was extracted.
// 'body'     : XPath expression(s) selecting the article content.
// 'strip'    : XPath expression(s) for unwanted nodes: scripts and styles,
//              associated/sidebar/ad blocks (by id), social-network divs
//              (id contains "reseaux"), and tag, reaction, related-item,
//              Outbrain and share widgets (class substring matches).
return array(
'test_url' => 'http://www.slate.fr/monde/77034/allemagne-2013-couacs-campagne',
'body' => array(
'//div[@class="article_content"]',
),
'strip' => array(
'//script',
'//style',
'//*[@id="slate_associated_bn"]',
'//*[@id="ligatus-article"]',
'//*[@id="article_sidebar"]',
'//div[contains(@id, "reseaux")]',
'//*[contains(@class, "smart") or contains(@class, "article_tags") or contains(@class, "article_reactions")]',
'//*[contains(@class, "OUTBRAIN") or contains(@class, "related_item") or contains(@class, "share")]',
)
);
23 changes: 15 additions & 8 deletions tests/GrabberTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,24 @@ class GrabberTest extends PHPUnit_Framework_TestCase
public function testGrabContentWithCandidates()
{
$grabber = new Grabber('http://www.lemonde.fr/proche-orient/article/2013/08/30/la-france-nouvelle-plus-ancienne-alliee-des-etats-unis_3469218_3218.html');
$this->assertTrue($grabber->download());
$grabber->download();
$this->assertTrue($grabber->parse());

$grabber = new Grabber('http://www.rue89.com/2013/08/30/faisait-boris-boillon-ex-sarko-boy-350-000-euros-gare-nord-245315');
$this->assertTrue($grabber->download());
$grabber->download();
$this->assertTrue($grabber->parse());

$grabber = new Grabber('http://montreal.ctvnews.ca/quebec-premier-has-positive-words-for-enbridge-pipeline-project-1.1432695');
$this->assertTrue($grabber->download());
$grabber->download();
$this->assertTrue($grabber->parse());

$grabber = new Grabber('http://www.inc.com/suzanne-lucas/why-employee-turnover-is-so-costly.html');
$this->assertTrue($grabber->download());
$grabber->download();
$this->assertTrue($grabber->parse());

$grabber = new Grabber('http://arstechnica.com/information-technology/2013/08/sysadmin-security-fail-nsa-finds-snowden-hijacked-officials-logins/');
$this->assertTrue($grabber->download());
$grabber->download();
$this->assertTrue($grabber->parse());

//var_dump($grabber->content);
}
Expand All @@ -37,7 +42,8 @@ public function testGetRules()
public function testGrabContent()
{
$grabber = new Grabber('http://www.egscomics.com/index.php?id=1690');
$this->assertTrue($grabber->download());
$grabber->download();
$this->assertTrue($grabber->parse());

$this->assertEquals('<img title="2013-08-22" src="comics/../comics/1377151029-2013-08-22.png" id="comic" border="0" />', $grabber->content);
}
Expand Down Expand Up @@ -70,11 +76,12 @@ public function testAllFilters()
if (isset($rule['test_url'])) {

$grabber = new Grabber($rule['test_url']);
$r = $grabber->download();
$grabber->download();
$r = $grabber->parse();

if (! $r) {
var_dump($rule);
var_dump($grabber->content);
//var_dump($grabber->content);
}

$this->assertTrue($r);
Expand Down

0 comments on commit 0eec166

Please sign in to comment.