Skip to content

Commit

Permalink
first commit.
Browse files Browse the repository at this point in the history
  • Loading branch information
couhie committed Dec 6, 2012
0 parents commit f3023fb
Show file tree
Hide file tree
Showing 2 changed files with 290 additions and 0 deletions.
185 changes: 185 additions & 0 deletions Crawler.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
<?php
require_once(dirname(__FILE__).DIRECTORY_SEPARATOR.'Dommer.php');

/**
 * Minimal HTTP crawler built on PHP's HTTP stream wrapper.
 *
 * Keeps a cookie jar and the location of the last requested page so that
 * relative links and redirects can be followed across requests. Redirects
 * (HTTP status and <meta http-equiv="refresh">) are followed by the class
 * itself, always with GET; the stream wrapper's own redirect handling is
 * disabled so that the intermediate response headers stay observable.
 */
class Crawler {

    /** Maximum number of redirects followed for a single get()/post() call. */
    const MAX_REDIRECTS = 20;

    /** @var array Raw header lines of the most recent response. */
    private $headerArray = array();
    /** @var string Value sent as the User-Agent request header. */
    private $userAgent = '';
    /** @var array Cookie jar: cookie name => value. */
    private $cookies = array();
    /** @var string Host (and optional port) of the last absolute URL. */
    private $lastDomain = '';
    /** @var string Scheme ("http" or "https") of the last absolute URL. */
    private $lastProtocol = '';
    /** @var string Directory part of the last URL's path, without trailing slash. */
    private $lastPath = '';

    /**
     * __construct
     *
     * @param int|null    $timeout_second Socket timeout in seconds (default 600).
     *                                    Applied process-wide via default_socket_timeout.
     * @param string|null $userAgent      User-Agent string (default: a Firefox 15 signature).
     */
    public function __construct($timeout_second = null, $userAgent = null) {
        if (is_null($timeout_second)) {
            $timeout_second = 600;
        }
        if (is_null($userAgent)) {
            $userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:15.0) Gecko/20100101 Firefox/15.0';
        }
        ini_set('default_socket_timeout', (string)$timeout_second);
        $this->initCookies();
        $this->userAgent = $userAgent;
    }

    /**
     * Empties the cookie jar.
     */
    public function initCookies() {
        $this->cookies = array();
    }

    /**
     * Replaces the User-Agent sent with subsequent requests.
     *
     * @param string $userAgent
     */
    public function setUserAgent($userAgent) {
        $this->userAgent = $userAgent;
    }

    /**
     * Issues a GET request and follows any redirects.
     *
     * @param string $url    Absolute URL, or a URL relative to the previous request.
     * @param string $encode Character encoding of the response body, handed to Dommer.
     * @param bool   $parse  When true, a non-empty body is wrapped in a Dommer.
     * @return Dommer|string|false Parsed document, raw body, or false on failure.
     */
    public function get($url, $encode = 'utf8', $parse = true) {
        $sc = $this->createStreamContext('GET');
        return $this->request($url, $sc, $encode, $parse);
    }

    /**
     * Issues a POST request and follows any redirects (with GET).
     *
     * @param string            $url    Absolute URL, or a URL relative to the previous request.
     * @param string            $encode Character encoding of the response body, handed to Dommer.
     * @param array|string|null $data   Form fields (array) or a pre-encoded body.
     * @param bool              $parse  When true, a non-empty body is wrapped in a Dommer.
     * @return Dommer|string|false Parsed document, raw body, or false on failure.
     */
    public function post($url, $encode = 'utf8', $data = null, $parse = true) {
        $sc = $this->createStreamContext('POST', $data);
        return $this->request($url, $sc, $encode, $parse);
    }

    /**
     * Collects the values of every header named $key in the last response.
     *
     * @param string $key              Header name; matched case-insensitively.
     * @param bool   $firstSegmentOnly When true, only the text before the first
     *                                 ';' is returned (e.g. the name=value part
     *                                 of Set-Cookie). Pass false for headers
     *                                 whose value may legitimately contain ';'.
     * @return array List of header values, in response order.
     */
    private function getHeaderValue($key, $firstSegmentOnly = true) {
        $values = array();
        // Header names are case-insensitive and the key must not be treated as regex.
        $pattern = '|^' . preg_quote($key, '|') . ':\s*(.*)$|i';
        foreach ($this->headerArray as $header) {
            if (!preg_match($pattern, $header, $found)) {
                continue;
            }
            $value = $found[1];
            if ($firstSegmentOnly) {
                $segments = explode(';', $value, 2);
                $value = $segments[0];
            }
            $values[] = trim($value);
        }
        return $values;
    }

    /**
     * Determines where the last response wants the client to go next.
     *
     * A <meta http-equiv="refresh"> in the body takes precedence over a
     * Location header, as it is evaluated last.
     *
     * @param string|false $body Response body (false when the request failed
     *                           or was aborted by the wrapper on a redirect).
     * @return string|false Redirect target, or false when there is none.
     */
    private function getRedirect($body) {
        $location = false;
        $status = '';
        foreach ($this->headerArray as $header) {
            // Accept any HTTP version; the last status line seen wins.
            if (preg_match('#^HTTP/\d+(?:\.\d+)?\s+(\d{3})\b#', $header, $found)) {
                $status = $found[1];
            }
        }
        if (in_array($status, array('301', '302', '303', '307', '308'), true)) {
            // Location URLs may contain ';' (e.g. ";jsessionid=..."), so keep the full value.
            $locations = $this->getHeaderValue('Location', false);
            if (!empty($locations) && $locations[0] !== '') {
                $location = $locations[0];
            }
        }
        $metaRefresh = '#<meta\s+http-equiv="refresh"\s+[^>]*?content="[^"]*?url=\s*([^"> ]+)[^"]*"\s*/?>#is';
        if (preg_match($metaRefresh, (string)$body, $found) === 1) {
            // Attribute values are entity-encoded and the URL may be wrapped in single quotes.
            $target = trim(html_entity_decode($found[1], ENT_QUOTES, 'UTF-8'), "'");
            if ($target !== '') {
                $location = $target;
            }
        }
        return $location;
    }

    /**
     * Merges Set-Cookie headers of the last response into the cookie jar and
     * renders the jar as a Cookie request-header value.
     *
     * Cookie attributes (path, expires, ...) are ignored.
     *
     * @return string "name=value; name2=value2", or '' when the jar is empty.
     */
    private function getCookie() {
        foreach ($this->getHeaderValue('Set-Cookie') as $pair) {
            // Split at the FIRST '=': cookie values may themselves contain '='.
            $parts = explode('=', $pair, 2);
            if (count($parts) !== 2) {
                continue;
            }
            $name = trim($parts[0]);
            if ($name === '') {
                continue;
            }
            $this->cookies[$name] = trim($parts[1]);
        }
        $rendered = array();
        foreach ($this->cookies as $name => $value) {
            $rendered[] = "{$name}={$value}";
        }
        return implode('; ', $rendered);
    }

    /**
     * Builds the stream context (method, headers, optional body) for a request.
     *
     * @param string            $method HTTP method.
     * @param array|string|null $data   Request body; arrays are form-encoded.
     *                                  Empty values send no body.
     * @return resource Stream context for file_get_contents().
     */
    private function createStreamContext($method, $data = null) {
        $cookie = $this->getCookie();

        $headers = array();
        if ($cookie !== '') {
            // Do not send an empty "Cookie:" header.
            $headers[] = "Cookie: {$cookie}";
        }
        $headers[] = "User-Agent: {$this->userAgent}";
        $headers[] = 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
        $headers[] = 'Accept-Language: ja-JP-mac,ja;q=0.9,ja-JP;q=0.8,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5,zh-TW;q=0.5,pt-PT;q=0.4,pt;q=0.3,de-DE;q=0.2,de;q=0.2,pt-br;q=0.1';
        // The HTTP stream wrapper reads until the server closes the socket, so
        // asking for keep-alive can stall each request until the timeout.
        $headers[] = 'Connection: close';

        $http = array(
            'method' => $method,
            'request_fulluri' => false,
            // Redirects are followed by request() so that Set-Cookie headers
            // of intermediate responses are not lost.
            'max_redirects' => 0,
        );

        if (!empty($data)) {
            $query = is_array($data) ? http_build_query($data) : (string)$data;
            $http['content'] = $query;
            $headers[] = 'Content-Type: application/x-www-form-urlencoded';
            $headers[] = 'Content-Length: ' . strlen($query);
        }

        $http['header'] = implode("\r\n", $headers) . "\r\n";

        return stream_context_create(array('http' => $http));
    }

    /**
     * Turns $url into an absolute URL and remembers its scheme, host and
     * directory as the base for later relative URLs.
     *
     * Handles absolute ("http://..."), protocol-relative ("//host/..."),
     * root-relative ("/path") and document-relative ("page.html") forms.
     * Dot segments ("./", "../") are passed through unchanged.
     *
     * @param string $url
     * @return string Absolute URL.
     */
    private function formatUrl($url) {
        if (!preg_match('~^https?://~i', $url)) {
            $origin = $this->lastProtocol . '://' . $this->lastDomain;
            if (strpos($url, '//') === 0) {
                $url = $this->lastProtocol . ':' . $url;
            } else if (strpos($url, '/') === 0) {
                $url = $origin . $url;
            } else {
                $url = $origin . $this->lastPath . '/' . $url;
            }
        }
        if (preg_match('~^(https?)://([^/?#]+)([^?#]*)~i', $url, $found)) {
            $this->lastProtocol = $found[1];
            $this->lastDomain = $found[2];
            // Keep only the directory: relative links resolve against the
            // folder of the current document, not against its file name.
            $lastSlash = strrpos($found[3], '/');
            $this->lastPath = ($lastSlash === false) ? '' : substr($found[3], 0, $lastSlash);
        }
        return $url;
    }

    /**
     * Writes the most recent PHP error to the error log when it was raised
     * from this file, except for the expected "Redirection limit reached"
     * warning that max_redirects=0 provokes on every redirect response.
     */
    private function checkError() {
        $error = error_get_last();
        if (empty($error)) {
            return;
        }
        if ($error['file'] != __FILE__) {
            return;
        }
        if ($error['type'] == E_WARNING && preg_match('#Redirection limit reached#i', $error['message']) !== 0) {
            return;
        }
        error_log(date('Y-m-d H:i:s').' ====================================================================================================');
        error_log($error['type']);
        error_log($error['file']);
        error_log($error['line']);
        error_log($error['message']);
        error_log('');
    }

    /**
     * Performs one HTTP round trip and records the response headers.
     *
     * @param string   $url     Absolute or relative URL.
     * @param resource $context Stream context from createStreamContext().
     * @return string|false Response body, or false on failure.
     */
    private function fetch($url, $context) {
        $url = $this->formatUrl($url);
        if (function_exists('error_clear_last')) {
            // Without this, a failure from an earlier request would be
            // logged again after every later (successful) request.
            error_clear_last();
        }
        $body = @file_get_contents($url, false, $context);
        $this->checkError();
        // $http_response_header is populated by the HTTP wrapper in this scope.
        $this->headerArray = empty($http_response_header) ? array() : $http_response_header;
        return $body;
    }

    /**
     * Sends the request, follows redirects with GET (at most MAX_REDIRECTS
     * of them) and optionally parses the final body.
     *
     * @param string   $url
     * @param resource $context Stream context for the first request.
     * @param string   $encode  Character encoding of the body, handed to Dommer.
     * @param bool     $parse   When true, a non-empty body is wrapped in a Dommer.
     * @return Dommer|string|false
     */
    private function request($url, $context, $encode, $parse) {
        $ret = $this->fetch($url, $context);
        $hops = 0;
        while (($location = $this->getRedirect($ret)) !== false) {
            if ($hops >= self::MAX_REDIRECTS) {
                // Guard against redirect loops, which previously recursed forever.
                error_log('Crawler: giving up after ' . self::MAX_REDIRECTS . " redirects (next target: {$location})");
                break;
            }
            $hops++;
            // A fresh context picks up cookies set by the redirecting response.
            $ret = $this->fetch($location, $this->createStreamContext('GET'));
        }
        if ($parse && $ret) {
            $ret = new Dommer($ret, 'text', $encode);
        }
        return $ret;
    }

}
105 changes: 105 additions & 0 deletions Dommer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
<?php
/**
 * Thin wrapper that cleans markup with tidy, loads it into SimpleXML and
 * offers XPath helpers that transparently cope with a default namespace.
 *
 * When the document could not be parsed, the query methods return
 * false / '' instead of failing with a fatal error.
 */
class Dommer {

    /** @var tidy|null Tidy instance used to repair the markup. */
    private $tidy = null;
    /** @var SimpleXMLElement|false|null Parsed document (false when parsing failed). */
    private $sxe = null;
    /** @var string|null XPath prefix including the colon ("dommer:"), or null without namespace. */
    private $ns = null;

    /**
     * __construct
     *
     * @param string|SimpleXMLElement|null $xml    Markup text, or an element when $type is 'sxe'.
     * @param string                       $type   'text' or 'sxe'; other values load nothing.
     * @param string|null                  $encode Source encoding of the markup (default utf8).
     */
    public function __construct($xml = null, $type = 'text', $encode = null) {
        $this->tidy = new tidy();
        if (is_null($xml)) {
            return;
        }
        if ($type == 'text') {
            $this->initByText($xml, $encode);
        } else if ($type == 'sxe') {
            $this->initBySxe($xml, $encode);
        }
    }

    /**
     * Serializes the whole document.
     *
     * @param string|null $path Currently ignored (kept for compatibility);
     *                          use html($path) to serialize a subset.
     * @return string|false XML text, or false when no document is loaded.
     */
    public function asXML($path = null) {
        if (!$this->hasDocument()) {
            return false;
        }
        return $this->sxe->asXML();
    }

    /**
     * Runs an XPath query against the document.
     *
     * When the document declares a namespace, element steps outside of
     * predicates ("/name", "//name", "/*") are prefixed automatically, so
     * callers can write plain paths such as "//div/p". Steps that cannot
     * carry an element prefix are left alone: attributes ("@href"),
     * functions / node tests ("text()"), "." / "..", axes ("child::x") and
     * names that already have a prefix. Predicate contents are never rewritten.
     *
     * @param string $path XPath expression.
     * @return array|false|null Result of SimpleXMLElement::xpath(), or false
     *                          when no document is loaded.
     */
    public function xpath($path) {
        if (!$this->hasDocument()) {
            return false;
        }
        $pattern = $path;
        if (!is_null($this->ns)) {
            // Splitting keeps predicates at the odd indexes; this replaces the
            // former placeholder scheme, where "DUMMY1" also hit "DUMMY10".
            $parts = preg_split('|(\[.*?\])|', $pattern, -1, PREG_SPLIT_DELIM_CAPTURE);
            foreach ($parts as $index => $part) {
                if ($index % 2 === 1) {
                    continue;
                }
                $parts[$index] = preg_replace(
                    '#(/{1,2})(?=[A-Za-z_*])(?![\w.-]*(?:\(|:))#',
                    '${1}' . $this->ns,
                    $part
                );
            }
            $pattern = implode('', $parts);
        }
        return $this->sxe->xpath($pattern);
    }

    /**
     * Serializes the nodes matched by $path, concatenated in document order.
     * Without a path, the children of the root element are serialized.
     *
     * @param string|null $path XPath expression.
     * @return string Markup, or '' when nothing matched or nothing is loaded.
     */
    public function html($path = null) {
        $nodes = $this->sxe;
        if (!empty($path)) {
            $nodes = $this->xpath($path);
        }
        return $this->serialize($nodes);
    }

    /**
     * Serializes the child ELEMENTS of the nodes matched by $path.
     * Text directly inside the matched nodes is not included.
     *
     * @param string $path XPath expression.
     * @return string Markup, or '' when nothing matched or nothing is loaded.
     */
    public function innerHtml($path) {
        return $this->serialize($this->xpath($path.'/*'));
    }

    /**
     * Tells whether a document was parsed successfully.
     *
     * @return bool
     */
    private function hasDocument() {
        return $this->sxe instanceof SimpleXMLElement;
    }

    /**
     * Concatenates asXML() of every node in $nodes.
     *
     * @param mixed $nodes Array or SimpleXMLElement; anything else yields ''.
     * @return string
     */
    private function serialize($nodes) {
        if (!is_array($nodes) && !($nodes instanceof Traversable)) {
            return '';
        }
        $markup = '';
        foreach ($nodes as $node) {
            $markup .= $node->asXML();
        }
        return $markup;
    }

    /**
     * Converts the text to UTF-8 and lets tidy repair it.
     *
     * @param string      $text   Markup.
     * @param string|null $encode Source encoding (default utf8).
     * @return string Repaired markup.
     */
    private function shape($text, $encode = null) {
        $config = array(
            'indent' => true,
            'input-xml' => true,
            'wrap' => 200,
        );
        if (is_null($encode)) {
            $encode = 'utf8';
        }
        // Treat "UTF-8", "utf_8", "Utf8", ... alike so UTF-8 input is not re-converted.
        $normalized = strtolower(str_replace(array('-', '_'), '', $encode));
        if ($normalized != 'utf8') {
            $text = mb_convert_encoding($text, 'UTF-8', $encode);
        }
        // &nbsp; is not a predefined XML entity and would break the parser.
        // The ampersand is required so the plain word "nbsp" is left intact.
        $text = preg_replace('/&nbsp;?/', ' ', $text);
        $this->tidy->parseString($text, $config, 'utf8');
        $this->tidy->cleanRepair();
        return (string)$this->tidy->value;
    }

    /**
     * Registers the first namespace of the root element under the prefix
     * "dommer" so that xpath() can address namespaced elements.
     */
    private function setNamespace() {
        $this->ns = null;
        if (!$this->hasDocument()) {
            return;
        }
        $nsArray = $this->sxe->getNamespaces();
        $ns = array_shift($nsArray);
        if (!is_null($ns) && $ns !== '') {
            $this->sxe->registerXPathNamespace('dommer', $ns);
            $this->ns = 'dommer:';
        }
    }

    /**
     * Shapes the markup, parses it and prepares namespace handling.
     *
     * @param string      $text
     * @param string|null $encode
     */
    private function load($text, $encode) {
        $this->sxe = simplexml_load_string($this->shape($text, $encode));
        $this->setNamespace();
    }

    /**
     * Initializes the document from markup text.
     *
     * @param string      $xml
     * @param string|null $encode Source encoding.
     */
    private function initByText($xml, $encode = null) {
        $this->load($xml, $encode);
    }

    /**
     * Initializes the document from an existing SimpleXMLElement, which is
     * serialized and re-parsed.
     *
     * @param SimpleXMLElement $xml
     * @param string|null      $encode Source encoding.
     */
    private function initBySxe($xml, $encode = null) {
        $this->load($xml->asXML(), $encode);
    }

}

0 comments on commit f3023fb

Please sign in to comment.