<?php

// Reconstructed from a whitespace-collapsed git diff that added two new
// files, Crawler.php and Dommer.php. Both classes are kept together here in
// the order the diff listed them.
//
// NOTE(review): in the extracted text the opening of each class (everything
// from the PHP open tag up to the first "$this->" inside the constructor) was
// missing. The property declarations and constructor signatures below are
// inferred from how the members are used in the surviving code. Confirm them
// against the real files before merging.

// ---------------------------------------------------------------------------
// Crawler.php  (diff header: new file mode 100644, index 0000000..8c1f3fe)
// ---------------------------------------------------------------------------

/**
 * Small cookie-aware HTTP client built on PHP's http:// stream wrapper.
 *
 * Redirects are followed manually (HTTP 3xx "Location" headers and HTML
 * meta-refresh tags) so that cookies set by intermediate responses can be
 * collected. Response bodies are optionally wrapped in a Dommer instance.
 *
 * NOTE(review): cookies are kept in one flat jar and are sent to every host
 * that is requested; they are not scoped by domain or path.
 */
class Crawler {

    /** Upper bound on redirects followed for a single get()/post() call. */
    const MAX_REDIRECTS = 10;

    /** @var array Cookie jar, name => value. */
    private $cookies = array();

    /** @var string Value of the User-Agent request header. */
    private $userAgent;

    /** @var array Raw header lines of the most recent response. */
    private $headerArray = array();

    /** @var string Scheme of the last requested URL ("http" or "https"). */
    private $lastProtocol = '';

    /** @var string Authority (host and optional port) of the last requested URL. */
    private $lastDomain = '';

    /** @var string Directory part of the last requested URL path, without trailing slash. */
    private $lastPath = '';

    /**
     * @param string $userAgent User-Agent header value to send.
     *                          NOTE(review): the original default (if any)
     *                          was lost in extraction; '' is a placeholder.
     */
    public function __construct($userAgent = '') {
        $this->initCookies();
        $this->userAgent = $userAgent;
    }

    /**
     * Empties the cookie jar.
     */
    public function initCookies() {
        $this->cookies = array();
    }

    /**
     * @param string $userAgent User-Agent header value for later requests.
     */
    public function setUserAgent($userAgent) {
        $this->userAgent = $userAgent;
    }

    /**
     * Performs a GET request.
     *
     * @param string $url    Absolute URL, or a URL relative to the previous request.
     * @param string $encode Character encoding of the response body.
     * @param bool   $parse  When true a non-empty body is returned as a Dommer.
     * @return Dommer|string|false Parsed document, raw body, or false on failure.
     */
    public function get($url, $encode = 'utf8', $parse = true) {
        $sc = $this->createStreamContext('GET');
        return $this->request($url, $sc, $encode, $parse);
    }

    /**
     * Performs a POST request with a form-urlencoded body.
     *
     * @param string            $url    Absolute URL, or a URL relative to the previous request.
     * @param string            $encode Character encoding of the response body.
     * @param array|string|null $data   Form fields, or an already encoded body.
     * @param bool              $parse  When true a non-empty body is returned as a Dommer.
     * @return Dommer|string|false Parsed document, raw body, or false on failure.
     */
    public function post($url, $encode = 'utf8', $data = null, $parse = true) {
        $sc = $this->createStreamContext('POST', $data);
        return $this->request($url, $sc, $encode, $parse);
    }

    /**
     * Collects the values of every response header named $key.
     *
     * Header names are compared case-insensitively and the name is escaped
     * before being placed in the pattern.
     *
     * @param string $key            Header name, e.g. "Set-Cookie".
     * @param bool   $untilSemicolon When true only the text before the first
     *                               ";" is returned (the name=value part of a
     *                               Set-Cookie line). Pass false for headers
     *                               such as Location whose value may contain ";".
     * @return array List of header values, possibly empty.
     */
    private function getHeaderValue($key, $untilSemicolon = true) {
        $ret = array();
        if (empty($this->headerArray)) {
            return $ret;
        }
        $value = $untilSemicolon ? '([^;]*)' : '(.*)';
        $pattern = '/^' . preg_quote($key, '/') . ':\s*' . $value . '/i';
        foreach ($this->headerArray as $header) {
            if (preg_match($pattern, $header, $matches)) {
                $ret[] = trim($matches[1]);
            }
        }
        return $ret;
    }

    /**
     * Returns the numeric status of the last status line in the response
     * headers, or 0 when no status line is present.
     *
     * @return int
     */
    private function getStatusCode() {
        $status = 0;
        if (!empty($this->headerArray)) {
            foreach ($this->headerArray as $header) {
                // Accept any protocol version ("HTTP/1.0", "HTTP/1.1", "HTTP/2").
                if (preg_match('#^HTTP/\d+(?:\.\d+)?\s+(\d{3})#i', $header, $matches)) {
                    $status = (int) $matches[1];
                }
            }
        }
        return $status;
    }

    /**
     * Extracts the target of an HTML meta-refresh tag.
     *
     * Only meta tags carrying http-equiv="refresh" are considered, so other
     * meta tags whose content happens to contain "url=" are ignored.
     *
     * NOTE(review): the original pattern lost its beginning in extraction and
     * was rebuilt here; verify it against the real file.
     *
     * @param string $body Response body.
     * @return string|false Refresh target, or false when there is none.
     */
    private function getMetaRefresh($body) {
        if (!preg_match_all('#<meta\b[^>]*>#is', $body, $tags)) {
            return false;
        }
        foreach ($tags[0] as $tag) {
            if (!preg_match('#http-equiv\s*=\s*["\']?refresh#i', $tag)) {
                continue;
            }
            if (preg_match('#content\s*=\s*["\'][^"\'>]*?url\s*=\s*["\']?([^"\'>\s]+)#i', $tag, $match)) {
                // Attribute values are entity-encoded in HTML ("&amp;" etc.).
                return html_entity_decode($match[1], ENT_QUOTES, 'UTF-8');
            }
        }
        return false;
    }

    /**
     * Determines where the last response wants the client to go next.
     *
     * A Location header on a 3xx response wins; otherwise the body is
     * searched for a meta-refresh tag.
     *
     * @param string|false $body Response body (false when the request failed).
     * @return string|false Redirect target, or false when there is none.
     */
    private function getRedirect($body) {
        $status = $this->getStatusCode();
        if (in_array($status, array(301, 302, 303, 307, 308), true)) {
            $locations = $this->getHeaderValue('Location', false);
            if (!empty($locations) && $locations[0] !== '') {
                return $locations[0];
            }
        }
        if (!is_string($body) || $body === '') {
            return false;
        }
        return $this->getMetaRefresh($body);
    }

    /**
     * Copies the cookies of the most recent response into the cookie jar.
     *
     * Each Set-Cookie value is split at its FIRST "=", so cookie values that
     * themselves contain "=" (base64 padding, nested key=value) stay intact.
     */
    private function storeResponseCookies() {
        foreach ($this->getHeaderValue('Set-Cookie') as $cookie) {
            $pair = explode('=', $cookie, 2);
            if (count($pair) !== 2) {
                continue;
            }
            $name = trim($pair[0]);
            if ($name === '') {
                continue;
            }
            $this->cookies[$name] = trim($pair[1]);
        }
    }

    /**
     * Serialises the cookie jar for a Cookie request header.
     *
     * @return string "name=value; name2=value2", or '' when the jar is empty.
     */
    private function getCookie() {
        $pairs = array();
        foreach ($this->cookies as $key => $value) {
            $pairs[] = "{$key}={$value}";
        }
        return implode('; ', $pairs);
    }

    /**
     * Builds the stream context for one request.
     *
     * Automatic redirect handling is disabled ("max_redirects" => 0) because
     * request() follows redirects itself.
     *
     * @param string            $method HTTP method.
     * @param array|string|null $data   Request body; arrays are form-encoded.
     * @return resource Stream context.
     */
    private function createStreamContext($method, $data = null) {
        $headers = array();
        $cookie = $this->getCookie();
        if ($cookie !== '') {
            // Do not send an empty "Cookie:" header when the jar is empty.
            $headers[] = "Cookie: {$cookie}";
        }
        if ((string) $this->userAgent !== '') {
            $headers[] = "User-Agent: {$this->userAgent}";
        }
        $headers[] = 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
        $headers[] = 'Accept-Language: ja-JP-mac,ja;q=0.9,ja-JP;q=0.8,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5,zh-TW;q=0.5,pt-PT;q=0.4,pt;q=0.3,de-DE;q=0.2,de;q=0.2,pt-br;q=0.1';
        $headers[] = 'Connection: keep-alive';

        $http = array(
            'method' => $method,
            'request_fulluri' => false,
            'max_redirects' => 0,
        );
        if (!empty($data)) {
            $query = is_array($data) ? http_build_query($data) : (string) $data;
            $http['content'] = $query;
            $headers[] = 'Content-Type: application/x-www-form-urlencoded';
            $headers[] = 'Content-Length: ' . strlen($query);
        }
        $http['header'] = implode("\r\n", $headers) . "\r\n";

        return stream_context_create(array('http' => $http));
    }

    /**
     * Resolves $url against the previously requested URL and records the
     * result as the new base for later relative URLs.
     *
     * Only the directory part of the path is remembered, so a link "b.html"
     * found on "http://host/dir/a.html?x=1" resolves to
     * "http://host/dir/b.html". Dot segments ("../") are not normalised.
     *
     * @param string $url Absolute, protocol-relative, root-relative or relative URL.
     * @return string Absolute URL, or $url unchanged when it is relative and
     *                no earlier request established a base.
     */
    private function formatUrl($url) {
        if (!preg_match('#^https?://#i', $url)) {
            if ($this->lastProtocol === '' || $this->lastDomain === '') {
                return $url;
            }
            if (strpos($url, '//') === 0) {
                $url = $this->lastProtocol . ':' . $url;
            } else if (strpos($url, '/') === 0) {
                $url = $this->lastProtocol . '://' . $this->lastDomain . $url;
            } else {
                $url = $this->lastProtocol . '://' . $this->lastDomain . $this->lastPath . '/' . $url;
            }
        }
        if (preg_match('#^(https?)://([^/?\#]+)([^?\#]*)#i', $url, $matches)) {
            $this->lastProtocol = strtolower($matches[1]);
            $this->lastDomain = $matches[2];
            $slash = strrpos($matches[3], '/');
            $this->lastPath = ($slash === false) ? '' : substr($matches[3], 0, $slash);
        }
        return $url;
    }

    /**
     * Writes the most recent PHP error to the error log when it was raised
     * from this file. "Redirection limit reached" warnings are expected
     * (redirects are followed manually) and are skipped.
     */
    private function checkError() {
        $error = error_get_last();
        if (empty($error)) {
            return;
        }
        if ($error['file'] != __FILE__) {
            return;
        }
        if ($error['type'] == E_WARNING && preg_match('#Redirection limit reached#i', $error['message']) !== 0) {
            return;
        }
        error_log(date('Y-m-d H:i:s').' ====================================================================================================');
        error_log($error['type']);
        error_log($error['file']);
        error_log($error['line']);
        error_log($error['message']);
        error_log('');
    }

    /**
     * Sends one request and follows redirects up to MAX_REDIRECTS hops.
     *
     * Redirects are always followed with GET, including 307/308 responses to
     * a POST.
     *
     * @param string   $url       URL to fetch (may be relative).
     * @param resource $context   Stream context from createStreamContext().
     * @param string   $encode    Character encoding of the response body.
     * @param bool     $parse     When true a non-empty body is wrapped in a Dommer.
     * @param int      $redirects Number of redirects already followed.
     * @return Dommer|string|false Parsed document, raw body, or false on failure.
     */
    private function request($url, $context, $encode, $parse, $redirects = 0) {
        $url = $this->formatUrl($url);
        // Forget errors raised before this call so checkError() cannot log a
        // stale one again (error_clear_last() exists from PHP 7.0).
        if (function_exists('error_clear_last')) {
            error_clear_last();
        }
        $ret = @file_get_contents($url, false, $context);
        $this->checkError();
        $this->headerArray = array();
        if (!empty($http_response_header)) {
            $this->headerArray = $http_response_header;
        }
        // Harvest cookies once per response, before the next context is built.
        $this->storeResponseCookies();

        $location = $this->getRedirect($ret);
        if ($location !== false && $location !== '') {
            if ($redirects < self::MAX_REDIRECTS) {
                $next = $this->createStreamContext('GET');
                return $this->request($location, $next, $encode, $parse, $redirects + 1);
            }
            error_log('Crawler: gave up after '.self::MAX_REDIRECTS.' redirects at '.$url);
        }
        if ($parse && $ret) {
            $ret = new Dommer($ret, 'text', $encode);
        }
        return $ret;
    }

}

// ---------------------------------------------------------------------------
// Dommer.php  (diff header: new file mode 100644, index 0000000..a7ac120)
// ---------------------------------------------------------------------------

/**
 * Wraps a markup document: the text is cleaned with tidy, loaded into
 * SimpleXML, and queried with XPath. When the document root declares a
 * namespace, element steps of a query are prefixed automatically.
 */
class Dommer {

    /** @var tidy Tidy instance used by shape(). */
    private $tidy;

    /** @var SimpleXMLElement|null Parsed document. */
    private $sxe;

    /** @var string|null XPath prefix including the colon ("dommer:"), or null. */
    private $ns = null;

    /**
     * NOTE(review): the parameter defaults were lost in extraction and are
     * inferred from the is_null($xml) check and from Crawler's call
     * new Dommer($ret, 'text', $encode).
     *
     * @param string|SimpleXMLElement|null $xml    Document source.
     * @param string                       $type   'text' for markup text, 'sxe' for a SimpleXMLElement.
     * @param string|null                  $encode Source encoding; null means utf8.
     */
    public function __construct($xml = null, $type = 'text', $encode = null) {
        $this->tidy = new tidy();
        if (!is_null($xml)) {
            if ($type == 'text') {
                $this->initByText($xml, $encode);
            } else if ($type == 'sxe') {
                $this->initBySxe($xml, $encode);
            }
        }
    }

    /**
     * Returns the whole document as XML text.
     *
     * @param string|null $path Accepted but not used by this method.
     * @return string|false
     */
    public function asXML($path = null) {
        return $this->sxe->asXML();
    }

    /**
     * Runs an XPath query against the document.
     *
     * When the root declares a namespace, each element step outside of
     * predicates receives the registered prefix. Steps that are not element
     * names (attributes, "*", ".", "..", node tests such as text(), axes, and
     * names that already carry a prefix) are left untouched, as is the
     * content of predicates.
     *
     * @param string $path XPath expression.
     * @return array|false|null Whatever SimpleXMLElement::xpath() returns.
     */
    public function xpath($path) {
        $pattern = $path;
        if (!is_null($this->ns)) {
            // Odd indexes hold the predicates ("[...]"), even indexes the
            // text between them; only the latter is rewritten.
            $parts = preg_split('|(\[.*?\])|', $pattern, -1, PREG_SPLIT_DELIM_CAPTURE);
            $name = '(?=[A-Za-z_])(?![\w.\-]*\s*[(:])';
            foreach ($parts as $index => $part) {
                if ($index % 2 !== 0) {
                    continue;
                }
                // A leading relative step ("body/div") is prefixed as well.
                $step = ($index === 0) ? '(^|/{1,2})' : '(/{1,2})';
                $parts[$index] = preg_replace('#' . $step . $name . '#', '${1}' . $this->ns, $part);
            }
            $pattern = implode('', $parts);
        }
        return $this->sxe->xpath($pattern);
    }

    /**
     * Concatenates the XML of each node in $nodes.
     *
     * @param mixed $nodes Array or Traversable of SimpleXMLElement; any other
     *                     value (a failed query) yields ''.
     * @return string
     */
    private function joinXml($nodes) {
        $ret = '';
        if (!is_array($nodes) && !($nodes instanceof Traversable)) {
            return $ret;
        }
        foreach ($nodes as $one) {
            $ret .= $one->asXML();
        }
        return $ret;
    }

    /**
     * Returns the markup of the nodes matched by $path. Without a path the
     * child elements of the document root are serialised.
     *
     * @param string|null $path XPath expression.
     * @return string
     */
    public function html($path = null) {
        $sxe = $this->sxe;
        if (!empty($path)) {
            $sxe = $this->xpath($path);
        }
        return $this->joinXml($sxe);
    }

    /**
     * Returns the markup of the child elements of the nodes matched by $path
     * (the query run is $path followed by "/*").
     *
     * @param string $path XPath expression.
     * @return string
     */
    public function innerHtml($path) {
        return $this->joinXml($this->xpath($path.'/*'));
    }

    /**
     * Converts $text to UTF-8 and runs it through tidy.
     *
     * NOTE(review): the nbsp pattern also matches the bare word "nbsp";
     * kept as found because the author's reason for the optional "&" and ";"
     * is not visible here.
     *
     * @param string      $text   Markup text.
     * @param string|null $encode Source encoding; null means utf8.
     * @return string Tidy output.
     */
    private function shape($text, $encode = null) {
        $config = array(
            'indent' => true,
            'input-xml' => true,
            'wrap' => 200,
        );
        if (is_null($encode)) {
            $encode = 'utf8';
        }
        // Treat "utf8", "UTF-8", "utf-8" alike.
        if (strtolower(str_replace('-', '', $encode)) !== 'utf8') {
            $text = mb_convert_encoding($text, 'utf8', $encode);
        }
        $text = preg_replace('/&?nbsp;?/', ' ', $text);
        $this->tidy->parseString($text, $config, 'utf8');
        $this->tidy->cleanRepair();
        $text = $this->tidy->value;
        return $text;
    }

    /**
     * Registers the first namespace reported by getNamespaces() on the root
     * under the prefix "dommer" and remembers "dommer:" for xpath(). Clears
     * any prefix left over from a previous document.
     */
    private function setNamespace() {
        $this->ns = null;
        $nsArray = $this->sxe->getNamespaces();
        $ns = array_shift($nsArray);
        if (!is_null($ns)) {
            $this->ns = 'dommer';
            $this->sxe->registerXPathNamespace($this->ns, $ns);
            $this->ns .= ':';
        }
    }

    /**
     * Loads the document from markup text.
     *
     * @param string      $xml    Markup text.
     * @param string|null $encode Source encoding; null means utf8.
     * @throws RuntimeException When the tidied text is not well-formed XML.
     */
    private function initByText($xml, $encode = null) {
        $xml = $this->shape($xml, $encode);
        $sxe = simplexml_load_string($xml);
        if ($sxe === false) {
            throw new RuntimeException('Dommer: tidy output could not be parsed as XML');
        }
        $this->sxe = $sxe;
        $this->setNamespace();
    }

    /**
     * Loads the document from an existing SimpleXMLElement by serialising it
     * and re-parsing the text.
     *
     * @param SimpleXMLElement $xml    Source element.
     * @param string|null      $encode Encoding of the serialised text; null means utf8.
     * @throws RuntimeException When the tidied text is not well-formed XML.
     */
    private function initBySxe($xml, $encode = null) {
        $this->initByText($xml->asXML(), $encode);
    }

}