Skip to content

Commit

Permalink
v6.1-a2 in dev: crawler lane
Browse files Browse the repository at this point in the history
  • Loading branch information
Hai Zheng committed Dec 5, 2023
1 parent 89fc3e5 commit c2bfe71
Showing 1 changed file with 69 additions and 0 deletions.
69 changes: 69 additions & 0 deletions src/crawler.cls.php
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,22 @@ public static function start($force = false)
private function _crawl_data($force)
{
self::debug('......crawler started......');
if ($force) {
// Log pid to prevent from multi running
if (!defined('LITESPEED_LANE_HASH')) {
define('LITESPEED_LANE_HASH', Str::rrand(8));
}
if (!$this->_check_valid_lane()) {
// Take over lane
$this->_take_over_lane();
}
} else {
if ($this->_check_valid_lane()) {
// Take over lane
$this->_take_over_lane();
}
}

// for the first time running
if (!$this->_summary || !Data::cls()->tb_exist('crawler') || !Data::cls()->tb_exist('crawler_blacklist')) {
$this->cls('Crawler_Map')->gen();
Expand All @@ -294,6 +310,7 @@ private function _crawl_data($force)
if (!$force && time() - $last_fnished_at < $this->conf(Base::O_CRAWLER_CRAWL_INTERVAL)) {
self::debug('Cron abort: cache warmed already.');
// if not reach whole crawling interval, exit
$this->_release_lane();
return;
}
self::debug('TouchedEnd. regenerate sitemap....');
Expand All @@ -310,6 +327,7 @@ private function _crawl_data($force)
if ($this->_summary['curr_crawler'] >= count($this->_crawlers)) {
$this->_end_reason = 'end';
$this->_terminate_running();
$this->_release_lane();
return;
}

Expand All @@ -322,6 +340,8 @@ private function _crawl_data($force)
$this->load_conf();

$this->_engine_start();

$this->_release_lane();
}

/**
Expand Down Expand Up @@ -557,6 +577,47 @@ private function _prepare_running()
self::save_summary();
}

/**
 * Take over lane
 *
 * Writes this process's lane hash into the lane file (`json_path() . '.pid'`),
 * marking the lane as owned by the current run.
 *
 * The hash constant is created here when it is missing: callers can reach this
 * method without having defined LITESPEED_LANE_HASH, and reading an undefined
 * constant throws an Error on PHP 8 (and stores the literal constant name on
 * PHP 7, which makes every process look like the lane owner).
 *
 * @since 6.1
 */
private function _take_over_lane()
{
    if (!defined('LITESPEED_LANE_HASH')) {
        // Same generator the crawler uses when it defines the hash itself.
        define('LITESPEED_LANE_HASH', Str::rrand(8));
    }

    file::save($this->json_path() . '.pid', LITESPEED_LANE_HASH);
}

/**
 * Refresh the lane file
 *
 * Calls touch() on the lane file (`json_path() . '.pid'`) so that its
 * modification time reflects the most recent crawler activity.
 *
 * @since 6.1
 */
private function _touch_lane()
{
    $lane_file = $this->json_path() . '.pid';

    touch($lane_file);
}

/**
 * Release lane file
 *
 * Deletes the lane file (`json_path() . '.pid'`). Does nothing when the file is
 * already gone, so releasing a lane twice, or after another run removed the
 * file, no longer triggers an unlink() warning.
 *
 * @since 6.1
 */
private function _release_lane()
{
    $lane_file = $this->json_path() . '.pid';

    // unlink() emits E_WARNING on a missing path; skip when there is nothing to remove.
    if (!file_exists($lane_file)) {
        return;
    }

    unlink($lane_file);
}

/**
 * Check if lane is used by other crawlers
 *
 * Reads the hash stored in the lane file (`json_path() . '.pid'`) and compares
 * it with this process's LITESPEED_LANE_HASH.
 *
 * @since 6.1
 * @return bool False when the lane file holds a hash that is not ours; true when
 *              the stored value is falsy or matches our hash.
 */
private function _check_valid_lane()
{
    // Check lane hash
    $pid = file::read($this->json_path() . '.pid');

    // Falsy read result (presumably a missing or empty lane file): lane is free.
    if (!$pid) {
        return true;
    }

    // No hash was generated for this process, so the stored one cannot be ours.
    // Guarding here also avoids the PHP 8 Error for reading an undefined constant.
    if (!defined('LITESPEED_LANE_HASH')) {
        return false;
    }

    // Strict string comparison: loose `!=` treats numeric-looking hashes such as
    // "000001e2" and "00000100" as equal, which would wrongly validate a foreign lane.
    return (string) LITESPEED_LANE_HASH === (string) $pid;
}

/**
* Run crawler
*
Expand All @@ -573,6 +634,14 @@ private function _do_running()
$urlChunks = array_chunk($urlChunks, $this->_cur_threads);
// self::debug('$urlChunks after array_chunk: ' . count($urlChunks));
foreach ($urlChunks as $rows) {
if (!$this->_check_valid_lane()) {
$this->_end_reason = 'lane_invalid';
self::debug('The crawler lane is used by newer crawler.');
return;
}
// Update time
$this->_touch_lane();

// self::debug('chunk fetching count($rows)= ' . count($rows));
// multi curl
$rets = $this->_multi_request($rows, $options);
Expand Down

0 comments on commit c2bfe71

Please sign in to comment.