From 933287a33dd124b49e5c76e5b29ea8a99b928751 Mon Sep 17 00:00:00 2001
From: Hai Zheng
Date: Tue, 5 Dec 2023 17:52:24 -0500
Subject: [PATCH] v6.1-a2 in dev: crawler lane

---
 src/crawler.cls.php | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/src/crawler.cls.php b/src/crawler.cls.php
index 83e9a9a1e..e0af8fba0 100644
--- a/src/crawler.cls.php
+++ b/src/crawler.cls.php
@@ -282,6 +282,21 @@ public static function start($force = false)
 	private function _crawl_data($force)
 	{
 		self::debug('......crawler started......');
+		// Every run, forced or not, needs its own lane hash: both _check_valid_lane()
+		// and _take_over_lane() read LITESPEED_LANE_HASH.
+		if (!defined('LITESPEED_LANE_HASH')) {
+			define('LITESPEED_LANE_HASH', Str::rrand(8));
+		}
+
+		if (!$force && !$this->_check_valid_lane()) {
+			// The lane is held by another crawler; only a forced run may take it over
+			self::debug('Crawler abort: lane is used by another crawler.');
+			return;
+		}
+
+		// The lane is free, already ours, abandoned, or forcibly taken from its previous owner
+		$this->_take_over_lane();
+
 		// for the first time running
 		if (!$this->_summary || !Data::cls()->tb_exist('crawler') || !Data::cls()->tb_exist('crawler_blacklist')) {
 			$this->cls('Crawler_Map')->gen();
@@ -294,6 +309,7 @@ private function _crawl_data($force)
 			if (!$force && time() - $last_fnished_at < $this->conf(Base::O_CRAWLER_CRAWL_INTERVAL)) {
 				self::debug('Cron abort: cache warmed already.');
 				// if not reach whole crawling interval, exit
+				$this->_release_lane();
 				return;
 			}
 			self::debug('TouchedEnd. regenerate sitemap....');
@@ -310,6 +326,7 @@ private function _crawl_data($force)
 		if ($this->_summary['curr_crawler'] >= count($this->_crawlers)) {
 			$this->_end_reason = 'end';
 			$this->_terminate_running();
+			$this->_release_lane();
 			return;
 		}
 
@@ -322,6 +339,8 @@ private function _crawl_data($force)
 		$this->load_conf();
 
 		$this->_engine_start();
+
+		$this->_release_lane();
 	}
 
 	/**
@@ -557,6 +576,58 @@ private function _prepare_running()
 		self::save_summary();
 	}
 
+	/**
+	 * Take over lane by writing this run's lane hash into the lane file
+	 * @since 6.1
+	 */
+	private function _take_over_lane()
+	{
+		file::save($this->json_path() . '.pid', LITESPEED_LANE_HASH);
+	}
+
+	/**
+	 * Update lane file modification time
+	 * @since 6.1
+	 */
+	private function _touch_lane()
+	{
+		touch($this->json_path() . '.pid');
+	}
+
+	/**
+	 * Release lane file, but only while the lane is still usable by this run
+	 * @since 6.1
+	 */
+	private function _release_lane()
+	{
+		$lane_file = $this->json_path() . '.pid';
+		// Nothing to release, or a newer crawler took the lane over and now owns the file
+		if (!is_file($lane_file) || !$this->_check_valid_lane()) {
+			return;
+		}
+		unlink($lane_file);
+	}
+
+	/**
+	 * Check if lane is free, owned by this run, or abandoned by a dead crawler
+	 * @since 6.1
+	 * @return bool False while another live crawler holds the lane
+	 */
+	private function _check_valid_lane()
+	{
+		// Check lane hash
+		$lane_file = $this->json_path() . '.pid';
+		$pid = file::read($lane_file);
+		// Compare as strings: loose `!=` treats numeric-looking hashes such as '1e3' and '1000' as equal
+		if (!$pid || (string) LITESPEED_LANE_HASH === (string) $pid) {
+			return true;
+		}
+		// The owner touches the file before every chunk, so one left untouched for an hour
+		// belongs to a crawler that died without releasing it
+		$touched_at = is_file($lane_file) ? filemtime($lane_file) : false;
+		return false !== $touched_at && time() - $touched_at > 3600;
+	}
+
 	/**
 	 * Run crawler
 	 *
@@ -573,6 +644,14 @@ private function _do_running()
 			$urlChunks = array_chunk($urlChunks, $this->_cur_threads);
 			// self::debug('$urlChunks after array_chunk: ' . count($urlChunks));
 			foreach ($urlChunks as $rows) {
+				if (!$this->_check_valid_lane()) {
+					$this->_end_reason = 'lane_invalid';
+					self::debug('The crawler lane is used by newer crawler.');
+					return;
+				}
+				// Update time
+				$this->_touch_lane();
+
 				// self::debug('chunk fetching count($rows)= ' . count($rows));
 				// multi curl
 				$rets = $this->_multi_request($rows, $options);