diff --git a/src/Command/ScrapCommand.php b/src/Command/ScrapCommand.php
new file mode 100644
index 0000000..757b2e4
--- /dev/null
+++ b/src/Command/ScrapCommand.php
@@ -0,0 +1,80 @@
+<?php
+
+namespace App\Command;
+
+use App\IpStrategy\FlattenIpStrategy;
+use App\Job\ScrapUrlJob;
+use App\Job\ScrapUrlsJob;
+use Flow\Driver\FiberDriver;
+use Flow\Flow\Flow;
+use Flow\Ip;
+use Symfony\Component\Console\Attribute\AsCommand;
+use Symfony\Component\Console\Command\Command;
+use Symfony\Component\Console\Input\InputInterface;
+use Symfony\Component\Console\Output\OutputInterface;
+use Symfony\Component\Console\Style\SymfonyStyle;
+
+#[AsCommand(name: 'app:scrap')]
+class ScrapCommand extends Command
+{
+    protected function execute(InputInterface $input, OutputInterface $output): int
+    {
+        $io = new SymfonyStyle($input, $output);
+
+        $driver = new FiberDriver();
+
+        $flow = Flow::do(static function () use ($io) {
+            // yield [new ScrapUrlsJob()];
+            // yield static function ($responses) use ($io) {
+            //     $io->writeln(sprintf('Finished scraping %d urls', count($responses)));
+            // };
+            yield [new ScrapUrlJob(), null, new FlattenIpStrategy()];
+            yield static function ($data) {
+                dump($data['url']);
+                // $io->writeln(sprintf('Finished scraping %d urls', count($responses)));
+            };
+        }, ['driver' => $driver]);
+
+        $urls = [
+            'https://www.google.fr',
+            'https://www.apple.com',
+            'https://www.microsoft.com',
+            'https://www.amazon.com',
+            'https://www.facebook.com',
+            'https://www.netflix.com',
+            'https://www.spotify.com',
+            'https://www.wikipedia.org',
+            'https://www.x.com',
+            'https://www.instagram.com',
+            'https://www.linkedin.com',
+            'https://www.reddit.com',
+            'https://www.ebay.com',
+            'https://www.cnn.com',
+            'https://www.bbc.co.uk',
+            'https://www.yahoo.com',
+            'https://www.bing.com',
+            'https://www.pinterest.com',
+            'https://www.tumblr.com',
+            'https://www.paypal.com',
+            'https://www.dropbox.com',
+            'https://www.adobe.com',
+            'https://www.salesforce.com',
+        ];
+
+        $flow(new Ip($urls));
+
+        $flow->await();
+
+        $io->success('Scraping is done.');
+
+        return Command::SUCCESS;
+    }
+}
diff --git a/src/IpStrategy/FlattenIpStrategy.php b/src/IpStrategy/FlattenIpStrategy.php
new file mode 100644
index 0000000..f9dae7b
--- /dev/null
+++ b/src/IpStrategy/FlattenIpStrategy.php
@@ -0,0 +1,71 @@
+<?php
+
+namespace App\IpStrategy;
+
+use Flow\Event;
+use Flow\Event\PoolEvent;
+use Flow\Event\PullEvent;
+use Flow\Event\PushEvent;
+use Flow\Ip;
+use Flow\IpPool;
+use Flow\IpStrategyInterface;
+use LogicException;
+
+/**
+ * @template T
+ *
+ * @implements IpStrategyInterface<T>
+ */
+class FlattenIpStrategy implements IpStrategyInterface
+{
+    /**
+     * @var IpPool<T>
+     */
+    private IpPool $ipPool;
+
+    public function __construct()
+    {
+        $this->ipPool = new IpPool();
+    }
+
+    public static function getSubscribedEvents(): array
+    {
+        return [
+            Event::PUSH => 'push',
+            Event::PULL => 'pull',
+            Event::POOL => 'pool',
+        ];
+    }
+
+    /**
+     * @param PushEvent<T> $event
+     */
+    public function push(PushEvent $event): void
+    {
+        $ip = $event->getIp();
+        if (!is_iterable($ip->data)) {
+            throw new LogicException('Ip data must be iterable');
+        }
+        foreach ($ip->data as $data) {
+            $this->ipPool->addIp(new Ip($data));
+        }
+    }
+
+    /**
+     * @param PullEvent<T> $event
+     */
+    public function pull(PullEvent $event): void
+    {
+        $ip = $this->ipPool->shiftIp();
+        if ($ip !== null) {
+            $event->addIp($ip);
+        }
+    }
+
+    public function pool(PoolEvent $event): void
+    {
+        $event->addIps($this->ipPool->getIps());
+    }
+}
diff --git a/src/Job/ScrapUrlJob.php b/src/Job/ScrapUrlJob.php
new file mode 100644
index 0000000..26d4244
--- /dev/null
+++ b/src/Job/ScrapUrlJob.php
@@ -0,0 +1,62 @@
+<?php
+
+namespace App\Job;
+
+use CurlMultiHandle;
+use Fiber;
+use Flow\JobInterface;
+
+/**
+ * @implements JobInterface<string, array>
+ */
+class ScrapUrlJob implements JobInterface
+{
+    private CurlMultiHandle $mh;
+
+    public function __construct()
+    {
+        // Initialize a cURL multi handle
+        $this->mh = curl_multi_init();
+    }
+
+    public function __destruct()
+    {
+        curl_multi_close($this->mh);
+    }
+
+    /**
+     * @param string $url The URL to scrape
+     *
+     * @return array Associative array with URL and its content
+     */
+    public function __invoke($url): array
+    {
+        $ch = curl_init();
+        curl_setopt($ch, CURLOPT_URL, $url);
+        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
+        curl_multi_add_handle($this->mh, $ch);
+
+        do {
+            $status = curl_multi_exec($this->mh, $active);
+
+            // Hand control back to the driver so other fibers can make
+            // progress while this transfer is in flight.
+            Fiber::suspend();
+
+            $info = curl_multi_info_read($this->mh);
+        } while (
+            $active && $status === CURLM_OK // check curl_multi is active
+            && !($info !== false && $info['handle'] === $ch && $info['result'] === CURLE_OK) // check $ch is done
+        );
+
+        $content = curl_multi_getcontent($ch);
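+        // The transfer for $ch has completed and its body has been read,
+        // so the easy handle can be detached from the multi handle and freed.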
+        curl_multi_remove_handle($this->mh, $ch);
+        curl_close($ch);
+
+        return [
+            'url' => $url,
+            'content' => $content,
+        ];
+    }
+}
diff --git a/src/Job/ScrapUrlsJob.php b/src/Job/ScrapUrlsJob.php
new file mode 100644
index 0000000..a52e2d4
--- /dev/null
+++ b/src/Job/ScrapUrlsJob.php
@@ -0,0 +1,57 @@
+<?php
+
+namespace App\Job;
+
+use Fiber;
+use Flow\JobInterface;
+
+/**
+ * @implements JobInterface<list<string>, array>
+ */
+class ScrapUrlsJob implements JobInterface
+{
+    public function __invoke($urls): array
+    {
+        // Initialize a cURL multi handle
+        $mh = curl_multi_init();
+
+        // Array to hold individual cURL handles
+        $curl_handles = [];
+
+        // Initialize individual cURL handles and add them to the multi handle
+        foreach ($urls as $url) {
+            $ch = curl_init();
+            curl_setopt($ch, CURLOPT_URL, $url);
+            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
+            curl_multi_add_handle($mh, $ch);
+            $curl_handles[] = [$ch, $url];
+        }
+
+        // Execute the multi handle, suspending between polls so the
+        // driver can resume other fibers while the transfers run
+        $running = null;
+        do {
+            curl_multi_exec($mh, $running);
+
+            Fiber::suspend();
+        } while ($running > 0);
+
+        // Collect the content from each handle
+        $responses = [];
+        foreach ($curl_handles as $curl_handle) {
+            [$ch, $url] = $curl_handle;
+            $responses[] = [
+                'url' => $url,
+                'content' => curl_multi_getcontent($ch),
+            ];
+            curl_multi_remove_handle($mh, $ch);
+            curl_close($ch);
+        }
+
+        // Close the multi handle
+        curl_multi_close($mh);
+
+        return $responses;
+    }
+}
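
For reference, the new pipeline can also be driven outside the Symfony command. This is a minimal sketch, assuming only the Flow API already used above (Flow::do, FiberDriver, Ip); the printf reporting is illustrative, not part of the patch:

    use App\IpStrategy\FlattenIpStrategy;
    use App\Job\ScrapUrlJob;
    use Flow\Driver\FiberDriver;
    use Flow\Flow\Flow;
    use Flow\Ip;

    // FlattenIpStrategy fans the single Ip carrying the url array out into
    // one Ip per url, so each ScrapUrlJob transfer runs in its own fiber.
    $flow = Flow::do(static function () {
        yield [new ScrapUrlJob(), null, new FlattenIpStrategy()];
        yield static function (array $data): void {
            printf("%s (%d bytes)\n", $data['url'], strlen((string) $data['content']));
        };
    }, ['driver' => new FiberDriver()]);

    $flow(new Ip(['https://www.google.fr', 'https://www.wikipedia.org']));
    $flow->await();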