Skip to content

Commit

Permalink
✨ Add scrap
Browse files Browse the repository at this point in the history
  • Loading branch information
matyo91 committed Sep 3, 2024
1 parent 22b4dad commit 76e787a
Show file tree
Hide file tree
Showing 4 changed files with 249 additions and 0 deletions.
83 changes: 83 additions & 0 deletions src/Command/ScrapCommand.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
<?php

declare(strict_types=1);

namespace App\Command;

use App\Job\FlowExamples\ScrapJob;
use App\Job\FlowExamples\ScrapUrlJob;
use App\Job\FlowExamples\ScrapUrlsJob;
use Flow\Driver\FiberDriver;
use Flow\Flow\Flow;
use Flow\Ip;
use Flow\IpStrategy\FlattenIpStrategy;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
use Symfony\Contracts\HttpClient\HttpClientInterface;

use function array_slice;
use function sprintf;

/**
 * Console command that sends a fixed list of URLs through a Flow pipeline.
 *
 * The pipeline yields a ScrapUrlJob paired with a FlattenIpStrategy, followed by a
 * closure that dumps whatever it receives.
 */
#[AsCommand(
    name: 'app:scrap',
    description: 'This allows scrap pages with flow',
)]
class ScrapCommand extends Command
{
    /**
     * Builds the flow, feeds it the URL list as a single Ip, waits for it to finish
     * and prints a success message.
     *
     * @return int Always Command::SUCCESS
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $io = new SymfonyStyle($input, $output);

        $driver = new FiberDriver();

        // Neither closure captures $io: nothing inside them reads it, and capturing a
        // variable that is never used is reported as an error by static analysis.
        $flow = Flow::do(function () {
            // TODO: a batch variant (a single ScrapUrlsJob stage followed by a
            // "Finished scrapping %d urls" message) was sketched here but is not enabled.
            yield [new ScrapUrlJob(), null, new FlattenIpStrategy()];
            yield static function ($data) {
                // Debug output only; replace with real reporting once the job returns content.
                dump($data);
            };
        }, ['driver' => $driver]);

        $urls = [
            'https://www.google.fr',
            'https://www.apple.com',
            'https://www.microsoft.com',
            'https://www.amazon.com',
            'https://www.facebook.com',
            'https://www.netflix.com',
            'https://www.spotify.com',
            'https://www.wikipedia.org',
            'https://www.twitter.com',
            'https://www.instagram.com',
            'https://www.linkedin.com',
            'https://www.reddit.com',
            'https://www.ebay.com',
            'https://www.cnn.com',
            'https://www.bbc.co.uk',
            'https://www.yahoo.com',
            'https://www.bing.com',
            'https://www.pinterest.com',
            'https://www.tumblr.com',
            'https://www.paypal.com',
            'https://www.dropbox.com',
            'https://www.adobe.com',
            'https://www.salesforce.com',
        ];

        // The whole list travels as one Ip.
        $flow(new Ip($urls));

        $flow->await();

        $io->success('Scraping is done.');

        return Command::SUCCESS;
    }
}
58 changes: 58 additions & 0 deletions src/IpStrategy/FlattenIpStrategy.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
<?php

declare(strict_types=1);

namespace Flow\IpStrategy;

use Flow\Event;
use Flow\Event\CountEvent;
use Flow\Event\PullEvent;
use Flow\Event\PushEvent;
use Flow\Ip;
use Flow\IpStrategyInterface;

/**
 * Ip strategy that splits every pushed Ip into one Ip per element of its data.
 *
 * Elements are queued in the order they are encountered and handed out first-in,
 * first-out on pull.
 *
 * @template T
 *
 * @implements IpStrategyInterface<T>
 */
class FlattenIpStrategy implements IpStrategyInterface
{
    /**
     * Queue of Ips waiting to be pulled, oldest first.
     *
     * @var array<Ip<T>>
     */
    private array $ips = [];

    /**
     * Maps the Flow events this strategy listens to onto its handler methods.
     *
     * Only PUSH and PULL are subscribed. A COUNT subscription (and its handler) was
     * removed because Flow\Event::COUNT and Flow\Event\CountEvent are reported as
     * undefined by static analysis, and reading an undefined class constant throws
     * at runtime as soon as this method is called.
     */
    public static function getSubscribedEvents(): array
    {
        return [
            Event::PUSH => 'push',
            Event::PULL => 'pull',
        ];
    }

    /**
     * Queues one new Ip for each element found in the pushed Ip's data.
     *
     * @param PushEvent<T> $event
     */
    public function push(PushEvent $event): void
    {
        // NOTE(review): assumes the pushed data is iterable (e.g. an array of URLs) — confirm with callers.
        foreach ($event->getIp()->data as $item) {
            $this->ips[] = new Ip($item);
        }
    }

    /**
     * Hands the oldest queued Ip to the event.
     *
     * @param PullEvent<T> $event
     */
    public function pull(PullEvent $event): void
    {
        // array_shift() yields null once the queue is empty.
        // NOTE(review): assumes PullEvent::setIp() accepts null — confirm.
        $event->setIp(array_shift($this->ips));
    }
}
49 changes: 49 additions & 0 deletions src/Job/ScrapUrlJob.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
<?php

declare(strict_types=1);

namespace App\Job\FlowExamples;

use CurlMultiHandle;
use Fiber;
use Flow\JobInterface;
use Symfony\Contracts\HttpClient\HttpClientInterface;

/**
 * Placeholder job for scraping a single URL.
 *
 * The job owns a cURL multi handle for its whole lifetime, but no transfer is
 * attached to it yet: invoking the job only echoes the URL back with empty content.
 *
 * @implements JobInterface<string, array<string, mixed>>
 */
class ScrapUrlJob implements JobInterface
{
    /** Multi handle created on construction and closed on destruction. */
    private CurlMultiHandle $multiHandle;

    public function __construct()
    {
        $this->multiHandle = curl_multi_init();
    }

    public function __destruct()
    {
        curl_multi_close($this->multiHandle);
    }

    /**
     * @param string $url The URL to scrape
     *
     * @return array<string, mixed> The URL under 'url' and, for now, an empty string under 'content'
     */
    public function __invoke($url): array
    {
        // TODO: create an easy handle for $url (CURLOPT_URL, CURLOPT_RETURNTRANSFER)
        // and attach it to the multi handle; nothing is downloaded yet.
        $result = ['url' => $url];
        $result['content'] = '';

        return $result;
    }
}
59 changes: 59 additions & 0 deletions src/Job/ScrapUrlsJob.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
<?php

declare(strict_types=1);

namespace App\Job\FlowExamples;

use CurlMultiHandle;
use Fiber;
use Flow\JobInterface;
use Symfony\Contracts\HttpClient\HttpClientInterface;

/**
 * Job that downloads a batch of URLs concurrently through a cURL multi handle.
 *
 * While transfers are pending the job calls Fiber::suspend() after every poll, so it
 * has to be executed inside a Fiber.
 *
 * @implements JobInterface<array<string>, array<int, array{url: string, content: string|null}>>
 */
class ScrapUrlsJob implements JobInterface
{
    /**
     * Fetches every URL and returns the collected bodies.
     *
     * The parameter is left without a native type on purpose so the signature stays
     * compatible with the interface method it implements.
     *
     * @param array<string> $urls URLs to fetch
     *
     * @return array<int, array{url: string, content: string|null}> One entry per URL, in input order
     */
    public function __invoke($urls): array
    {
        $mh = curl_multi_init();

        // One easy handle per URL, remembered together with the URL it belongs to.
        $transfers = [];
        foreach ($urls as $url) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_multi_add_handle($mh, $ch);
            $transfers[] = [$ch, $url];
        }

        // Drive the transfers, giving control back to the scheduler after each poll.
        // The loop also stops when cURL reports a multi-handle error; otherwise
        // $running might never reach zero and the loop would spin forever.
        $running = null;
        do {
            $status = curl_multi_exec($mh, $running);

            Fiber::suspend();
        } while ($running > 0 && $status === CURLM_OK);

        // Collect each body, then detach and release its handle.
        $responses = [];
        foreach ($transfers as [$ch, $url]) {
            $responses[] = [
                'url' => $url,
                'content' => curl_multi_getcontent($ch),
            ];
            curl_multi_remove_handle($mh, $ch);
            curl_close($ch);
        }

        curl_multi_close($mh);

        return $responses;
    }
}

0 comments on commit 76e787a

Please sign in to comment.