-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScrapable.php
82 lines (70 loc) · 2.02 KB
/
Scrapable.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
<?php
declare( strict_types = 1 );
namespace TheWebSolver\Codegarage\Scraper\Interfaces;
use Iterator;
use TheWebSolver\Codegarage\Scraper\Error\ScraperError;
use TheWebSolver\Codegarage\Scraper\Error\InvalidSource;
/**
 * Contract for a scraper that fetches content from a source URL, parses it
 * into an iterator, and can persist the raw scraped content to a cache file.
 *
 * @template TKey
 * @template TValue
 */
interface Scrapable {
	/**
	 * Fetches the content from the source.
	 *
	 * @return string The scraped content.
	 * @throws ScraperError If the content cannot be scraped.
	 */
	public function scrape(): string;

	/**
	 * Converts previously scraped content into an iterable result.
	 *
	 * @param string $content Content obtained by scraping.
	 * @return Iterator<TKey,TValue>
	 * @throws InvalidSource If the expected type cannot be inferred from the content.
	 * @throws ScraperError  If the content cannot be parsed.
	 */
	public function parse( string $content ): Iterator;

	/**
	 * Writes scraped content to the cache file.
	 *
	 * @param string $content The scraped content to be cached.
	 * @return int Count of bytes written to the cache file.
	 * @throws ScraperError If the content cannot be cached.
	 */
	public function toCache( string $content ): int;

	/**
	 * Checks whether scraped content has already been cached to the file.
	 *
	 * @return bool Whether cached content exists.
	 */
	public function hasCache(): bool;

	/**
	 * Reads scraped content back from the cache file.
	 *
	 * @return string The cached content.
	 * @throws InvalidSource If content cannot be retrieved from the cache file.
	 */
	public function fromCache(): string;

	/**
	 * Removes the cache file when it exists.
	 *
	 * @return bool `true` when the cache file got deleted, `false` otherwise.
	 */
	public function invalidateCache(): bool;

	/**
	 * Defines where the cache file lives.
	 *
	 * @param string $dirPath  Absolute path to the directory.
	 * @param string $filename Name of the file (extension included) that content is written to.
	 * @return static
	 * @throws InvalidSource If the directory path cannot be located.
	 */
	public function withCachePath( string $dirPath, string $filename ): static;

	/**
	 * Provides the URL of the resource that content should be scraped from.
	 *
	 * @return string The source URL.
	 */
	public function getSourceUrl(): string;

	/**
	 * Provides the absolute path to the cache file (extension included).
	 *
	 * @return string The cache file path.
	 */
	public function getCachePath(): string;

	/**
	 * Clears any garbage data collected during scraping.
	 */
	public function flush(): void;
}