From 7c73e7522aa1b236c1ba44255ee6047426b291dd Mon Sep 17 00:00:00 2001 From: j-mendez Date: Wed, 27 Dec 2023 10:07:39 -0500 Subject: [PATCH] chore(bench): add benchmarks --- .github/workflows/bench.yml | 35 +++++++++++++++++++++++++ .gitignore | 3 ++- bench/README.md | 8 +++--- bench/scrappy.py | 7 +++-- book/src/SUMMARY.md | 5 ++++ book/src/benchmarks.md | 52 +++++++++++++++++++++++++++++++++++++ book/src/storing-data.md | 22 ++++++++++++++++ 7 files changed, 123 insertions(+), 9 deletions(-) create mode 100644 .github/workflows/bench.yml create mode 100644 book/src/benchmarks.md create mode 100644 book/src/storing-data.md diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml new file mode 100644 index 0000000..114739f --- /dev/null +++ b/.github/workflows/bench.yml @@ -0,0 +1,35 @@ +name: Bench Compare + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + checkout_and_test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["pypy3.9", "pypy3.10", "3.9", "3.10", "3.11", "3.12"] + + steps: + - name: Checkout code from ${{ github.repository }} + uses: actions/checkout@v4 + + - name: Setup python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + + - name: Install Deps + run: pip install scrapy && pip install spider_rs + + - name: Run Bench @spider-rs/spider-rs + run: python ./bench/spider.py + + - name: Run Bench Scrapy + run: python ./bench/scrappy.py diff --git a/.gitignore b/.gitignore index 45fa517..6790171 100644 --- a/.gitignore +++ b/.gitignore @@ -203,4 +203,5 @@ __test__/*.js /storage /bench/*.js /bench/case/**.js -/bench/storage/ \ No newline at end of file +/bench/storage/ +/bench/__pycache__ \ No newline at end of file diff --git a/bench/README.md b/bench/README.md index 90a3e94..85c64c9 100644 --- a/bench/README.md +++ b/bench/README.md @@ -3,7 +3,7 @@ You can run the benches with python in terminal. 
```sh -python scrappy.py +python scrappy.py && python spider.py ``` ## Cases @@ -16,15 +16,15 @@ mac Apple M1 Max URL used `https://rsseau.fr` -[Scrapy](scrappy.py) +[Scrapy](scrappy.py) ``` -Scrappy +Scrapy pages found 188 elasped duration 9.301506042480469 ``` -[Spider-Rs](spider.py) +[Spider-RS](spider.py) ``` Spider diff --git a/bench/scrappy.py b/bench/scrappy.py index 22ab138..965c198 100644 --- a/bench/scrappy.py +++ b/bench/scrappy.py @@ -1,7 +1,6 @@ -import time -import scrapy -from scrapy.spiders import CrawlSpider, Rule +import time, scrapy from scrapy.linkextractors import LinkExtractor +from scrapy.spiders import CrawlSpider, Rule from scrapy.crawler import CrawlerProcess class MySpider(CrawlSpider): @@ -23,8 +22,8 @@ def parse_item(self, response): print("benching scrappy(python)...") process = CrawlerProcess() -start = time.time() spider = MySpider +start = time.time() process.crawl(spider) process.start() end = time.time() diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md index ceb1649..09e4b7e 100644 --- a/book/src/SUMMARY.md +++ b/book/src/SUMMARY.md @@ -18,3 +18,8 @@ - [Crawl](./crawl.md) - [Scrape](./scrape.md) - [Cron Job](./cron-job.md) +- [Storing Data](./storing-data.md) + +# Benchmarks + +- [Compare](./benchmarks.md) diff --git a/book/src/benchmarks.md b/book/src/benchmarks.md new file mode 100644 index 0000000..05ccf6f --- /dev/null +++ b/book/src/benchmarks.md @@ -0,0 +1,52 @@ +# Benchmarks + +View the latest runs on [github](https://github.com/spider-rs/spider-py/actions/workflows/bench.yml). 
+ +```sh +Linux +8-core CPU +32 GB of RAM memory +----------------------- +``` + +Test url: `https://choosealicense.com` (small) +32 pages + +| `libraries` | `speed` | +| :-------------------------------- | :------ | +| **`spider-rs: crawl 10 samples`** | `76ms` | +| **`scrapy: crawl 10 samples`** | `2.5s` | + +Test url: `https://rsseau.fr` (medium) +211 pages + +| `libraries` | `speed` | +| :-------------------------------- | :------ | +| **`spider-rs: crawl 10 samples`** | `0.5s` | +| **`scrapy: crawl 10 samples`** | `72s` | + +```sh +---------------------- +mac Apple M1 Max +10-core CPU +64 GB of RAM memory +----------------------- +``` + +Test url: `https://choosealicense.com` (small) +32 pages + +| `libraries` | `speed` | +| :-------------------------------- | :------ | +| **`spider-rs: crawl 10 samples`** | `286ms` | +| **`scrapy: crawl 10 samples`** | `2.5s` | + +Test url: `https://rsseau.fr` (medium) +211 pages + +| `libraries` | `speed` | +| :-------------------------------- | :------ | +| **`spider-rs: crawl 10 samples`** | `2.5s` | +| **`scrapy: crawl 10 samples`** | `10s` | + +The performance advantage grows as the website gets larger and when throttling is needed. Linux benchmarks are about 10x faster than macOS for spider-rs. diff --git a/book/src/storing-data.md b/book/src/storing-data.md new file mode 100644 index 0000000..23ccb1a --- /dev/null +++ b/book/src/storing-data.md @@ -0,0 +1,22 @@ +# Storing Data + +Storing data lets you collect the raw content of a website. + +This allows you to upload and download the content without UTF-8 conversion. The property only appears when +setting the second param of the `Website` class constructor to true. 
+ +```py +import asyncio +from spider_rs import Website + +class Subscription: + def __init__(self): + print("Subscription Created...") + def __call__(self, page): + print(page.url + " - bytes: " + str(page.raw_content)) + # do something with page.raw_content + +async def main(): + website = Website("https://choosealicense.com", True) + website.crawl(Subscription(), True) +```