From 7c73e7522aa1b236c1ba44255ee6047426b291dd Mon Sep 17 00:00:00 2001 From: j-mendez Date: Wed, 27 Dec 2023 10:07:39 -0500 Subject: [PATCH] chore(bench): add benchmarks --- .github/workflows/bench.yml | 35 +++++++++++++++++++++++++ .gitignore | 3 ++- bench/README.md | 8 +++--- bench/scrappy.py | 7 +++-- book/src/SUMMARY.md | 5 ++++ book/src/benchmarks.md | 52 +++++++++++++++++++++++++++++++++++++ book/src/storing-data.md | 22 ++++++++++++++++ 7 files changed, 123 insertions(+), 9 deletions(-) create mode 100644 .github/workflows/bench.yml create mode 100644 book/src/benchmarks.md create mode 100644 book/src/storing-data.md diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml new file mode 100644 index 0000000..114739f --- /dev/null +++ b/.github/workflows/bench.yml @@ -0,0 +1,35 @@ +name: Bench Compare + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + checkout_and_test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["pypy3.9", "pypy3.10", "3.9", "3.10", "3.11", "3.12"] + + steps: + - name: Checkout code from ${{ github.repository }} + uses: actions/checkout@v4 + + - name: Setup python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + + - name: Install Deps + run: pip install scrapy && pip install spider_rs + + - name: Run Bench @spider-rs/spider-rs + run: python ./bench/spider.py + + - name: Run Bench Scrapy + run: python ./bench/scrappy.py diff --git a/.gitignore b/.gitignore index 45fa517..6790171 100644 --- a/.gitignore +++ b/.gitignore @@ -203,4 +203,5 @@ __test__/*.js /storage /bench/*.js /bench/case/**.js -/bench/storage/ \ No newline at end of file +/bench/storage/ +/bench/__pycache__ \ No newline at end of file diff --git a/bench/README.md b/bench/README.md index 90a3e94..85c64c9 100644 --- a/bench/README.md +++ b/bench/README.md @@ -3,7 +3,7 @@ You can run the benches with python in terminal. 
```sh -python scrappy.py +python scrappy.py && python spider.py ``` ## Cases @@ -16,15 +16,15 @@ mac Apple M1 Max URL used `https://rsseau.fr` -[Scrapy](scrappy.py) +[Scrapy](scrappy.py) ``` -Scrappy +Scrapy pages found 188 elasped duration 9.301506042480469 ``` -[Spider-Rs](spider.py) +[Spider-RS](spider.py) ``` Spider diff --git a/bench/scrappy.py b/bench/scrappy.py index 22ab138..965c198 100644 --- a/bench/scrappy.py +++ b/bench/scrappy.py @@ -1,7 +1,6 @@ -import time -import scrapy -from scrapy.spiders import CrawlSpider, Rule +import time, scrapy from scrapy.linkextractors import LinkExtractor +from scrapy.spiders import CrawlSpider, Rule from scrapy.crawler import CrawlerProcess class MySpider(CrawlSpider): @@ -23,8 +22,8 @@ def parse_item(self, response): print("benching scrappy(python)...") process = CrawlerProcess() -start = time.time() spider = MySpider +start = time.time() process.crawl(spider) process.start() end = time.time() diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md index ceb1649..09e4b7e 100644 --- a/book/src/SUMMARY.md +++ b/book/src/SUMMARY.md @@ -18,3 +18,8 @@ - [Crawl](./crawl.md) - [Scrape](./scrape.md) - [Cron Job](./cron-job.md) +- [Storing Data](./storing-data.md) + +# Benchmarks + +- [Compare](./benchmarks.md) diff --git a/book/src/benchmarks.md b/book/src/benchmarks.md new file mode 100644 index 0000000..05ccf6f --- /dev/null +++ b/book/src/benchmarks.md @@ -0,0 +1,52 @@ +# Benchmarks + +View the latest runs on [github](https://github.com/spider-rs/spider-py/actions/workflows/bench.yml). 
+ +```sh +Linux +8-core CPU +32 GB of RAM memory +----------------------- +``` + +Test url: `https://choosealicense.com` (small) +32 pages + +| `libraries` | `speed` | +| :-------------------------------- | :------ | +| **`spider-rs: crawl 10 samples`** | `76ms` | +| **`scrapy: crawl 10 samples`** | `2.5s` | + +Test url: `https://rsseau.fr` (medium) +211 pages + +| `libraries` | `speed` | +| :-------------------------------- | :------ | +| **`spider-rs: crawl 10 samples`** | `0.5s` | +| **`scrapy: crawl 10 samples`** | `72s` | + +```sh +---------------------- +mac Apple M1 Max +10-core CPU +64 GB of RAM memory +----------------------- +``` + +Test url: `https://choosealicense.com` (small) +32 pages + +| `libraries` | `speed` | +| :-------------------------------- | :------ | +| **`spider-rs: crawl 10 samples`** | `286ms` | +| **`scrapy: crawl 10 samples`** | `2.5s` | + +Test url: `https://rsseau.fr` (medium) +211 pages + +| `libraries` | `speed` | +| :-------------------------------- | :------ | +| **`spider-rs: crawl 10 samples`** | `2.5s` | +| **`scrapy: crawl 10 samples`** | `10s` | + +The performance advantage grows as the website gets larger and when throttling is needed. Linux benchmarks are about 10x faster than macOS for spider-rs. diff --git a/book/src/storing-data.md b/book/src/storing-data.md new file mode 100644 index 0000000..23ccb1a --- /dev/null +++ b/book/src/storing-data.md @@ -0,0 +1,22 @@ +# Storing Data + +Storing data lets you collect the raw content of a website. + +This allows you to upload and download the content without UTF-8 conversion. The property only appears when +setting the second param of the `Website` class constructor to true. 
+ +```py +import asyncio +from spider_rs import Website + +class Subscription: + def __init__(self): + print("Subscription Created...") + def __call__(self, page): + print(page.url + " - bytes: " + str(page.raw_content)) + # do something with page.raw_content + +async def main(): + website = Website("https://choosealicense.com", True) + website.crawl(Subscription(), True) +```