chore(website): add simple subscription crawl tie
j-mendez committed Dec 9, 2023
1 parent c647e9c commit 5806697
Showing 5 changed files with 314 additions and 71 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "spider_rs"
version = "0.0.7"
version = "0.0.8"
description = "The fastest web crawler written in Rust ported to nodejs."
repository = "https://github.com/spider-rs/spider-nodejs"

262 changes: 250 additions & 12 deletions book/src/website.md
@@ -1,40 +1,278 @@
# Website

The Website class is the foundation of the spider.

## Builder pattern

We use the builder pattern to configure the website for crawling.

*Note: replace `https://choosealicense.com` in the examples below with your target website URL.*

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com")
    website.crawl()
    print(website.get_links())

asyncio.run(main())
```

### Custom Headers

Add custom HTTP headers to use when crawling/scraping.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_headers({"authorization": "mytoken"})

asyncio.run(main())
```

### Blacklist

Prevent crawling specific paths, URLs, or patterns with regex.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_blacklist_url(["/blog", "/resume"])

asyncio.run(main())
```

### Crons

Set up a cron job that can run in the background on a schedule using cron syntax.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_cron("1/5 * * * * *")

asyncio.run(main())
```

View the [cron](./cron-job.md) section for details on how to run the cron.

### Budget

Add a crawl budget to limit the number of pages crawled.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_budget({
        "*": 1,
    })

asyncio.run(main())
```
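
Budgets can also be scoped to individual paths alongside the `*` wildcard; the sketch below is a minimal example, assuming relative path keys such as `/licenses` are matched the same way as in the underlying spider crate.

```py
import asyncio
from spider_rs import Website

async def main():
    # cap the whole crawl at 200 pages, but only 10 pages under /licenses
    website = Website("https://choosealicense.com").with_budget({
        "*": 200,
        "/licenses": 10,
    })
    website.crawl()
    print(website.get_links())

asyncio.run(main())
```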

### Subdomains

Include subdomains in the crawl.

```py
import asyncio
from spider_rs import Website

async def main():
website = Website("https://choosealicense.com", False).with_headers({ "authorization": "myjwttoken" })
website = Website("https://choosealicense.com").with_subdomains(True)

asyncio.run(main())
```

### TLD

Include other top-level domain variants (TLDs) in the crawl.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_tlds(True)

asyncio.run(main())
```

### External Domains

Add external domains to include in the crawl.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_external_domains(["https://www.myotherdomain.com"])

asyncio.run(main())
```

### Proxy

Use a proxy to crawl a website.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_proxies(["https://www.myproxy.com"])

asyncio.run(main())
```

### Delays

Add a delay between page requests, in milliseconds. Defaults to none.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_delays(200)

asyncio.run(main())
```

### User-Agent

Use a custom User-Agent.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_user_agent("mybot/v1")

asyncio.run(main())
```

### Request Timeout

Add a request timeout per page in milliseconds. The example below uses 30 seconds (30000 ms).

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_request_timeout(30000)

asyncio.run(main())
```

### Respect Robots

Respect the robots.txt file.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_respect_robots_txt(True)

asyncio.run(main())
```

### HTTP/2 Prior Knowledge

Use HTTP/2 prior knowledge to connect if you know the server supports it.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_http2_prior_knowledge(True)

asyncio.run(main())
```

## Chaining

You can chain all of the configuration methods together for a simple setup.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_subdomains(True).with_tlds(True).with_user_agent("mybot/v1").with_respect_robots_txt(True)

asyncio.run(main())
```

## Raw Content

Set the second parameter of the Website constructor to `True` to return the raw content without UTF-8 decoding.
Pages will then populate `rawContent` and leave `content` empty when using subscriptions or the Page object.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com", True)
    website.scrape()

asyncio.run(main())
```

## Clearing Crawl Data

Use `website.clear` to remove the visited links and page data, or `website.drain_links` to drain the visited links.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com")
    website.crawl()
    print(website.get_links())
    website.clear()
    print(website.get_links())

asyncio.run(main())
```

## Stop crawl

To stop an active crawl, call `website.stop()`.

```py
import asyncio
from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))
        # uncomment to perform extra parsing and get the page title
        # print(page.url + " - title: " + page.title())

async def main():
    website = Website("https://choosealicense.com")
    website.crawl(Subscription())
    website.stop()

asyncio.run(main())
```
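
The subscription callback is also a convenient place to decide when to stop. Below is a minimal sketch that halts the crawl after a page limit, assuming `website.size` reports the number of pages visited so far (mirroring the commented check in `examples/stop.py`); the `LimitedSubscription` name is purely illustrative.

```py
import asyncio
from spider_rs import Website

class LimitedSubscription:
    """Stops the crawl once enough pages have been seen."""
    def __init__(self, website, limit):
        self.website = website
        self.limit = limit
    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))
        # `size` is assumed to report the number of pages visited so far,
        # as hinted by the commented check in examples/stop.py
        if self.website.size >= self.limit:
            self.website.stop()

async def main():
    website = Website("https://choosealicense.com")
    website.crawl(LimitedSubscription(website, 100))

asyncio.run(main())
```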
10 changes: 10 additions & 0 deletions examples/builder.py
@@ -0,0 +1,10 @@
import asyncio

from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com", False).with_agent("BotBot").with_headers({ "authorization": "Something "})
    website.crawl()
    print(website.get_links())

asyncio.run(main())
17 changes: 17 additions & 0 deletions examples/stop.py
@@ -0,0 +1,17 @@
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://www.drake.com")

    class Subscription:
        def __init__(self):
            print("Subscription Created...")
        def __call__(self, page):
            print(page.url + " - status: " + str(page.status_code))
            # if (website.size >= 100):
            #     website.stop()

    website.crawl(Subscription())

asyncio.run(main())