diff --git a/Cargo.toml b/Cargo.toml index e1d23a9..445e4be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] edition = "2021" name = "spider_rs" -version = "0.0.12" +version = "0.0.13" description = "The fastest web crawler written in Rust ported to nodejs." repository = "https://github.com/spider-rs/spider-nodejs" diff --git a/README.md b/README.md index 5cb90e9..9c6fcad 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,6 @@ The [spider](https://github.com/spider-rs/spider) project ported to Python. -Test url: `https://espn.com` - -| `libraries` | `pages` | `speed` | -| :----------------------------- | :-------- | :------ | -| **`spider-rs(python): crawl`** | `150,387` | `186s` | -| **`scrapy(python): crawl`** | `49,598` | `1h` | - The benches above were ran on a mac m1, spider on linux arm machines performs about 2-10x faster. ## Getting Started @@ -39,7 +32,18 @@ Install maturin `pipx install maturin` and python. ## Benchmarks -View [bench](./bench/) to see the results. +View the [benchmarks](./bench/README.md) to see a breakdown across libraries and platforms. + +Test url: `https://espn.com` + +| `libraries` | `pages` | `speed` | +| :-------------------------- | :-------- | :------ | +| **`spider(rust): crawl`** | `150,387` | `1m` | +| **`spider(nodejs): crawl`** | `150,387` | `153s` | +| **`spider(python): crawl`** | `150,387` | `186s` | +| **`scrapy(python): crawl`** | `49,598` | `1h` | + +The benches above were run on a Mac M1; spider on Linux ARM machines performs about 2-10x faster.
## Issues diff --git a/bench/scrappy.py b/bench/scrappy.py index 965c198..ecfaef1 100644 --- a/bench/scrappy.py +++ b/bench/scrappy.py @@ -1,12 +1,16 @@ -import time, scrapy +import time, scrapy, sys from scrapy.linkextractors import LinkExtractor from scrapy.spiders import CrawlSpider, Rule from scrapy.crawler import CrawlerProcess +from urllib.parse import urlparse + +url = len(sys.argv) > 1 and str(sys.argv[1]) or "https://rsseau.fr" +host = urlparse(url).hostname class MySpider(CrawlSpider): - name = 'rsseau.fr' - allowed_domains = ['rsseau.fr'] - start_urls = ['https://rsseau.fr'] + name = host + allowed_domains = [host] + start_urls = [url] links = [] rules = ( Rule(LinkExtractor(), callback='parse_item', follow=True), @@ -27,5 +31,4 @@ def parse_item(self, response): process.crawl(spider) process.start() end = time.time() -print("pages found " + str(len(spider.links))) -print("elasped duration " + str(end - start)) \ No newline at end of file +print(url, "pages found " + str(len(spider.links)), "elapsed duration " + str(end - start) + "s", sep="\n") diff --git a/bench/spider.py b/bench/spider.py index fa6e033..ac07d9a 100644 --- a/bench/spider.py +++ b/bench/spider.py @@ -1,16 +1,15 @@ -import asyncio +import asyncio, time, sys import time - from spider_rs import Website async def main(): print("benching spider-rs(python)...") - website = Website("https://rsseau.fr") + url = len(sys.argv) > 1 and str(sys.argv[1]) or "https://rsseau.fr" + website = Website(url) start = time.time() website.crawl() end = time.time() links = website.get_links() - print("pages found " + str(len(links))) - print("elasped duration " + str(end - start)) + print(url, "pages found " + str(len(links)), "elapsed duration " + str(end - start) + "s", sep="\n") asyncio.run(main()) \ No newline at end of file diff --git a/src/website.rs b/src/website.rs index d3f563d..e2b98bf 100644 --- a/src/website.rs +++ b/src/website.rs @@ -7,7 +7,7 @@ use spider::tokio::task::JoinHandle; use 
spider::utils::shutdown; use std::time::Duration; -/// a website holding the inner spider::website::Website from Rust fit for python. +/// A website holding the inner spider::website::Website from Rust fit for python. #[pyclass] pub struct Website { /// the website from spider. @@ -18,15 +18,8 @@ pub struct Website { crawl_handles: IndexMap>, /// do not convert content to UT8. raw_content: bool, - /// the data collected. - collected_data: Box>, /// is the crawl running in the background. running_in_background: bool, // /// the file handle for storing data - // file_handle: Option, -} - -struct PageEvent { - pub page: NPage, } #[pymethods] @@ -39,7 +32,6 @@ impl Website { subscription_handles: IndexMap::new(), crawl_handles: IndexMap::new(), raw_content: raw_content.unwrap_or_default(), - collected_data: Box::new(Vec::new()), running_in_background: false, // file_handle: None, } } @@ -49,49 +41,6 @@ impl Website { self.inner.get_status().to_string() } - // /// store data to memory for disk storing. This will create the path if not exist and defaults to ./storage. - // pub async fn export_jsonl_data(&self, export_path: Option) -> std::io::Result<()> { - // use spider::tokio::io::AsyncWriteExt; - // let file = match export_path { - // Some(p) => { - // let base_dir = p - // .split("/") - // .into_iter() - // .map(|f| { - // if f.contains(".") { - // "".to_string() - // } else { - // f.to_string() - // } - // }) - // .collect::(); - - // spider::tokio::fs::create_dir_all(&base_dir).await?; - - // if !p.contains(".") { - // p + ".jsonl" - // } else { - // p - // } - // } - // _ => { - // spider::tokio::fs::create_dir_all("./storage").await?; - // "./storage/".to_owned() - // + &self - // .inner - // .get_domain() - // .inner() - // .replace("http://", "") - // .replace("https://", "") - // + "jsonl" - // } - // }; - // let mut file = spider::tokio::fs::File::create(file).await?; - // // transform data step needed to auto convert type .. 
- // file.write_all(&self.collected_data).await?; - // Ok(()) - // } - /// subscribe and add an event listener. pub fn subscribe(mut slf: PyRefMut<'_, Self>, on_page_event: PyObject) -> u32 { let mut rx2 = slf