chore(crate): remove unused
j-mendez committed Dec 28, 2023
1 parent 92d7d06 commit c1c3b9a
Showing 5 changed files with 27 additions and 72 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "spider_rs"
version = "0.0.12"
version = "0.0.13"
description = "The fastest web crawler written in Rust ported to nodejs."
repository = "https://github.com/spider-rs/spider-nodejs"

20 changes: 12 additions & 8 deletions README.md
@@ -2,13 +2,6 @@

The [spider](https://github.com/spider-rs/spider) project ported to Python.

Test url: `https://espn.com`

| `libraries` | `pages` | `speed` |
| :----------------------------- | :-------- | :------ |
| **`spider-rs(python): crawl`** | `150,387` | `186s` |
| **`scrapy(python): crawl`** | `49,598` | `1h` |

The benches above were run on a mac m1; spider on linux arm machines performs about 2-10x faster.

## Getting Started
@@ -39,7 +32,18 @@ Install maturin `pipx install maturin` and python.

## Benchmarks

View [bench](./bench/) to see the results.
View the [benchmarks](./bench/README.md) to see a breakdown between libs and platforms.

Test url: `https://espn.com`

| `libraries` | `pages` | `speed` |
| :-------------------------- | :-------- | :------ |
| **`spider(rust): crawl`** | `150,387` | `1m` |
| **`spider(nodejs): crawl`** | `150,387` | `153s` |
| **`spider(python): crawl`** | `150,387` | `186s` |
| **`scrapy(python): crawl`** | `49,598` | `1h` |

The benches above were run on a mac m1; spider on linux arm machines performs about 2-10x faster.

## Issues

15 changes: 9 additions & 6 deletions bench/scrappy.py
@@ -1,12 +1,16 @@
import time, scrapy
import time, scrapy, sys
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse

url = len(sys.argv) > 1 and str(sys.argv[1]) or "https://rsseau.fr"
host = urlparse(url).hostname

class MySpider(CrawlSpider):
name = 'rsseau.fr'
allowed_domains = ['rsseau.fr']
start_urls = ['https://rsseau.fr']
name = host
allowed_domains = [host]
start_urls = [url]
links = []
rules = (
Rule(LinkExtractor(), callback='parse_item', follow=True),
@@ -27,5 +31,4 @@ def parse_item(self, response):
process.crawl(spider)
process.start()
end = time.time()
print("pages found " + str(len(spider.links)))
print("elasped duration " + str(end - start))
print(url, "pages found " + str(len(spider.links)), "elasped duration " + str(end - start) + "s", sep="\n")
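
Both bench scripts now take an optional URL argument, defaulting to `https://rsseau.fr`, and derive Scrapy's `allowed_domains` from its hostname. As a side note, here is a sketch of the same default-URL logic written as a conditional expression instead of the `and/or` idiom (equivalent here, since a URL string pulled from argv is always truthy):

```python
import sys
from urllib.parse import urlparse

# Same behaviour as the bench script: use argv[1] when given, otherwise
# fall back to the default bench target.
url = sys.argv[1] if len(sys.argv) > 1 else "https://rsseau.fr"

# The hostname (e.g. "rsseau.fr") feeds Scrapy's name / allowed_domains.
host = urlparse(url).hostname
```
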
9 changes: 4 additions & 5 deletions bench/spider.py
@@ -1,16 +1,15 @@
import asyncio
import asyncio, time, sys
import time

from spider_rs import Website

async def main():
print("benching spider-rs(python)...")
website = Website("https://rsseau.fr")
url = len(sys.argv) > 1 and str(sys.argv[1]) or "https://rsseau.fr"
website = Website(url)
start = time.time()
website.crawl()
end = time.time()
links = website.get_links()
print("pages found " + str(len(links)))
print("elasped duration " + str(end - start))
print(url, "pages found " + str(len(links)), "elasped duration " + str(end - start) + "s", sep="\n")

asyncio.run(main())
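
With both scripts printing the same three lines (url, pages found, elapsed duration), they can be driven back to back against one target. A hypothetical helper, assuming it is run from the repository root where `bench/spider.py` and `bench/scrappy.py` live:

```python
import subprocess
import sys

# Target URL for both benches; each script also has its own default.
url = sys.argv[1] if len(sys.argv) > 1 else "https://rsseau.fr"

for script in ("bench/spider.py", "bench/scrappy.py"):
    print(f"--- {script} ---")
    # Each script prints: url, pages found, elapsed duration in seconds.
    subprocess.run([sys.executable, script, url], check=True)
```
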
53 changes: 1 addition & 52 deletions src/website.rs
@@ -7,7 +7,7 @@ use spider::tokio::task::JoinHandle;
use spider::utils::shutdown;
use std::time::Duration;

/// a website holding the inner spider::website::Website from Rust fit for python.
/// A website holding the inner spider::website::Website from Rust fit for python.
#[pyclass]
pub struct Website {
/// the website from spider.
@@ -18,15 +18,8 @@ pub struct Website {
crawl_handles: IndexMap<u32, JoinHandle<()>>,
/// do not convert content to UT8.
raw_content: bool,
/// the data collected.
collected_data: Box<Vec<u8>>,
/// is the crawl running in the background.
running_in_background: bool, // /// the file handle for storing data
// file_handle: Option<spider::tokio::fs::File>,
}

struct PageEvent {
pub page: NPage,
}

#[pymethods]
@@ -39,7 +32,6 @@ impl Website {
subscription_handles: IndexMap::new(),
crawl_handles: IndexMap::new(),
raw_content: raw_content.unwrap_or_default(),
collected_data: Box::new(Vec::new()),
running_in_background: false, // file_handle: None,
}
}
@@ -49,49 +41,6 @@ impl Website {
self.inner.get_status().to_string()
}

// /// store data to memory for disk storing. This will create the path if not exist and defaults to ./storage.
// pub async fn export_jsonl_data(&self, export_path: Option<String>) -> std::io::Result<()> {
// use spider::tokio::io::AsyncWriteExt;
// let file = match export_path {
// Some(p) => {
// let base_dir = p
// .split("/")
// .into_iter()
// .map(|f| {
// if f.contains(".") {
// "".to_string()
// } else {
// f.to_string()
// }
// })
// .collect::<String>();

// spider::tokio::fs::create_dir_all(&base_dir).await?;

// if !p.contains(".") {
// p + ".jsonl"
// } else {
// p
// }
// }
// _ => {
// spider::tokio::fs::create_dir_all("./storage").await?;
// "./storage/".to_owned()
// + &self
// .inner
// .get_domain()
// .inner()
// .replace("http://", "")
// .replace("https://", "")
// + "jsonl"
// }
// };
// let mut file = spider::tokio::fs::File::create(file).await?;
// // transform data step needed to auto convert type ..
// file.write_all(&self.collected_data).await?;
// Ok(())
// }

/// subscribe and add an event listener.
pub fn subscribe(mut slf: PyRefMut<'_, Self>, on_page_event: PyObject) -> u32 {
let mut rx2 = slf

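The `subscribe` method kept above accepts a Python callable and returns a `u32` handle keyed into `subscription_handles`. A rough sketch of the call from Python, assuming the callback receives each crawled page object (the payload type is not shown in this diff):

```python
from spider_rs import Website

website = Website("https://rsseau.fr")

# Register a listener; the returned integer identifies the subscription
# handle stored on the Rust side.
subscription_id = website.subscribe(lambda page: print(page))

website.crawl()
print("subscription id:", subscription_id)
```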