Skip to content

Commit

Permalink
chore(crate): remove unused
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 28, 2023
1 parent 92d7d06 commit a99780c
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 72 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "spider_rs"
version = "0.0.12"
version = "0.0.13"
description = "The fastest web crawler written in Rust ported to nodejs."
repository = "https://github.com/spider-rs/spider-nodejs"

Expand Down
20 changes: 12 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,6 @@

The [spider](https://github.com/spider-rs/spider) project ported to Python.

Test url: `https://espn.com`

| `libraries` | `pages` | `speed` |
| :----------------------------- | :-------- | :------ |
| **`spider-rs(python): crawl`** | `150,387` | `186s` |
| **`scrapy(python): crawl`** | `49,598` | `1h` |

The benchmarks above were run on a Mac M1; spider on Linux ARM machines performs about 2-10x faster.

## Getting Started
Expand Down Expand Up @@ -39,7 +32,18 @@ Install maturin `pipx install maturin` and python.

## Benchmarks

View [bench](./bench/) to see the results.
View the [benchmarks](./bench/README.md) to see a breakdown between libs and platforms.

Test url: `https://espn.com`

| `libraries` | `pages` | `speed` |
| :-------------------------- | :-------- | :------ |
| **`spider(rust): crawl`** | `150,387` | `1m` |
| **`spider(nodejs): crawl`** | `150,387` | `153s` |
| **`spider(python): crawl`** | `150,387` | `186s` |
| **`scrapy(python): crawl`** | `49,598` | `1h` |

The benchmarks above were run on a Mac M1; spider on Linux ARM machines performs about 2-10x faster.

## Issues

Expand Down
15 changes: 9 additions & 6 deletions bench/scrappy.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import time, scrapy
import time, scrapy, sys
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse

url = len(sys.argv) > 1 and str(sys.argv[1]) or "https://rsseau.fr"
host = urlparse(url).hostname

class MySpider(CrawlSpider):
name = 'rsseau.fr'
allowed_domains = ['rsseau.fr']
start_urls = ['https://rsseau.fr']
name = host
allowed_domains = [host]
start_urls = [url]
links = []
rules = (
Rule(LinkExtractor(), callback='parse_item', follow=True),
Expand All @@ -27,5 +31,4 @@ def parse_item(self, response):
process.crawl(spider)
process.start()
end = time.time()
print("pages found " + str(len(spider.links)))
print("elasped duration " + str(end - start))
print(url, "pages found " + str(len(spider.links)), "elasped duration " + str(end - start) + "ms", sep="\n")
9 changes: 4 additions & 5 deletions bench/spider.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
import asyncio
import asyncio, time, sys
import time

from spider_rs import Website

async def main():
print("benching spider-rs(python)...")
website = Website("https://rsseau.fr")
url = len(sys.argv) > 1 and str(sys.argv[1]) or "https://rsseau.fr"
website = Website(url)
start = time.time()
website.crawl()
end = time.time()
links = website.get_links()
print("pages found " + str(len(links)))
print("elasped duration " + str(end - start))
print(url, "pages found " + str(len(links)), "elasped duration " + str(end - start) + "ms", sep="\n")

asyncio.run(main())
53 changes: 1 addition & 52 deletions src/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use spider::tokio::task::JoinHandle;
use spider::utils::shutdown;
use std::time::Duration;

/// a website holding the inner spider::website::Website from Rust fit for python.
/// A website holding the inner spider::website::Website from Rust fit for python.
#[pyclass]
pub struct Website {
/// the website from spider.
Expand All @@ -18,15 +18,8 @@ pub struct Website {
crawl_handles: IndexMap<u32, JoinHandle<()>>,
/// do not convert content to UTF-8.
raw_content: bool,
/// the data collected.
collected_data: Box<Vec<u8>>,
/// is the crawl running in the background.
running_in_background: bool, // /// the file handle for storing data
// file_handle: Option<spider::tokio::fs::File>,
}

struct PageEvent {
pub page: NPage,
}

#[pymethods]
Expand All @@ -39,7 +32,6 @@ impl Website {
subscription_handles: IndexMap::new(),
crawl_handles: IndexMap::new(),
raw_content: raw_content.unwrap_or_default(),
collected_data: Box::new(Vec::new()),
running_in_background: false, // file_handle: None,
}
}
Expand All @@ -49,49 +41,6 @@ impl Website {
self.inner.get_status().to_string()
}

// /// store data to memory for disk storing. This will create the path if not exist and defaults to ./storage.
// pub async fn export_jsonl_data(&self, export_path: Option<String>) -> std::io::Result<()> {
// use spider::tokio::io::AsyncWriteExt;
// let file = match export_path {
// Some(p) => {
// let base_dir = p
// .split("/")
// .into_iter()
// .map(|f| {
// if f.contains(".") {
// "".to_string()
// } else {
// f.to_string()
// }
// })
// .collect::<String>();

// spider::tokio::fs::create_dir_all(&base_dir).await?;

// if !p.contains(".") {
// p + ".jsonl"
// } else {
// p
// }
// }
// _ => {
// spider::tokio::fs::create_dir_all("./storage").await?;
// "./storage/".to_owned()
// + &self
// .inner
// .get_domain()
// .inner()
// .replace("http://", "")
// .replace("https://", "")
// + "jsonl"
// }
// };
// let mut file = spider::tokio::fs::File::create(file).await?;
// // transform data step needed to auto convert type ..
// file.write_all(&self.collected_data).await?;
// Ok(())
// }

/// subscribe and add an event listener.
pub fn subscribe(mut slf: PyRefMut<'_, Self>, on_page_event: PyObject) -> u32 {
let mut rx2 = slf
Expand Down

0 comments on commit a99780c

Please sign in to comment.