From fde4d6e610aeb3b1478f8bfe5a45e7581c765daf Mon Sep 17 00:00:00 2001 From: j-mendez Date: Sat, 9 Dec 2023 09:46:28 -0500 Subject: [PATCH] chore(website): add subscription example --- Cargo.toml | 2 +- book/src/SUMMARY.md | 1 + book/src/website.md | 40 +++++++++++++++++++++++++ examples/subscription.py | 15 ++++++++++ examples/website.py | 2 +- src/lib.rs | 2 +- src/npage.rs | 37 ++++++++++++----------- src/shortcut.rs | 4 +-- src/website.rs | 64 +++++++++++++++++++++++++++++++--------- 9 files changed, 131 insertions(+), 36 deletions(-) create mode 100644 book/src/website.md create mode 100644 examples/subscription.py diff --git a/Cargo.toml b/Cargo.toml index a8504b2..6cfd49f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] edition = "2021" name = "spider_rs" -version = "0.0.6" +version = "0.0.7" description = "The fastest web crawler written in Rust ported to nodejs." repository = "https://github.com/spider-rs/spider-nodejs" diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md index f45e37d..fea8586 100644 --- a/book/src/SUMMARY.md +++ b/book/src/SUMMARY.md @@ -9,5 +9,6 @@ # Configuration +- [Website](./website.md) - [Environment](./env.md) diff --git a/book/src/website.md b/book/src/website.md new file mode 100644 index 0000000..a06ee24 --- /dev/null +++ b/book/src/website.md @@ -0,0 +1,40 @@ +# Website + +The website class is the foundation of Spider. + +## Builder + +We use the builder pattern to configure our crawler. 
+ +```python +import asyncio + +from spider_rs import Website + +async def main(): + website = Website("https://choosealicense.com", False).with_headers({ "authorization": "myjwttoken" }) + +asyncio.run(main()) +``` + +## Subscriptions + +```python +import asyncio + +from spider_rs import Website + +class Subscription: + def __init__(self): + print("Subscription Created...") + def __call__(self, page): + print(page.url + " - status: " + str(page.status_code)) + # uncomment to perform extra parsing and get the page title + # print(page.url + " - title: " + page.title()) + +async def main(): + website = Website("https://choosealicense.com", False) + website.crawl(Subscription()) + +asyncio.run(main()) +``` \ No newline at end of file diff --git a/examples/subscription.py b/examples/subscription.py new file mode 100644 index 0000000..8f28986 --- /dev/null +++ b/examples/subscription.py @@ -0,0 +1,15 @@ +import asyncio + +from spider_rs import Website + +class Subscription: + def __init__(self): + print("Subscription Created...") + def __call__(self, page): + print(page.url + " - status: " + str(page.status_code)) + +async def main(): + website = Website("https://choosealicense.com", False) + website.crawl(Subscription()) + +asyncio.run(main()) \ No newline at end of file diff --git a/examples/website.py b/examples/website.py index dc9e577..0ceca56 100644 --- a/examples/website.py +++ b/examples/website.py @@ -3,7 +3,7 @@ from spider_rs import Website async def main(): - website = Website("https://choosealicense.com", False).with_headers({ "authorization": "myjwttoken"}) + website = Website("https://choosealicense.com", False) website.crawl() print(website.get_links()) diff --git a/src/lib.rs b/src/lib.rs index 362949e..6f1f2ab 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,7 +13,7 @@ pub mod page; pub mod shortcut; pub mod website; -pub use npage::{page_title, NPage}; +pub use npage::{new_page, page_title, NPage}; pub use nwebsite::NWebsite; pub use page::Page; pub 
use website::Website; diff --git a/src/npage.rs b/src/npage.rs index 3793a43..5e24beb 100644 --- a/src/npage.rs +++ b/src/npage.rs @@ -28,24 +28,27 @@ pub fn page_title(page: NPage) -> String { page.title() } -impl NPage { - /// establish a new page - pub fn new(res: &spider::page::Page, raw: bool) -> NPage { - NPage { - url: res.get_url().into(), - status_code: res.status_code.as_u16(), - content: if raw { - Default::default() - } else { - res.get_html() - }, - raw_content: if raw { - Some(res.get_html_bytes_u8().into()) - } else { - None - }, - } +/// get a new Page +pub fn new_page(res: &spider::page::Page, raw: bool) -> NPage { + NPage { + url: res.get_url().into(), + status_code: res.status_code.as_u16(), + content: if raw { + Default::default() + } else { + res.get_html() + }, + raw_content: if raw { + Some(res.get_html_bytes_u8().into()) + } else { + None + }, } +} + +#[pymethods] +impl NPage { + fn __call__(&self) {} /// the html page title. pub fn title(&self) -> String { diff --git a/src/shortcut.rs b/src/shortcut.rs index 4873186..11f3aa9 100644 --- a/src/shortcut.rs +++ b/src/shortcut.rs @@ -1,4 +1,4 @@ -use crate::NPage; +use crate::new_page; use crate::NWebsite; use crate::BUFFER; @@ -13,7 +13,7 @@ pub async fn crawl(url: String, raw_content: Option) -> NWebsite { spider::tokio::spawn(async move { while let Ok(res) = rx2.recv().await { - if let Err(_) = tx.send(NPage::new(&res, raw_content)).await { + if let Err(_) = tx.send(new_page(&res, raw_content)).await { println!("receiver dropped"); return; } diff --git a/src/website.rs b/src/website.rs index c2d31c0..06bfc9e 100644 --- a/src/website.rs +++ b/src/website.rs @@ -1,5 +1,4 @@ -use crate::NPage; -use crate::BUFFER; +use crate::{new_page, NPage, BUFFER}; use compact_str::CompactString; use indexmap::IndexMap; use pyo3::prelude::*; @@ -102,7 +101,7 @@ impl Website { let handle = pyo3_asyncio::tokio::get_runtime().spawn(async move { while let Ok(res) = rx2.recv().await { - let page = NPage::new(&res, 
raw_content); + let page = new_page(&res, raw_content); Python::with_gil(|py| { let _ = on_page_event.call(py, (page, 0), None); }); @@ -201,6 +200,8 @@ impl Website { slf.running_in_background = background; } + // todo: cleanup crawl handles + match on_page_event { Some(callback) => { if background { @@ -211,9 +212,9 @@ impl Website { let handle = spider::tokio::spawn(async move { while let Ok(res) = rx2.recv().await { - let page = NPage::new(&res, raw_content); + let page = new_page(&res, raw_content); Python::with_gil(|py| { - let _ = callback.call(py, (page, 0), None); + let _ = callback.call(py, (page,), None); }); } }); @@ -246,8 +247,10 @@ impl Website { let handle = pyo3_asyncio::tokio::get_runtime().spawn(async move { while let Ok(res) = rx2.recv().await { + let page = new_page(&res, raw_content); + Python::with_gil(|py| { - let _ = callback.call(py, (NPage::new(&res, raw_content), 0), None); + let _ = callback.call(py, (page,), None); }); } }); @@ -259,14 +262,29 @@ impl Website { slf.subscription_handles.insert(id, handle); - let _ = pyo3_asyncio::tokio::get_runtime().block_on(async move { + let ss = pyo3_asyncio::tokio::get_runtime().block_on(async move { if headless { slf.inner.crawl().await; } else { slf.inner.crawl_raw().await; } - Ok::<(), ()>(()) + + Ok::, ()>(slf) }); + + match ss { + Ok(mut s) => { + let handle = s.subscription_handles.remove(&id); + + match handle { + Some(s) => { + s.abort(); + } + _ => (), + } + } + _ => (), + } } } _ => { @@ -326,8 +344,10 @@ impl Website { let handle = spider::tokio::spawn(async move { while let Ok(res) = rx2.recv().await { + let page = new_page(&res, raw_content); + Python::with_gil(|py| { - let _ = callback.call(py, (NPage::new(&res, raw_content), 0), None); + let _ = callback.call(py, (page,), None); }); } }); @@ -360,8 +380,10 @@ impl Website { let handle = pyo3_asyncio::tokio::get_runtime().spawn(async move { while let Ok(res) = rx2.recv().await { + let page = new_page(&res, raw_content); + 
Python::with_gil(|py| { - let _ = callback.call(py, (NPage::new(&res, raw_content), 0), None); + let _ = callback.call(py, (page,), None); }); } }); @@ -373,14 +395,28 @@ impl Website { slf.subscription_handles.insert(id, handle); - let _ = pyo3_asyncio::tokio::get_runtime().block_on(async move { + let ss = pyo3_asyncio::tokio::get_runtime().block_on(async move { if headless { slf.inner.scrape().await; } else { slf.inner.scrape_raw().await; } - Ok::<(), ()>(()) + Ok::, ()>(slf) }); + + match ss { + Ok(mut s) => { + let handle = s.subscription_handles.remove(&id); + + match handle { + Some(s) => { + s.abort(); + } + _ => (), + } + } + _ => (), + } } } _ => { @@ -428,7 +464,7 @@ impl Website { let handler = spider::tokio::spawn(async move { while let Ok(res) = rx2.recv().await { Python::with_gil(|py| { - let _ = callback.call(py, (NPage::new(&res, raw_content), 0), None); + let _ = callback.call(py, (new_page(&res, raw_content),), None); }); } }); @@ -497,7 +533,7 @@ impl Website { match self.inner.get_pages() { Some(p) => { for page in p.iter() { - pages.push(NPage::new(page, raw_content)); + pages.push(new_page(page, raw_content)); } } _ => (),