chore(website): add subscription example
j-mendez committed Dec 9, 2023
1 parent 3edd41b commit fde4d6e
Showing 9 changed files with 131 additions and 36 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "spider_rs"
version = "0.0.6"
version = "0.0.7"
description = "The fastest web crawler written in Rust ported to nodejs."
repository = "https://github.com/spider-rs/spider-nodejs"

1 change: 1 addition & 0 deletions book/src/SUMMARY.md
@@ -9,5 +9,6 @@

# Configuration

+- [Website](./website.md)
- [Environment](./env.md)

40 changes: 40 additions & 0 deletions book/src/website.md
@@ -0,0 +1,40 @@
# Website

The `Website` class is the foundation of Spider.

## Builder

We use the builder pattern to configure our crawler.

```python
import asyncio

from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com", False).with_headers({ "authorization": "myjwttoken" })

asyncio.run(main())
```

## Subscriptions
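A subscription is any Python callable; `crawl` invokes it once for each page as it is processed.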

```python
import asyncio

from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")

    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))
        # uncomment to perform extra parsing and get the page title
        # print(page.url + " - title: " + page.title())

async def main():
    website = Website("https://choosealicense.com", False)
    website.crawl(Subscription())

asyncio.run(main())
```
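Since the callback receives the page as its only argument, a plain function should work in place of a class — a minimal sketch, assuming any Python callable is accepted:

```python
import asyncio

from spider_rs import Website

def on_page(page):
    # called once per crawled page
    print(page.url + " - status: " + str(page.status_code))

async def main():
    website = Website("https://choosealicense.com", False)
    website.crawl(on_page)

asyncio.run(main())
```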
15 changes: 15 additions & 0 deletions examples/subscription.py
@@ -0,0 +1,15 @@
import asyncio

from spider_rs import Website

class Subscription:
    def __init__(self):
        print("Subscription Created...")

    def __call__(self, page):
        print(page.url + " - status: " + str(page.status_code))

async def main():
    website = Website("https://choosealicense.com", False)
    website.crawl(Subscription())

asyncio.run(main())
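Because the subscription object persists across the crawl, it can also accumulate state. A minimal sketch — the `Collector` class here is illustrative, not part of the library:

```python
import asyncio

from spider_rs import Website

class Collector:
    def __init__(self):
        self.pages = []

    def __call__(self, page):
        # store each page for inspection after the crawl completes
        self.pages.append(page)

async def main():
    website = Website("https://choosealicense.com", False)
    collector = Collector()
    website.crawl(collector)
    print(str(len(collector.pages)) + " pages collected")

asyncio.run(main())
```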
2 changes: 1 addition & 1 deletion examples/website.py
@@ -3,7 +3,7 @@
from spider_rs import Website

 async def main():
-    website = Website("https://choosealicense.com", False).with_headers({ "authorization": "myjwttoken"})
+    website = Website("https://choosealicense.com", False)
     website.crawl()
     print(website.get_links())

2 changes: 1 addition & 1 deletion src/lib.rs
Expand Up @@ -13,7 +13,7 @@ pub mod page;
pub mod shortcut;
pub mod website;

-pub use npage::{page_title, NPage};
+pub use npage::{new_page, page_title, NPage};
pub use nwebsite::NWebsite;
pub use page::Page;
pub use website::Website;
37 changes: 20 additions & 17 deletions src/npage.rs
Expand Up @@ -28,24 +28,27 @@ pub fn page_title(page: NPage) -> String {
page.title()
}

-impl NPage {
-  /// establish a new page
-  pub fn new(res: &spider::page::Page, raw: bool) -> NPage {
-    NPage {
-      url: res.get_url().into(),
-      status_code: res.status_code.as_u16(),
-      content: if raw {
-        Default::default()
-      } else {
-        res.get_html()
-      },
-      raw_content: if raw {
-        Some(res.get_html_bytes_u8().into())
-      } else {
-        None
-      },
-    }
-  }
-}
+/// get a new Page
+pub fn new_page(res: &spider::page::Page, raw: bool) -> NPage {
+  NPage {
+    url: res.get_url().into(),
+    status_code: res.status_code.as_u16(),
+    content: if raw {
+      Default::default()
+    } else {
+      res.get_html()
+    },
+    raw_content: if raw {
+      Some(res.get_html_bytes_u8().into())
+    } else {
+      None
+    },
+  }
+}
 
 #[pymethods]
 impl NPage {
+  fn __call__(&self) {}
 
   /// the html page title.
   pub fn title(&self) -> String {
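In short: the page constructor moves out of the `#[pymethods]` block into a free `new_page` function (re-exported from `lib.rs`), and `NPage` gains an empty `__call__` stub.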
4 changes: 2 additions & 2 deletions src/shortcut.rs
@@ -1,4 +1,4 @@
-use crate::NPage;
+use crate::new_page;
use crate::NWebsite;
use crate::BUFFER;

@@ -13,7 +13,7 @@ pub async fn crawl(url: String, raw_content: Option<bool>) -> NWebsite {

   spider::tokio::spawn(async move {
     while let Ok(res) = rx2.recv().await {
-      if let Err(_) = tx.send(NPage::new(&res, raw_content)).await {
+      if let Err(_) = tx.send(new_page(&res, raw_content)).await {
         println!("receiver dropped");
         return;
       }
64 changes: 50 additions & 14 deletions src/website.rs
@@ -1,5 +1,4 @@
-use crate::NPage;
-use crate::BUFFER;
+use crate::{new_page, NPage, BUFFER};
use compact_str::CompactString;
use indexmap::IndexMap;
use pyo3::prelude::*;
@@ -102,7 +101,7 @@ impl Website {

     let handle = pyo3_asyncio::tokio::get_runtime().spawn(async move {
       while let Ok(res) = rx2.recv().await {
-        let page = NPage::new(&res, raw_content);
+        let page = new_page(&res, raw_content);
         Python::with_gil(|py| {
           let _ = on_page_event.call(py, (page, 0), None);
         });
@@ -201,6 +200,8 @@ impl Website {
       slf.running_in_background = background;
     }
 
+    // todo: cleanup crawl handles
+
     match on_page_event {
       Some(callback) => {
         if background {
@@ -211,9 +212,9 @@ impl Website {

          let handle = spider::tokio::spawn(async move {
            while let Ok(res) = rx2.recv().await {
-             let page = NPage::new(&res, raw_content);
+             let page = new_page(&res, raw_content);
              Python::with_gil(|py| {
-               let _ = callback.call(py, (page, 0), None);
+               let _ = callback.call(py, (page,), None);
              });
            }
          });
@@ -246,8 +247,10 @@ impl Website {

          let handle = pyo3_asyncio::tokio::get_runtime().spawn(async move {
            while let Ok(res) = rx2.recv().await {
+             let page = new_page(&res, raw_content);
+
              Python::with_gil(|py| {
-               let _ = callback.call(py, (NPage::new(&res, raw_content), 0), None);
+               let _ = callback.call(py, (page,), None);
              });
            }
          });
@@ -259,14 +262,29 @@ impl Website {

          slf.subscription_handles.insert(id, handle);
 
-         let _ = pyo3_asyncio::tokio::get_runtime().block_on(async move {
+         let ss = pyo3_asyncio::tokio::get_runtime().block_on(async move {
            if headless {
              slf.inner.crawl().await;
            } else {
              slf.inner.crawl_raw().await;
            }
-           Ok::<(), ()>(())
+
+           Ok::<PyRefMut<'_, Website>, ()>(slf)
          });
+
+         match ss {
+           Ok(mut s) => {
+             let handle = s.subscription_handles.remove(&id);
+
+             match handle {
+               Some(s) => {
+                 s.abort();
+               }
+               _ => (),
+             }
+           }
+           _ => (),
+         }
        }
      }
      _ => {
@@ -326,8 +344,10 @@ impl Website {

          let handle = spider::tokio::spawn(async move {
            while let Ok(res) = rx2.recv().await {
+             let page = new_page(&res, raw_content);
+
              Python::with_gil(|py| {
-               let _ = callback.call(py, (NPage::new(&res, raw_content), 0), None);
+               let _ = callback.call(py, (page,), None);
              });
            }
          });
@@ -360,8 +380,10 @@ impl Website {

          let handle = pyo3_asyncio::tokio::get_runtime().spawn(async move {
            while let Ok(res) = rx2.recv().await {
+             let page = new_page(&res, raw_content);
+
              Python::with_gil(|py| {
-               let _ = callback.call(py, (NPage::new(&res, raw_content), 0), None);
+               let _ = callback.call(py, (page,), None);
              });
            }
          });
@@ -373,14 +395,28 @@ impl Website {

          slf.subscription_handles.insert(id, handle);
 
-         let _ = pyo3_asyncio::tokio::get_runtime().block_on(async move {
+         let ss = pyo3_asyncio::tokio::get_runtime().block_on(async move {
            if headless {
              slf.inner.scrape().await;
            } else {
              slf.inner.scrape_raw().await;
            }
-           Ok::<(), ()>(())
+           Ok::<PyRefMut<'_, Website>, ()>(slf)
          });
+
+         match ss {
+           Ok(mut s) => {
+             let handle = s.subscription_handles.remove(&id);
+
+             match handle {
+               Some(s) => {
+                 s.abort();
+               }
+               _ => (),
+             }
+           }
+           _ => (),
+         }
        }
      }
      _ => {
@@ -428,7 +464,7 @@ impl Website {
      let handler = spider::tokio::spawn(async move {
        while let Ok(res) = rx2.recv().await {
          Python::with_gil(|py| {
-           let _ = callback.call(py, (NPage::new(&res, raw_content), 0), None);
+           let _ = callback.call(py, (new_page(&res, raw_content),), None);
          });
        }
      });
@@ -497,7 +533,7 @@ impl Website {
    match self.inner.get_pages() {
      Some(p) => {
        for page in p.iter() {
-         pages.push(NPage::new(page, raw_content));
+         pages.push(new_page(page, raw_content));
        }
      }
      _ => (),
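Taken together, the `src/website.rs` changes do two things: subscription callbacks are now invoked with the page as their only argument (previously `(page, 0)`), and the blocking `crawl`/`scrape` paths return the `PyRefMut` from `block_on` so the matching subscription task can be looked up and aborted once the crawl finishes, rather than being left running.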
