From 9c1e3e32e27eeeba5946b6ee82468242648aa74c Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Fri, 8 Dec 2023 11:41:45 -0500
Subject: [PATCH] chore(website): add main builder methods

---
 src/shortcut.rs |   2 +-
 src/website.rs  | 388 +++++++++++++++++++++++-------------------------
 2 files changed, 190 insertions(+), 200 deletions(-)

diff --git a/src/shortcut.rs b/src/shortcut.rs
index 77c3df0..4873186 100644
--- a/src/shortcut.rs
+++ b/src/shortcut.rs
@@ -33,4 +33,4 @@ pub async fn crawl(url: String, raw_content: Option<bool>) -> NWebsite {
   let links = pages.iter().map(|x| x.url.clone()).collect::<Vec<String>>();

   NWebsite { links, pages }
-}
\ No newline at end of file
+}

diff --git a/src/website.rs b/src/website.rs
index fa40d98..be5c9bd 100644
--- a/src/website.rs
+++ b/src/website.rs
@@ -1,24 +1,24 @@
-use pyo3::prelude::*;
 use crate::NPage;
 use compact_str::CompactString;
 use indexmap::IndexMap;
-use spider::utils::shutdown;
+use pyo3::prelude::*;
 use spider::tokio::task::JoinHandle;
+use spider::utils::shutdown;
 use std::time::Duration;

 #[pyfunction]
-fn crawl(py: Python, url: String, raw_content: Option<bool>) -> PyResult<&PyAny> {
-    pyo3_asyncio::tokio::future_into_py(py, async move {
-        let w = crate::crawl(url, raw_content).await;
+fn crawl(py: Python, url: String, raw_content: Option<bool>) -> PyResult<&PyAny> {
+  pyo3_asyncio::tokio::future_into_py(py, async move {
+    let w = crate::crawl(url, raw_content).await;

-        Ok(w)
+    Ok(w)
   })
 }

 #[pymodule]
 fn spider_rs(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
-    m.add_function(wrap_pyfunction!(crawl, m)?)?;
-    Ok(())
+  m.add_function(wrap_pyfunction!(crawl, m)?)?;
+  Ok(())
 }

 /// a website holding the inner spider::website::Website from Rust fit for python.
@@ -35,9 +35,8 @@ pub struct Website {
   /// the data collected.
   collected_data: Box<Vec<u8>>,
   /// is the crawl running in the background.
-  running_in_background: bool
-  // /// the file handle for storing data
-  // file_handle: Option,
+  running_in_background: bool, // /// the file handle for storing data
+  // file_handle: Option,
 }

 struct PageEvent {
@@ -54,8 +53,7 @@ impl Website {
       crawl_handles: IndexMap::new(),
       raw_content: raw_content.unwrap_or_default(),
       collected_data: Box::new(Vec::new()),
-      running_in_background: false
-      // file_handle: None,
+      running_in_background: false, // file_handle: None,
     }
   }

@@ -63,7 +61,7 @@ impl Website {
   pub fn status(&self) -> String {
     self.inner.get_status().to_string()
   }
-
+
   // /// store data to memory for disk storing. This will create the path if not exist and defaults to ./storage.
   // pub async fn export_jsonl_data(&self, export_path: Option) -> std::io::Result<()> {
   //   use spider::tokio::io::AsyncWriteExt;
   //   let file = match export_path {
   //     Some(p) => {
@@ -107,7 +105,6 @@ impl Website {
   //   Ok(())
   // }
-
   // /// subscribe and add an event listener.
   // pub fn subscribe(
   //   &mut self,
@@ -139,7 +136,6 @@ impl Website {
   //   id
   // }
-
   // /// remove a subscription listener.
  // pub fn unsubscribe(&mut self, id: Option) -> bool {
   //   match id {
@@ -166,7 +162,6 @@ impl Website {
   //   }
   // }
-
   // /// stop a crawl
   // pub async unsafe fn stop(&mut self, id: Option) -> bool {
   //   self.inner.stop();
@@ -201,7 +196,6 @@ impl Website {
   //   }
   // }
-
   // /// crawl a website
   // pub async unsafe fn crawl(
   //   &mut self,
@@ -213,7 +207,7 @@ impl Website {
   //   let background = background.is_some() && background.unwrap_or_default();
   //   let headless = headless.is_some() && headless.unwrap_or_default();
   //   // let raw_content = self.raw_content;
-
+
   //   if background {
   //     self.running_in_background = background;
   //   }
@@ -338,7 +332,6 @@ impl Website {
   //   // }
   // }
-
   // /// scrape a website
   // pub async unsafe fn scrape(
   //   &mut self,
@@ -506,62 +499,57 @@ impl Website {
   //   Cron { inner, cron_handle }
   // }
-
-  // /// get all the links of a website
-  // pub fn get_links(&self) -> Vec {
-  //   let links = self
-  //     .inner
-  //     .get_links()
-  //     .iter()
-  //     .map(|x| x.as_ref().to_string())
-  //     .collect::>();
-  //   links
-  // }
-
-  // /// get the size of the website in amount of pages crawled. If you ran the page in the background, this value will not update.
-  // pub fn size(&mut self) -> u32 {
-  //   self.inner.size() as u32
-  // }
+  /// get all the links of a website
+  pub fn get_links(&self) -> Vec<String> {
+    let links = self
+      .inner
+      .get_links()
+      .iter()
+      .map(|x| x.as_ref().to_string())
+      .collect::<Vec<String>>();
+    links
+  }

-  // /// get all the pages of a website - requires calling website.scrape
-
-  // pub fn get_pages(&self) -> Vec {
-  //   let mut pages: Vec = Vec::new();
-  //   let raw_content = self.raw_content;
+  /// get the size of the website in amount of pages crawled. If you ran the page in the background, this value will not update.
+  pub fn size(&mut self) -> u32 {
+    self.inner.size() as u32
+  }

-  //   match self.inner.get_pages() {
-  //     Some(p) => {
-  //       for page in p.iter() {
-  //         pages.push(NPage::new(page, raw_content));
-  //       }
-  //     }
-  //     _ => (),
-  //   }
+  /// get all the pages of a website - requires calling website.scrape
+  pub fn get_pages(&self) -> Vec<NPage> {
+    let mut pages: Vec<NPage> = Vec::new();
+    let raw_content = self.raw_content;
+
+    match self.inner.get_pages() {
+      Some(p) => {
+        for page in p.iter() {
+          pages.push(NPage::new(page, raw_content));
+        }
+      }
+      _ => (),
+    }

-  //   pages
-  // }
+    pages
+  }

-
-  // /// drain all links from storing
-  // pub fn drain_links(&mut self) -> Vec {
-  //   let links = self
-  //     .inner
-  //     .drain_links()
-  //     .map(|x| x.as_ref().to_string())
-  //     .collect::>();
+  /// drain all links from storing
+  pub fn drain_links(&mut self) -> Vec<String> {
+    let links = self
+      .inner
+      .drain_links()
+      .map(|x| x.as_ref().to_string())
+      .collect::<Vec<String>>();

-  //   links
-  // }
+    links
+  }

-
-  // /// clear all links and page data
-  // pub fn clear(&mut self) {
-  //   self.inner.clear();
-  // }
+  /// clear all links and page data
+  pub fn clear(&mut self) {
+    self.inner.clear();
+  }

-  // /// Set HTTP headers for request using [reqwest::header::HeaderMap](https://docs.rs/reqwest/latest/reqwest/header/struct.HeaderMap.html).
-  // pub fn with_headers(mut slf: PyRefMut<'_, Self>, headers: Option) -> PyRefMut<'_, Self> {
   //   use std::str::FromStr;
   //   match headers {
@@ -589,72 +577,69 @@ impl Website {
   //       _ => (),
   //     }
-  //     self.inner.with_headers(Some(h));
+  //     slf.inner.with_headers(Some(h));
   //   }
   //   _ => {
-  //     self.inner.with_headers(None);
+  //     slf.inner.with_headers(None);
   //   }
   // };
-  //   self
+  //   slf
   // }

-  // /// Add user agent to request.
-
-  // pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &Self {
-  //   self.inner.configuration.with_user_agent(user_agent);
-  //   self
-  // }
+  /// Add user agent to request.
+  pub fn with_user_agent(mut slf: PyRefMut<'_, Self>, user_agent: Option<String>) -> PyRefMut<'_, Self> {
+    slf.inner.configuration.with_user_agent(user_agent.as_deref());
+    slf
+  }

-  // /// Respect robots.txt file.
-
-  // pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &Self {
-  //   self
-  //     .inner
-  //     .configuration
-  //     .with_respect_robots_txt(respect_robots_txt);
-  //   self
-  // }
+  /// Respect robots.txt file.
+  pub fn with_respect_robots_txt(mut slf: PyRefMut<'_, Self>, respect_robots_txt: bool) -> PyRefMut<'_, Self> {
+    slf
+      .inner
+      .configuration
+      .with_respect_robots_txt(respect_robots_txt);
+    slf
+  }

-  // /// Include subdomains detection.
-
-  // pub fn with_subdomains(&mut self, subdomains: bool) -> &Self {
-  //   self.inner.configuration.with_subdomains(subdomains);
-  //   self
-  // }
+  /// Include subdomains detection.
+  pub fn with_subdomains(mut slf: PyRefMut<'_, Self>, subdomains: bool) -> PyRefMut<'_, Self> {
+    slf.inner.configuration.with_subdomains(subdomains);
+    slf
+  }

-  // /// Include tld detection.
-
-  // pub fn with_tld(&mut self, tld: bool) -> &Self {
-  //   self.inner.configuration.with_tld(tld);
-  //   self
-  // }
+  /// Include tld detection.
+  pub fn with_tld(mut slf: PyRefMut<'_, Self>, tld: bool) -> PyRefMut<'_, Self> {
+    slf.inner.configuration.with_tld(tld);
+    slf
+  }

-  // /// Only use HTTP/2.
-
-  // pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &Self {
-  //   self
-  //     .inner
-  //     .configuration
-  //     .with_http2_prior_knowledge(http2_prior_knowledge);
-  //   self
-  // }
+  /// Only use HTTP/2.
+  pub fn with_http2_prior_knowledge(mut slf: PyRefMut<'_, Self>, http2_prior_knowledge: bool) -> PyRefMut<'_, Self> {
+    slf
+      .inner
+      .configuration
+      .with_http2_prior_knowledge(http2_prior_knowledge);
+    slf
+  }

-  // /// Max time to wait for request duration to milliseconds.
-
-  // pub fn with_request_timeout(&mut self, request_timeout: Option) -> &Self {
-  //   self
-  //     .inner
-  //     .configuration
-  //     .with_request_timeout(match request_timeout {
-  //       Some(d) => Some(Duration::from_millis(d.into())),
-  //       _ => None,
-  //     });
-  //   self
-  // }
+  /// Max time to wait for request duration to milliseconds.
+  pub fn with_request_timeout(mut slf: PyRefMut<'_, Self>, request_timeout: Option<u32>) -> PyRefMut<'_, Self> {
+    slf
+      .inner
+      .configuration
+      .with_request_timeout(match request_timeout {
+        Some(d) => Some(Duration::from_millis(d.into())),
+        _ => None,
+      });
+    slf
+  }

   /// add external domains
-  pub fn with_external_domains(mut slf: PyRefMut<'_, Self>, external_domains: Option<Vec<String>>) -> PyRefMut<'_, Self> {
+  pub fn with_external_domains(
+    mut slf: PyRefMut<'_, Self>,
+    external_domains: Option<Vec<String>>,
+  ) -> PyRefMut<'_, Self> {
     slf.inner.with_external_domains(match external_domains {
       Some(ext) => Some(ext.into_iter()),
       _ => None,
     });
     slf
   }

-
-  // /// Set the crawling budget
-  // pub fn with_budget(&mut self, budget: Option>) -> &Self {
-  //   use spider::hashbrown::hash_map::HashMap;
-
-  //   match budget {
-  //     Some(d) => {
-  //       let v = d
-  //         .iter()
-  //         .map(|e| e.0.to_owned() + "," + &e.1.to_string())
-  //         .collect::();
-  //       let v = v
-  //         .split(",")
-  //         .collect::>()
-  //         .chunks(2)
-  //         .map(|x| (x[0], x[1].parse::().unwrap_or_default()))
-  //         .collect::>();
-
-  //       self.inner.with_budget(Some(v));
-  //     }
-  //     _ => (),
-  //   }
+  /// Set the crawling budget
+  pub fn with_budget(
+    mut slf: PyRefMut<'_, Self>,
+    budget: Option<std::collections::HashMap<String, u32>>,
+  ) -> PyRefMut<'_, Self> {
+    use spider::hashbrown::hash_map::HashMap;
+
+    match budget {
+      Some(d) => {
+        let v = d
+          .iter()
+          .map(|e| e.0.to_owned() + "," + &e.1.to_string())
+          .collect::<String>();
+        let v = v
+          .split(",")
+          .collect::<Vec<&str>>()
+          .chunks(2)
+          .map(|x| (x[0], x[1].parse::<u32>().unwrap_or_default()))
+          .collect::<HashMap<&str, u32>>();
+
+        slf.inner.with_budget(Some(v));
+      }
+      _ => (),
+    }

-  //   self
-  // }
+    slf
+  }

-
-  // /// Regex black list urls from the crawl
-  // pub fn with_blacklist_url(&mut self, blacklist_url: Option>) -> &Self {
-  //   self
-  //     .inner
-  //     .configuration
-  //     .with_blacklist_url(match blacklist_url {
-  //       Some(v) => {
-  //         let mut blacklist: Vec = Vec::new();
-  //         for item in v {
-  //           blacklist.push(CompactString::new(item));
-  //         }
-  //         Some(blacklist)
-  //       }
-  //       _ => None,
-  //     });
+  /// Regex black list urls from the crawl
+  pub fn with_blacklist_url(
+    mut slf: PyRefMut<'_, Self>,
+    blacklist_url: Option<Vec<String>>,
+  ) -> PyRefMut<'_, Self> {
+    slf
+      .inner
+      .configuration
+      .with_blacklist_url(match blacklist_url {
+        Some(v) => {
+          let mut blacklist: Vec<CompactString> = Vec::new();
+          for item in v {
+            blacklist.push(CompactString::new(item));
+          }
+          Some(blacklist)
+        }
+        _ => None,
+      });

-  //   self
-  // }
+    slf
+  }

-  // /// Setup cron jobs to run
-
-  // pub fn with_cron(&mut self, cron_str: String, cron_type: Option) -> &Self {
-  //   self.inner.with_cron(
-  //     cron_str.as_str(),
-  //     if cron_type.unwrap_or_default() == "scrape" {
-  //       spider::website::CronType::Scrape
-  //     } else {
-  //       spider::website::CronType::Crawl
-  //     },
-  //   );
-  //   self
-  // }
+  /// Setup cron jobs to run
+  pub fn with_cron(
+    mut slf: PyRefMut<'_, Self>,
+    cron_str: String,
+    cron_type: Option<String>,
+  ) -> PyRefMut<'_, Self> {
+    slf.inner.with_cron(
+      cron_str.as_str(),
+      if cron_type.unwrap_or_default() == "scrape" {
+        spider::website::CronType::Scrape
+      } else {
+        spider::website::CronType::Crawl
+      },
+    );
+    slf
+  }

-  // /// Delay between request as ms.
-
-  // pub fn with_delay(&mut self, delay: u32) -> &Self {
-  //   self.inner.configuration.with_delay(delay.into());
-  //   self
-  // }
+  /// Delay between request as ms.
+  pub fn with_delay(mut slf: PyRefMut<'_, Self>, delay: u32) -> PyRefMut<'_, Self> {
+    slf.inner.configuration.with_delay(delay.into());
+    slf
+  }

-  // /// Use proxies for request.
-
-  // pub fn with_proxies(&mut self, proxies: Option>) -> &Self {
-  //   self.inner.configuration.with_proxies(proxies);
-  //   self
-  // }
+  /// Use proxies for request.
+  pub fn with_proxies(
+    mut slf: PyRefMut<'_, Self>,
+    proxies: Option<Vec<String>>,
+  ) -> PyRefMut<'_, Self> {
+    slf.inner.configuration.with_proxies(proxies);
+    slf
+  }

-
-  // /// build the inner website - not required for all builder_steps
-  // pub fn build(&mut self) -> &Self {
-  //   match self.inner.build() {
-  //     Ok(w) => self.inner = w,
-  //     _ => (),
-  //   }
-  //   self
-  // }
+  /// build the inner website - not required for all builder_steps
+  pub fn build(mut slf: PyRefMut<'_, Self>) -> PyRefMut<'_, Self> {
+    match slf.inner.build() {
+      Ok(w) => slf.inner = w,
+      _ => (),
+    }
+    slf
+  }
 }

 /// a runner for handling crons
-
 pub struct Cron {
   /// the runner task
   inner: spider::async_job::Runner,
   /// the runner handle
   cron_handle: Option<JoinHandle<()>>,
 }

-
 impl Cron {
   /// stop the cron instance
-
+
   pub async unsafe fn stop(&mut self) {
     self.inner.stop().await;
     match &self.cron_handle {
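
The with_* methods added in this patch all use the same pyo3 chainable-builder shape: they take `mut slf: PyRefMut<'_, Self>` and return it, so the Python caller keeps chaining on one Website object instead of receiving copies. A minimal standalone sketch of that pattern under the same pyo3 API (the Config type and its fields here are illustrative, not part of the patch):

use pyo3::prelude::*;

#[pyclass]
struct Config {
  subdomains: bool,
  tld: bool,
}

#[pymethods]
impl Config {
  #[new]
  fn new() -> Self {
    Config { subdomains: false, tld: false }
  }

  /// Returning the PyRefMut hands the same Python object back, which enables chaining.
  fn with_subdomains(mut slf: PyRefMut<'_, Self>, subdomains: bool) -> PyRefMut<'_, Self> {
    slf.subdomains = subdomains;
    slf
  }

  fn with_tld(mut slf: PyRefMut<'_, Self>, tld: bool) -> PyRefMut<'_, Self> {
    slf.tld = tld;
    slf
  }
}

From Python this reads as `Config().with_subdomains(True).with_tld(True)`, which is the call style the Website builder methods above are aiming for.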
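The with_budget method above flattens the incoming map into a comma-joined string and re-parses it into the `HashMap<&str, u32>` that spider's with_budget receives. A shorter equivalent of that conversion, shown only for comparison and assuming the budget arrives keyed by path with a u32 limit (the helper name and the IndexMap parameter type are assumptions, not from the patch):

use indexmap::IndexMap;
use spider::hashbrown::hash_map::HashMap;

/// Borrow each path as &str and keep its limit, e.g. {"*": 100, "/docs": 20}.
fn budget_pairs(budget: &IndexMap<String, u32>) -> HashMap<&str, u32> {
  budget
    .iter()
    .map(|(path, limit)| (path.as_str(), *limit))
    .collect()
}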
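The #[pymodule] in the first hunk only registers the crawl shortcut; a #[pyclass] like Website is exposed to Python with add_class in the same module function. A sketch of that wiring, assuming it lives elsewhere in the crate rather than in this patch:

#[pymodule]
fn spider_rs(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
  m.add_function(wrap_pyfunction!(crawl, m)?)?;
  // classes are registered alongside the module-level functions
  m.add_class::<Website>()?;
  Ok(())
}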