diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ed7e21c..f7b05fa 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,3 +27,4 @@ jobs: uses: actions-rs/cargo@v1 with: command: test + args: --all-features diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b10a0f..4458c4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.0.2] - 2021-08-10 +### Added +- `fetch` feature + - `fetch` function to retrieve HTML + - `fetch_partially` to fetch first 10 chunks (of arbitrary size) from the URL + - `fetch_with_limit` same implementation for `fetch_partially` with custom + limit of chunks + ## [0.0.1] - 2021-07-31 ### Added - `LinkPreview` struct implementation diff --git a/Cargo.lock b/Cargo.lock index 3132b1b..7d63378 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -427,7 +427,7 @@ checksum = "320cfe77175da3a483efed4bc0adc1968ca050b098ce4f2f1c13a56626128790" [[package]] name = "link-preview" -version = "0.0.1" +version = "0.0.2" dependencies = [ "reqwest", "scraper", @@ -953,9 +953,9 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.126" +version = "1.0.127" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec7505abeacaec74ae4778d9d9328fe5a5d04253220a85c4ee022239fc996d03" +checksum = "f03b9878abf6d14e6779d3f24f07b2cfa90352cfec4acc5aab8f1ac7f146fae8" [[package]] name = "serde_json" diff --git a/Cargo.toml b/Cargo.toml index a87a235..7a65894 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "link-preview" -version = "0.0.1" +version = "0.0.2" authors = ["Esteban Borai "] edition = "2018" description = "Retrieve website metadata such as title, description, preview image, author and more from OpenGraph, Google, 
Schema.org and Twitter compliant sites" @@ -21,4 +21,5 @@ url = "2.2.2" tokio = { version = "1.9.0", features = ["rt", "macros"] } [features] +# Provide fetch capabilities fetch = ["reqwest"] diff --git a/src/fetch.rs b/src/fetch.rs new file mode 100644 index 0000000..81d03d5 --- /dev/null +++ b/src/fetch.rs @@ -0,0 +1,108 @@ +use reqwest::get; +use scraper::Html; +use std::string::FromUtf8Error; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum Error { + #[error("Failed to fetch {0}. An error occurred: {1}")] + FetchFailed(String, reqwest::Error), + #[error("Failed to parse response from {0}. An error occurred: {1}")] + ParseError(String, reqwest::Error), + #[error("Failed to stream response chunks {0}. An error occurred: {1}")] + StreamError(String, reqwest::Error), + #[error("Failed to parse bytes into UTF-8 while streaming response from {0}")] + InvalidUtf8(String, FromUtf8Error), +} + +/// Fetches the provided URL and retrieves an instance of `Html` +pub async fn fetch(url: &str) -> Result<Html, Error> { + let resp = get(url) + .await + .map_err(|err| Error::FetchFailed(url.to_string(), err))?; + let html = resp + .text() + .await + .map_err(|err| Error::ParseError(url.to_string(), err))?; + + Ok(Html::parse_document(&html)) +} + +/// Fetches the first 10 chunks of the response from the provided URL and retrieves an instance of `Html` +pub async fn fetch_partially(url: &str) -> Result<Html, Error> { + fetch_with_limit(url, 10).await +} + +/// Fetches at most `limit` chunks of the response from the provided URL and retrieves an instance of `Html` +pub async fn fetch_with_limit(url: &str, limit: usize) -> Result<Html, Error> { + let mut laps = 0_usize; + let mut resp = get(url) + .await + .map_err(|err| Error::FetchFailed(url.to_string(), err))?; + let mut bytes: Vec<u8> = Vec::new(); + + while let Some(chunk) = resp + .chunk() + .await + .map_err(|err| Error::StreamError(url.to_string(), err))? 
+ { + if laps >= limit { + break; + } + + let ref mut chunk = chunk.to_vec(); + + bytes.append(chunk); + laps += 1; + } + + let html = String::from_utf8(bytes).map_err(|err| Error::InvalidUtf8(url.to_string(), err))?; + + Ok(Html::parse_document(&html)) +} + +#[cfg(test)] +mod tests { + use crate::tests::REMOTE_FULL_FEATURED_HTML; + use crate::LinkPreview; + + use super::{fetch, fetch_partially, fetch_with_limit}; + + #[tokio::test] + async fn fetches() { + let html = fetch(REMOTE_FULL_FEATURED_HTML).await.unwrap(); + let link_preview = LinkPreview::from(&html); + + assert_eq!( + link_preview.title.unwrap_or(String::default()), + "SEO Strategies for a better web" + ); + assert_eq!(link_preview.description.unwrap_or(String::default()), "John Appleseed tells you his secrets on SEO for a better web experience by taking advantage of OpenGraph\'s Tags!"); + } + + #[tokio::test] + async fn fetches_page_partially() { + let html = fetch_partially(REMOTE_FULL_FEATURED_HTML).await.unwrap(); + let link_preview = LinkPreview::from(&html); + + assert_eq!( + link_preview.title.unwrap_or(String::default()), + "SEO Strategies for a better web" + ); + assert_eq!(link_preview.description.unwrap_or(String::default()), "John Appleseed tells you his secrets on SEO for a better web experience by taking advantage of OpenGraph\'s Tags!"); + } + + #[tokio::test] + async fn fetches_page_with_limit_of_20() { + let html = fetch_with_limit(REMOTE_FULL_FEATURED_HTML, 20) + .await + .unwrap(); + let link_preview = LinkPreview::from(&html); + + assert_eq!( + link_preview.title.unwrap_or(String::default()), + "SEO Strategies for a better web" + ); + assert_eq!(link_preview.description.unwrap_or(String::default()), "John Appleseed tells you his secrets on SEO for a better web experience by taking advantage of OpenGraph\'s Tags!"); + } +} diff --git a/src/lib.rs b/src/lib.rs index 31f72e8..2c7396f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,6 @@ +#[cfg(feature = "fetch")] +pub mod fetch; + 
pub mod html; pub mod og; pub mod preview; @@ -12,4 +15,8 @@ mod tests { pub const OG_COMPILANT_HTML: &[u8] = include_bytes!("../html/og_compilant.html"); pub const SCHEMA_COMPILANT_HTML: &[u8] = include_bytes!("../html/schema_compilant.html"); pub const TWITTER_COMPILANT_HTML: &[u8] = include_bytes!("../html/twitter_compilant.html"); + + #[cfg(feature = "fetch")] + pub const REMOTE_FULL_FEATURED_HTML: &str = + "https://raw.githubusercontent.com/EstebanBorai/link-preview/main/html/full_featured.html"; } diff --git a/src/preview.rs b/src/preview.rs index c35412a..02d8d3a 100644 --- a/src/preview.rs +++ b/src/preview.rs @@ -1,6 +1,3 @@ -#[cfg(feature = "fetch")] -use reqwest::get; - use scraper::Html; use std::str::FromStr; use std::string::FromUtf8Error; @@ -16,6 +13,9 @@ use crate::twitter::{find_twitter_tag, TwitterMetaTag}; pub enum Error { #[error("The provided byte slice contains invalid UTF-8 characters")] InvalidUtf8(FromUtf8Error), + #[cfg(feature = "fetch")] + #[error("Failed to fetch {0}. An error occurred: {1}")] + FailedToFetch(String, reqwest::Error), } #[derive(Debug)] @@ -27,15 +27,6 @@ pub struct LinkPreview { } impl LinkPreview { - /// Fetches the provided URL and retrieves an instance of `LinkPreview` - #[cfg(feature = "fetch")] - pub async fn fetch(url: &str) -> Self { - let resp = get(url).await.unwrap(); - let html = resp.text().await; - - LinkPreview::from_str(html) - } - /// Attempts to find the description of the page in the following order: /// /// - Document's ` element's `href` attribute