Skip to content

Commit

Permalink
allow three types of urls
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremyandrews committed Sep 17, 2023
1 parent c4eadd2 commit 642dcf9
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 11 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ license = "Apache-2.0"

[dependencies]
goose = { version = "0.17", default-features = false }
http = "0.2"
log = "0.4"
rand = "0.8"
regex = "1.5"
Expand Down
70 changes: 59 additions & 11 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

use goose::goose::GooseResponse;
use goose::prelude::*;
use http::Uri;
use log::info;
use regex::Regex;
use reqwest::header::HeaderMap;
Expand Down Expand Up @@ -679,21 +680,53 @@ pub fn valid_header_value<'a>(headers: &HeaderMap, header: (&'a str, &'a str)) -
}
}

/// Helper to confirm the URI is valid and local.
///
/// Returns `true` when `uri`:
///  - parses as a URI whose host matches the host of the user's `base_url` (compared
///    ASCII case-insensitively, because host names are not case sensitive), or
///  - parses as a URI that has no host (a relative path), or
///  - does not parse as-is, but does parse once a leading `/` is prepended (a relative
///    path written without a leading slash).
///
/// Returns `false` otherwise, including when the URI names a host but the `base_url`
/// has no host to compare it against.
fn valid_local_uri(user: &mut GooseUser, uri: &str) -> bool {
    match uri.parse::<Uri>() {
        Ok(parsed_uri) => match parsed_uri.host() {
            // The URI names a host: it is only local if that host matches the base_url.
            // A base_url without a host can never match, so treat it as not local
            // instead of panicking.
            Some(parsed_host) => user
                .base_url
                .host_str()
                .map_or(false, |base_host| parsed_host.eq_ignore_ascii_case(base_host)),
            // The URI is a valid relative path.
            None => true,
        },
        // The URI did not parse as-is: it is still acceptable if it is a valid relative
        // path that was written without a leading slash.
        Err(_) => format!("/{}", uri).parse::<Uri>().is_ok(),
    }
}

/// Extract all local static elements defined with a `src=` tag from the provided html.
///
/// Every `src="..."` value found in `html` is checked with `valid_local_uri`, and only
/// the values it accepts are returned, in the order they appear in the html.
///
/// While you can invoke this function directly, it's generally preferred to invoke
/// [`validate_and_load_static_assets`] which in turn invokes this function.
pub async fn get_src_elements(user: &mut GooseUser, html: &str) -> Vec<String> {
    // Use a case-insensitive regular expression to find all src=<foo> in the html, where
    // <foo> is any quoted value; whether it is local is decided per match below.
    // @TODO: parse HTML5 srcset= also
    let src_elements = Regex::new(r#"(?i)src="(.*?)""#).unwrap();
    let mut elements: Vec<String> = Vec::new();
    for url in src_elements.captures_iter(html) {
        // Keep each element exactly once, and only if it is valid and local.
        if valid_local_uri(user, &url[1]) {
            elements.push(url[1].to_string());
        }
    }
    elements
}
Expand All @@ -703,15 +736,14 @@ pub async fn get_src_elements(user: &mut GooseUser, html: &str) -> Vec<String> {
/// While you can invoke this function directly, it's generally preferred to invoke
/// [`validate_and_load_static_assets`] which in turn invokes this function.
pub async fn get_css_elements(user: &mut GooseUser, html: &str) -> Vec<String> {
    // Use a case-insensitive regular expression to find all href=<foo> in the html, where
    // <foo> is any quoted value containing `.css`; whether it is local is decided per
    // match below.
    let css = Regex::new(r#"(?i)href="(.*?\.css.*?)""#).unwrap();
    let mut elements: Vec<String> = Vec::new();
    for url in css.captures_iter(html) {
        // Keep each element exactly once, and only if it is valid and local.
        if valid_local_uri(user, &url[1]) {
            elements.push(url[1].to_string());
        }
    }
    elements
}
Expand Down Expand Up @@ -1079,6 +1111,22 @@ mod tests {
eprintln!("actual matches: {:#?}", urls);
}
assert_eq!(urls.len(), 3);
assert_eq!(user.weighted_users_index, usize::max_value());

let urls = get_src_elements(&mut user, HTML).await;
if urls.len() != 6 {
eprintln!(
"expected matches: {:#?}",
vec![
"http://example.com/example.jpg",
"path/to/example.gif",
"/path/to/example.",
"http://example.com/example.js",
"path/to/example.js",
"/path/to/example.js",
]
);
eprintln!("actual matches: {:#?}", urls);
}
assert_eq!(urls.len(), 6);
}
}

0 comments on commit 642dcf9

Please sign in to comment.