From 50ebd7f5d9b7890ae1ded5bbf9fb5d677cd3a23f Mon Sep 17 00:00:00 2001 From: Don Isaac Date: Wed, 22 May 2024 15:42:29 -0400 Subject: [PATCH 1/3] perf: use NonZeroU32 for query and fragment start --- url/src/lib.rs | 154 +++++++++++++++++++++++++++++---------- url/src/parser.rs | 19 +++-- url/src/path_segments.rs | 6 +- url/src/slicing.rs | 14 ++-- 4 files changed, 136 insertions(+), 57 deletions(-) diff --git a/url/src/lib.rs b/url/src/lib.rs index 1959a1213..891f55b56 100644 --- a/url/src/lib.rs +++ b/url/src/lib.rs @@ -160,6 +160,7 @@ use std::mem; use std::net::IpAddr; #[cfg(any(unix, windows, target_os = "redox", target_os = "wasi"))] use std::net::{SocketAddr, ToSocketAddrs}; +use std::num::NonZeroU32; use std::ops::{Range, RangeFrom, RangeTo}; use std::path::{Path, PathBuf}; use std::str; @@ -203,9 +204,9 @@ pub struct Url { host_end: u32, host: HostInternal, port: Option, - path_start: u32, // Before initial '/', if any - query_start: Option, // Before '?', unlike Position::QueryStart - fragment_start: Option, // Before '#', unlike Position::FragmentStart + path_start: u32, // Before initial '/', if any + query_start: Option, // Before '?', unlike Position::QueryStart + fragment_start: Option, // Before '#', unlike Position::FragmentStart } /// Full configuration for the URL parser. @@ -732,11 +733,11 @@ impl Url { } } if let Some(start) = self.query_start { - assert!(start >= self.path_start); + assert!(start.get() >= self.path_start); assert_eq!(self.byte_at(start), b'?'); } if let Some(start) = self.fragment_start { - assert!(start >= self.path_start); + assert!(start.get() >= self.path_start); assert_eq!(self.byte_at(start), b'#'); } if let (Some(query_start), Some(fragment_start)) = (self.query_start, self.fragment_start) { @@ -1332,7 +1333,7 @@ impl Url { match (self.query_start, self.fragment_start) { (None, None) => self.slice(self.path_start..), (Some(next_component_start), _) | (None, Some(next_component_start)) => { - self.slice(self.path_start..next_component_start) + self.slice(self.path_start..next_component_start.get()) } } } @@ -1409,11 +1410,11 @@ impl Url { (None, _) => None, (Some(query_start), None) => { debug_assert!(self.byte_at(query_start) == b'?'); - Some(self.slice(query_start + 1..)) + Some(self.slice(query_start.saturating_add(1)..)) } (Some(query_start), Some(fragment_start)) => { debug_assert!(self.byte_at(query_start) == b'?'); - Some(self.slice(query_start + 1..fragment_start)) + Some(self.slice(query_start.saturating_add(1)..fragment_start)) } } } @@ -1482,7 +1483,7 @@ impl Url { pub fn fragment(&self) -> Option<&str> { self.fragment_start.map(|start| { debug_assert!(self.byte_at(start) == b'#'); - self.slice(start + 1..) + self.slice(start.saturating_add(1)..) }) } @@ -1520,11 +1521,12 @@ impl Url { // Remove any previous fragment if let Some(start) = self.fragment_start { debug_assert!(self.byte_at(start) == b'#'); - self.serialization.truncate(start as usize); + self.truncate(start); } // Write the new one if let Some(input) = fragment { - self.fragment_start = Some(to_u32(self.serialization.len()).unwrap()); + self.fragment_start = + Some(NonZeroU32::new(to_u32(self.serialization.len()).unwrap()).unwrap()); self.serialization.push('#'); self.mutate(|parser| parser.parse_fragment(parser::Input::new_no_trim(input))) } else { @@ -1536,8 +1538,8 @@ impl Url { fn take_fragment(&mut self) -> Option { self.fragment_start.take().map(|start| { debug_assert!(self.byte_at(start) == b'#'); - let fragment = self.slice(start + 1..).to_owned(); - self.serialization.truncate(start as usize); + let fragment = self.slice(start.saturating_add(1)..).to_owned(); + self.truncate(start); fragment }) } @@ -1545,7 +1547,8 @@ impl Url { fn restore_already_parsed_fragment(&mut self, fragment: Option) { if let Some(ref fragment) = fragment { assert!(self.fragment_start.is_none()); - self.fragment_start = Some(to_u32(self.serialization.len()).unwrap()); + self.fragment_start = + Some(NonZeroU32::new(to_u32(self.serialization.len()).unwrap()).unwrap()); self.serialization.push('#'); self.serialization.push_str(fragment); } @@ -1577,11 +1580,12 @@ impl Url { // Remove any previous query if let Some(start) = self.query_start.take() { debug_assert!(self.byte_at(start) == b'?'); - self.serialization.truncate(start as usize); + self.truncate(start); } // Write the new query, if any if let Some(input) = query { - self.query_start = Some(to_u32(self.serialization.len()).unwrap()); + self.query_start = + Some(NonZeroU32::new(to_u32(self.serialization.len()).unwrap()).unwrap()); self.serialization.push('?'); let scheme_type = SchemeType::from(self.scheme()); let scheme_end = self.scheme_end; @@ -1641,10 +1645,10 @@ impl Url { let query_start; if let Some(start) = self.query_start { debug_assert!(self.byte_at(start) == b'?'); - query_start = start as usize; + query_start = start.get() as usize; } else { query_start = self.serialization.len(); - self.query_start = Some(to_u32(query_start).unwrap()); + self.query_start = Some(NonZeroU32::new(to_u32(query_start).unwrap()).unwrap()); self.serialization.push('?'); } @@ -1659,7 +1663,7 @@ impl Url { match (self.query_start, self.fragment_start) { (Some(i), _) | (None, Some(i)) => { let after_path = self.slice(i..).to_owned(); - self.serialization.truncate(i as usize); + self.truncate(i); after_path } (None, None) => String::new(), @@ -1740,9 +1744,10 @@ impl Url { fn restore_after_path(&mut self, old_after_path_position: u32, after_path: &str) { let new_after_path_position = to_u32(self.serialization.len()).unwrap(); - let adjust = |index: &mut u32| { - *index -= old_after_path_position; - *index += new_after_path_position; + let adjust = |index: &mut NonZeroU32| { + *index = + NonZeroU32::new(index.get() - old_after_path_position + new_after_path_position) + .unwrap(); }; if let Some(ref mut index) = self.query_start { adjust(index) @@ -1835,10 +1840,10 @@ impl Url { let offset = self.path_start - self.host_end; self.path_start = self.host_end; if let Some(ref mut index) = self.query_start { - *index -= offset + *index = NonZeroU32::new(index.get() - offset).unwrap(); } if let Some(ref mut index) = self.fragment_start { - *index -= offset + *index = NonZeroU32::new(index.get() - offset).unwrap(); } } (Some(old), Some(new)) if old == new => {} @@ -1849,9 +1854,9 @@ impl Url { let old_path_start = self.path_start; let new_path_start = to_u32(self.serialization.len()).unwrap(); self.path_start = new_path_start; - let adjust = |index: &mut u32| { - *index -= old_path_start; - *index += new_path_start; + let adjust = |index: &mut NonZeroU32| { + *index = + NonZeroU32::new(index.get() - old_path_start + new_path_start).unwrap(); }; if let Some(ref mut index) = self.query_start { adjust(index) @@ -2002,10 +2007,12 @@ impl Url { self.host_end = new_path_start; self.port = None; if let Some(ref mut index) = self.query_start { - *index -= offset + // *index -= offset + *index = NonZeroU32::new(index.get() - offset).unwrap(); } if let Some(ref mut index) = self.fragment_start { - *index -= offset + // *index -= offset + *index = NonZeroU32::new(index.get() - offset).unwrap(); } } Ok(()) @@ -2045,12 +2052,15 @@ impl Url { *index -= old_suffix_pos; *index += new_suffix_pos; }; + let adjust_nonzero = |index: &mut NonZeroU32| { + *index = NonZeroU32::new(index.get() - old_suffix_pos + new_suffix_pos).unwrap(); + }; adjust(&mut self.path_start); if let Some(ref mut index) = self.query_start { - adjust(index) + adjust_nonzero(index) } if let Some(ref mut index) = self.fragment_start { - adjust(index) + adjust_nonzero(index) } } @@ -2152,14 +2162,17 @@ impl Url { *index -= old_host_start; *index += new_host_start; }; + let adjust_nonzero = |index: &mut NonZeroU32| { + *index = NonZeroU32::new(index.get() - old_host_start + new_host_start).unwrap(); + }; self.host_start = new_host_start; adjust(&mut self.host_end); adjust(&mut self.path_start); if let Some(ref mut index) = self.query_start { - adjust(index) + adjust_nonzero(index) } if let Some(ref mut index) = self.fragment_start { - adjust(index) + adjust_nonzero(index) } self.serialization.push_str(&host_and_after); @@ -2181,10 +2194,10 @@ impl Url { self.host_end -= offset; self.path_start -= offset; if let Some(ref mut index) = self.query_start { - *index -= offset + *index = NonZeroU32::new(index.get() - offset).unwrap(); } if let Some(ref mut index) = self.fragment_start { - *index -= offset + *index = NonZeroU32::new(index.get() - offset).unwrap(); } } Ok(()) @@ -2265,14 +2278,17 @@ impl Url { *index -= removed_bytes; *index += added_bytes; }; + let adjust_nonzero = |index: &mut NonZeroU32| { + *index = NonZeroU32::new(index.get() - removed_bytes + added_bytes).unwrap(); + }; adjust(&mut self.host_start); adjust(&mut self.host_end); adjust(&mut self.path_start); if let Some(ref mut index) = self.query_start { - adjust(index) + adjust_nonzero(index) } if let Some(ref mut index) = self.fragment_start { - adjust(index) + adjust_nonzero(index) } Ok(()) } @@ -2413,6 +2429,9 @@ impl Url { *index -= old_scheme_end; *index += new_scheme_end; }; + let adjust_nonzero = |index: &mut NonZeroU32| { + *index = NonZeroU32::new(index.get() - old_scheme_end + new_scheme_end).unwrap(); + }; self.scheme_end = new_scheme_end; adjust(&mut self.username_end); @@ -2420,10 +2439,10 @@ impl Url { adjust(&mut self.host_end); adjust(&mut self.path_start); if let Some(ref mut index) = self.query_start { - adjust(index) + adjust_nonzero(index) } if let Some(ref mut index) = self.fragment_start { - adjust(index) + adjust_nonzero(index) } parser.serialization.push_str(self.slice(old_scheme_end..)); @@ -2646,13 +2665,49 @@ impl Url { { range.slice_of(&self.serialization) } +} + +trait ByteAt { + fn byte_at(&self, i: T) -> u8; +} +impl ByteAt for Url { #[inline] fn byte_at(&self, i: u32) -> u8 { self.serialization.as_bytes()[i as usize] } } +impl ByteAt for Url { + #[inline] + fn byte_at(&self, i: NonZeroU32) -> u8 { + self.serialization.as_bytes()[i.get() as usize] + } +} + +trait Truncate { + /// Truncate the URL's serialization to the given length. + fn truncate(&mut self, len: T); +} +impl Truncate for Url { + #[inline] + fn truncate(&mut self, len: u32) { + self.serialization.truncate(len as usize) + } +} +impl Truncate for Url { + #[inline] + fn truncate(&mut self, len: NonZeroU32) { + self.serialization.truncate(len.get() as usize) + } +} +impl Truncate for Url { + #[inline] + fn truncate(&mut self, len: usize) { + self.serialization.truncate(len) + } +} + /// Parse a string as an URL, without a base URL or encoding override. impl str::FromStr for Url { type Err = ParseError; @@ -2776,6 +2831,27 @@ impl RangeArg for RangeTo { } } +impl RangeArg for Range { + #[inline] + fn slice_of<'a>(&self, s: &'a str) -> &'a str { + &s[self.start.get() as usize..self.end.get() as usize] + } +} + +impl RangeArg for RangeFrom { + #[inline] + fn slice_of<'a>(&self, s: &'a str) -> &'a str { + &s[self.start.get() as usize..] + } +} + +impl RangeArg for RangeTo { + #[inline] + fn slice_of<'a>(&self, s: &'a str) -> &'a str { + &s[..self.end.get() as usize] + } +} + /// Serializes this URL into a `serde` stream. /// /// This implementation is only available if the `serde` Cargo feature is enabled. diff --git a/url/src/parser.rs b/url/src/parser.rs index 2e3a4f644..62132df7e 100644 --- a/url/src/parser.rs +++ b/url/src/parser.rs @@ -8,10 +8,11 @@ use std::error::Error; use std::fmt::{self, Formatter, Write}; +use std::num::NonZeroU32; use std::str; use crate::host::{Host, HostInternal}; -use crate::Url; +use crate::{ByteAt as _, Url}; use form_urlencoded::EncodingOverride; use percent_encoding::{percent_encode, utf8_percent_encode, AsciiSet, CONTROLS}; @@ -608,7 +609,7 @@ impl<'a> Parser<'a> { None => { // Copy everything except the fragment let before_fragment = match base_url.fragment_start { - Some(i) => &base_url.serialization[..i as usize], + Some(i) => &base_url.serialization[..i.get() as usize], None => &*base_url.serialization, }; self.serialization.push_str(before_fragment); @@ -720,7 +721,7 @@ impl<'a> Parser<'a> { None => { // Copy everything except the fragment let before_fragment = match base_url.fragment_start { - Some(i) => &base_url.serialization[..i as usize], + Some(i) => &base_url.serialization[..i.get() as usize], None => &*base_url.serialization, }; self.serialization.push_str(before_fragment); @@ -1435,12 +1436,12 @@ impl<'a> Parser<'a> { scheme_type: SchemeType, scheme_end: u32, mut input: Input<'_>, - ) -> ParseResult<(Option, Option)> { + ) -> ParseResult<(Option, Option)> { let mut query_start = None; match input.next() { Some('#') => {} Some('?') => { - query_start = Some(to_u32(self.serialization.len())?); + query_start = Some(to_u32_non_zero(self.serialization.len())?); self.serialization.push('?'); let remaining = self.parse_query(scheme_type, scheme_end, input); if let Some(remaining) = remaining { @@ -1453,7 +1454,7 @@ impl<'a> Parser<'a> { _ => panic!("Programming error. parse_query_and_fragment() called without ? or #"), } - let fragment_start = to_u32(self.serialization.len())?; + let fragment_start = to_u32_non_zero(self.serialization.len())?; self.serialization.push('#'); self.parse_fragment(input); Ok((query_start, Some(fragment_start))) @@ -1511,7 +1512,7 @@ impl<'a> Parser<'a> { self.parse_fragment(input); Ok(Url { serialization: self.serialization, - fragment_start: Some(to_u32(before_fragment.len())?), + fragment_start: Some(to_u32_non_zero(before_fragment.len())?), ..*base_url }) } @@ -1596,6 +1597,10 @@ pub fn to_u32(i: usize) -> ParseResult { Err(ParseError::Overflow) } } +#[inline] +pub fn to_u32_non_zero(i: usize) -> ParseResult { + to_u32(i).and_then(|n| NonZeroU32::new(n).ok_or(ParseError::Overflow)) +} fn is_normalized_windows_drive_letter(segment: &str) -> bool { is_windows_drive_letter(segment) && segment.as_bytes()[1] == b':' diff --git a/url/src/path_segments.rs b/url/src/path_segments.rs index d8a78d785..2f210b2f4 100644 --- a/url/src/path_segments.rs +++ b/url/src/path_segments.rs @@ -7,7 +7,7 @@ // except according to those terms. use crate::parser::{self, to_u32, SchemeType}; -use crate::Url; +use crate::{ByteAt as _, Truncate as _, Url}; use std::str; /// Exposes methods to manipulate the path of an URL that is not cannot-be-base. @@ -144,9 +144,7 @@ impl<'a> PathSegmentsMut<'a> { let last_slash = self.url.serialization[self.after_first_slash..] .rfind('/') .unwrap_or(0); - self.url - .serialization - .truncate(self.after_first_slash + last_slash); + self.url.truncate(self.after_first_slash + last_slash); self } diff --git a/url/src/slicing.rs b/url/src/slicing.rs index 13829575d..15e487dcf 100644 --- a/url/src/slicing.rs +++ b/url/src/slicing.rs @@ -6,7 +6,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use crate::Url; +use crate::{ByteAt as _, Url}; use std::ops::{Index, Range, RangeFrom, RangeFull, RangeTo}; impl Index for Url { @@ -184,29 +184,29 @@ impl Url { Position::BeforePath => self.path_start as usize, Position::AfterPath => match (self.query_start, self.fragment_start) { - (Some(q), _) => q as usize, - (None, Some(f)) => f as usize, + (Some(q), _) => q.get() as usize, + (None, Some(f)) => f.get() as usize, (None, None) => self.serialization.len(), }, Position::BeforeQuery => match (self.query_start, self.fragment_start) { (Some(q), _) => { debug_assert!(self.byte_at(q) == b'?'); - q as usize + "?".len() + q.get() as usize + "?".len() } - (None, Some(f)) => f as usize, + (None, Some(f)) => f.get() as usize, (None, None) => self.serialization.len(), }, Position::AfterQuery => match self.fragment_start { None => self.serialization.len(), - Some(f) => f as usize, + Some(f) => f.get() as usize, }, Position::BeforeFragment => match self.fragment_start { Some(f) => { debug_assert!(self.byte_at(f) == b'#'); - f as usize + "#".len() + f.get() as usize + "#".len() } None => self.serialization.len(), }, From 873cff703a4c3b351d9efdc458f622540e544306 Mon Sep 17 00:00:00 2001 From: Don Isaac Date: Wed, 29 May 2024 13:16:15 -0400 Subject: [PATCH 2/3] fix: nonzerou32 in expose_internals --- url/src/quirks.rs | 6 ++++-- url/tests/unit.rs | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/url/src/quirks.rs b/url/src/quirks.rs index 391a50dbe..629d64e40 100644 --- a/url/src/quirks.rs +++ b/url/src/quirks.rs @@ -11,6 +11,8 @@ //! Unless you need to be interoperable with web browsers, //! you probably want to use `Url` method instead. +use std::num::NonZeroU32; + use crate::parser::{default_port, Context, Input, Parser, SchemeType}; use crate::{Host, ParseError, Position, Url}; @@ -35,8 +37,8 @@ pub struct InternalComponents { pub host_end: u32, pub port: Option, pub path_start: u32, - pub query_start: Option, - pub fragment_start: Option, + pub query_start: Option, + pub fragment_start: Option, } /// Internal component / parsed offsets of the URL. diff --git a/url/tests/unit.rs b/url/tests/unit.rs index afe842beb..51d14a391 100644 --- a/url/tests/unit.rs +++ b/url/tests/unit.rs @@ -789,6 +789,8 @@ fn test_domain_encoding_quirks() { #[cfg(feature = "expose_internals")] #[test] fn test_expose_internals() { + use std::num::NonZeroU32; + use url::quirks::internal_components; use url::quirks::InternalComponents; @@ -810,8 +812,8 @@ fn test_expose_internals() { assert_eq!(host_end, 19); assert_eq!(port, None); assert_eq!(path_start, 19); - assert_eq!(query_start, Some(33)); - assert_eq!(fragment_start, Some(51)); + assert_eq!(query_start, Some(NonZeroU32::new(33).unwrap())); + assert_eq!(fragment_start, Some(NonZeroU32::new(51).unwrap())); } #[test] From ca0874d9af30b38547763ce400ba142ffa91f48e Mon Sep 17 00:00:00 2001 From: Don Isaac Date: Wed, 29 May 2024 13:41:32 -0400 Subject: [PATCH 3/3] test: add some pop_if_empty cases --- url/tests/unit.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/url/tests/unit.rs b/url/tests/unit.rs index 51d14a391..29472be62 100644 --- a/url/tests/unit.rs +++ b/url/tests/unit.rs @@ -1011,6 +1011,31 @@ fn test_null_host_with_leading_empty_path_segment() { assert_eq!(reparsed, url); } +#[test] +fn test_pop_if_empty_for_empty_path() { + let urls = [ + "https://github.com", + "https://github.com/", + "https://github.com//", + ]; + for url in &urls { + let mut url = Url::parse(url).unwrap(); + url.path_segments_mut().unwrap().pop_if_empty(); + assert_eq!(url.as_str(), "https://github.com/"); + } +} + +#[test] +fn test_pop_if_empty_when_nonempty() { + let mut url = Url::parse("https://github.com/servo/rust-url").unwrap(); + url.path_segments_mut().unwrap().push("").pop_if_empty(); + assert_eq!(url.as_str(), "https://github.com/servo/rust-url"); + + let mut url = Url::parse("https://github.com/servo/rust-url/").unwrap(); + url.path_segments_mut().unwrap().pop_if_empty(); + assert_eq!(url.as_str(), "https://github.com/servo/rust-url"); +} + #[test] fn pop_if_empty_in_bounds() { let mut url = Url::parse("m://").unwrap();