From 8aab6a8f56a9062d7d248479619b38d171f0f10a Mon Sep 17 00:00:00 2001 From: rushmorem Date: Thu, 5 Jan 2017 09:05:51 +0200 Subject: [PATCH] Refactor regex --- src/lib.rs | 51 +++++++++++++++++++++++++++++++++++++++++++++------ src/tests.rs | 42 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 86 insertions(+), 7 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 91639e6..d5ead12 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -88,7 +88,7 @@ use std::fmt; pub use errors::{Result, Error}; -use regex::Regex; +use regex::RegexSet; use errors::ErrorKind; #[cfg(feature = "remote_list")] use native_tls::TlsConnector; @@ -141,7 +141,45 @@ pub enum Host { } lazy_static! { - static ref LABEL: Regex = Regex::new(r"^([[:alnum:]]+|[[:alnum:]]+[[:alnum:]-]*[[:alnum:]]+)$").unwrap(); + // Regex for matching domain name labels + static ref LABEL: RegexSet = { + let exprs = vec![ + // can be any combination of alphanumeric characters + r"^[[:alnum:]]+$", + // or it can start with an alphanumeric character + // then optionally be followed by any combination of + // alphanumeric characters and dashes before finally + // ending with an alphanumeric character + r"^[[:alnum:]]+[[:alnum:]-]*[[:alnum:]]+$", + ]; + RegexSet::new(exprs).unwrap() + }; + + // Regex for matching the local-part of an + // email address + static ref LOCAL: RegexSet = { + // these characters can be anywhere in the expression + let global = r#"[[:alnum:]!#$%&'*+/=?^_`{|}~-]"#; + // non-ascii characters (can also be unquoted) + let non_ascii = r#"[^\x00-\x7F]"#; + // the pattern to match + let quoted = r#"["(),\\:;<>@\[\]. ]"#; + // combined regex + let combined = format!(r#"({}*{}*)"#, global, non_ascii); + + let exprs = vec![ + // can be any combination of allowed characters + format!(r#"^{}+$"#, combined), + // can be any combination of allowed characters + // separated by a . 
in between + format!(r#"^({0}+[.]?{0}+)+$"#, combined), + // can be a quoted string with allowed plus + // additional characters + format!(r#"^"({}*{}*)*"$"#, combined, quoted), + ]; + + RegexSet::new(exprs).unwrap() + }; } /// Converts a type into a Url object @@ -379,6 +417,8 @@ impl List { // http://girders.org/blog/2013/01/31/dont-rfc-validate-email-addresses/ // https://html.spec.whatwg.org/multipage/forms.html#valid-e-mail-address // https://hackernoon.com/the-100-correct-way-to-validate-email-addresses-7c4818f24643#.pgcir4z3e + // http://haacked.com/archive/2007/08/21/i-knew-how-to-validate-an-email-address-until-i.aspx/ + // https://tools.ietf.org/html/rfc6530#section-10.1 pub fn parse_email(&self, address: &str) -> Result { let mut parts = address.rsplitn(2, "@"); let host = match parts.next() { @@ -389,10 +429,10 @@ impl List { Some(local) => local, None => { return Err(ErrorKind::InvalidEmail.into()); } }; - if local.starts_with(".") - || local.ends_with(".") - || local.chars().count() > 64 + if local.chars().count() > 64 || address.chars().count() > 254 + || (!local.starts_with('"') && local.contains("..")) + || !LOCAL.is_match(local) { return Err(ErrorKind::InvalidEmail.into()); } @@ -596,7 +636,6 @@ impl Domain { return Err(ErrorKind::InvalidDomain(domain.into()).into()); } let input = domain; - //let domain = input.trim().trim_right_matches('.'); let (domain, res) = domain_to_unicode(input); if let Err(errors) = res { return Err(ErrorKind::Uts46(errors).into()); diff --git a/src/tests.rs b/src/tests.rs index 889bcb3..8e229e1 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -244,6 +244,14 @@ fn list_behaviour() { "#!$%&'*+-/=?^_`{}|~@example.org", "example@s.solutions", "user@[fd79:cdcb:38cc:9dd:f686:e06d:32f3:c123]", + r#""Abc\@def"@example.com"#, + r#""Fred Bloggs"@example.com"#, + r#""Joe\\Blow"@example.com"#, + r#""Abc@def"@example.com"#, + r#"customer/department=shipping@example.com"#, + "$A12345@example.com", + "!def!xyz%abc@example.com", + 
"_somename@example.com", ]; for email in emails { println!("{} should be valid", email); @@ -252,6 +260,27 @@ fn list_behaviour() { pass!() }); + ctx.it("should reject invalid email addresses", || { + let emails = vec![ + "Abc.example.com", + "A@b@c@example.com", + r#"a"b(c)d,e:f;gi[j\k]l@example.com"#, + r#""just"not"right@example.com"#, + r#"this is"not\allowed@example.com"#, + r#"this\ still\"not\\allowed@example.com"#, + "1234567890123456789012345678901234567890123456789012345678901234+x@example.com", + "john..doe@example.com", + "john.doe@example..com", + " prettyandsimple@example.com", + "prettyandsimple@example.com ", + ]; + for email in emails { + println!("{} should not be valid", email); + assert!(list.parse_email(email).is_err()); + } + pass!() + }); + ctx.it("should allow parsing emails as str", || { assert!(list.parse_str("prettyandsimple@example.com").unwrap().is_domain()); pass!() @@ -263,7 +292,18 @@ fn list_behaviour() { }); ctx.it("should allow parsing IDN email addresses", || { - assert!(list.parse_email("用户@例子.广告").is_ok()); + let emails = vec![ + r#"Pelé@example.com"#, + r#"δοκιμή@παράδειγμα.δοκιμή"#, + r#"我買@屋企.香港"#, + r#"甲斐@黒川.日本"#, + r#"чебурашка@ящик-с-апельсинами.рф"#, + r#"संपर्क@डाटामेल.भारत"#, + ]; + for email in emails { + println!("{} should be valid", email); + assert!(list.parse_email(email).is_ok()); + } pass!() }); });