From 8728b008805cf8f8c7ce628e004658bcab75c019 Mon Sep 17 00:00:00 2001
From: William Storey
Date: Thu, 14 Mar 2024 15:20:48 -0700
Subject: [PATCH] Replace fewer TLDs when normalizing

When cleaning an email domain for `hash_address`, stop stripping
arbitrary characters that follow `.com` and stop rewriting every TLD
matched by the broad typo regex to `.com`. Instead, look up the text
after the last `.` of the domain in an explicit TYPO_TLDS table and
replace it only on an exact match.

As a result, domains such as `example.comfoo` and `example.cam` are now
left unchanged, while listed typos such as `example.comcom` still become
`example.com`.
---
 CHANGELOG.md                     |  7 ++--
 lib/minfraud/components/email.rb | 59 ++++++++++++++++++++++++++++++--
 spec/components/email_spec.rb    |  7 ++--
 3 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 10deed6..7468ed8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,11 +17,8 @@
 * Duplicate `.com`s are now removed from email domain names when
   `hash_address` is used. For example, `example.com.com` will become
   `example.com`.
-* Extraneous characters after `.com` are now removed from email domain
-  names when `hash_address` is used. For example, `example.comfoo` will
-  become `example.com`.
-* Certain `.com` typos are now normalized to `.com` when `hash_address` is
-  used. For example, `example.cam` will become `example.com`.
+* Certain TLD typos are now normalized when `hash_address` is used. For
+  example, `example.comcom` will become `example.com`.
 * Additional `gmail.com` domain names with leading digits are now normalized
   when `hash_address` is used. For example, `100gmail.com` will become
   `gmail.com`.
diff --git a/lib/minfraud/components/email.rb b/lib/minfraud/components/email.rb
index c4fecc3..e7bbaa9 100644
--- a/lib/minfraud/components/email.rb
+++ b/lib/minfraud/components/email.rb
@@ -131,6 +131,55 @@ def clean_email_address(address)
       }.freeze
       private_constant :TYPO_DOMAINS
 
+      TYPO_TLDS = {
+        'comm' => 'com',
+        'commm' => 'com',
+        'commmm' => 'com',
+        'comn' => 'com',
+
+        'cbm' => 'com',
+        'ccm' => 'com',
+        'cdm' => 'com',
+        'cem' => 'com',
+        'cfm' => 'com',
+        'cgm' => 'com',
+        'chm' => 'com',
+        'cim' => 'com',
+        'cjm' => 'com',
+        'ckm' => 'com',
+        'clm' => 'com',
+        'cmm' => 'com',
+        'cnm' => 'com',
+        'cpm' => 'com',
+        'cqm' => 'com',
+        'crm' => 'com',
+        'csm' => 'com',
+        'ctm' => 'com',
+        'cum' => 'com',
+        'cvm' => 'com',
+        'cwm' => 'com',
+        'cxm' => 'com',
+        'cym' => 'com',
+        'czm' => 'com',
+
+        'col' => 'com',
+        'con' => 'com',
+
+        'dom' => 'com',
+        'don' => 'com',
+        'som' => 'com',
+        'son' => 'com',
+        'vom' => 'com',
+        'von' => 'com',
+        'xom' => 'com',
+        'xon' => 'com',
+
+        'clam' => 'com',
+        'colm' => 'com',
+        'comcom' => 'com',
+      }.freeze
+      private_constant :TYPO_TLDS
+
       EQUIVALENT_DOMAINS = {
         'googlemail.com' => 'gmail.com',
         'pm.me' => 'protonmail.com',
@@ -330,10 +379,16 @@ def clean_domain(domain)
         domain = SimpleIDN.to_ascii(domain)
 
         domain.sub!(/(?:\.com){2,}$/, '.com')
-        domain.sub!(/\.com[^.]+$/, '.com')
-        domain.sub!(/(?:\.(?:com|c[a-z]{1,2}m|co[ln]|[dsvx]o[mn]|))$/, '.com')
         domain.sub!(/^\d+(?:gmail?\.com)$/, 'gmail.com')
 
+        idx = domain.rindex('.')
+        if !idx.nil?
+          tld = domain[idx + 1..]
+          if TYPO_TLDS.key?(tld)
+            domain = "#{domain[0, idx]}.#{TYPO_TLDS[tld]}"
+          end
+        end
+
         if TYPO_DOMAINS.key?(domain)
           domain = TYPO_DOMAINS[domain]
         end
diff --git a/spec/components/email_spec.rb b/spec/components/email_spec.rb
index 4c4c4b4..e0dc9c8 100644
--- a/spec/components/email_spec.rb
+++ b/spec/components/email_spec.rb
@@ -63,7 +63,7 @@
         { input: ' Test@maxmind.com', output: 'test@maxmind.com' },
         {
           input: 'Test@maxmind.com|abc124472372',
-          output: 'test@maxmind.com',
+          output: 'test@maxmind.com|abc124472372',
         },
         { input: 'Test+foo@yahoo.com', output: 'test+foo@yahoo.com' },
         { input: 'Test-foo@yahoo.com', output: 'test@yahoo.com' },
@@ -76,9 +76,10 @@
         { input: 'alias@user.fastmail.com', output: 'user@fastmail.com' },
         { input: 'foo-bar@ymail.com', output: 'foo@ymail.com' },
         { input: 'foo@example.com.com', output: 'foo@example.com' },
-        { input: 'foo@example.comfoo', output: 'foo@example.com' },
-        { input: 'foo@example.cam', output: 'foo@example.com' },
+        { input: 'foo@example.comfoo', output: 'foo@example.comfoo' },
+        { input: 'foo@example.cam', output: 'foo@example.cam' },
         { input: 'foo@10000gmail.com', output: 'foo@gmail.com' },
+        { input: 'foo@example.comcom', output: 'foo@example.com' },
       ]
 
       tests.each do |i|