diff --git a/README.md b/README.md index cc93dd5..0572f37 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,9 @@ ## Summary -**go-fasttld** is a high performance [top level domains (TLD)](https://en.wikipedia.org/wiki/Top-level_domain) extraction module that extracts subcomponents from [URLs](https://en.wikipedia.org/wiki/URL). +**go-fasttld** is a high performance [effective top level domains (eTLD)](https://wiki.mozilla.org/Public_Suffix_List) extraction module that extracts subcomponents from [URLs](https://en.wikipedia.org/wiki/URL). -URLs can either contain hostnames, IPv4 addresses, or IPv6 addresses. TLD extraction is based on the [Mozilla Public Suffix List](http://www.publicsuffix.org). Private domains listed in the [Mozilla Public Suffix List](http://www.publicsuffix.org) like 'blogspot.co.uk' and 'sinaapp.com' are also supported. +URLs can either contain hostnames, IPv4 addresses, or IPv6 addresses. eTLD extraction is based on the [Mozilla Public Suffix List](http://www.publicsuffix.org). Private domains listed in the [Mozilla Public Suffix List](http://www.publicsuffix.org) like 'blogspot.co.uk' and 'sinaapp.com' are also supported. ![Demo](demo.gif) @@ -302,18 +302,18 @@ Benchmarks performed on AMD Ryzen 7 5800X, Manjaro Linux. ### Why not split on "." and take the last element instead? -Splitting on "." and taking the last element only works for simple TLDs like `com`, but not more complex ones like `oseto.nagasaki.jp`. +Splitting on "." and taking the last element only works for simple eTLDs like `com`, but not more complex ones like `oseto.nagasaki.jp`. -### TLD tries +### eTLD tries ![Trie](Trie_example.svg) -**go-fasttld** stores TLDs in [compressed tries](https://en.wikipedia.org/wiki/Trie). +**go-fasttld** stores eTLDs in [compressed tries](https://en.wikipedia.org/wiki/Trie). -Valid TLDs from the [Mozilla Public Suffix List](http://www.publicsuffix.org) are appended to the compressed trie in reverse-order. 
+Valid eTLDs from the [Mozilla Public Suffix List](http://www.publicsuffix.org) are appended to the compressed trie in reverse-order. ```sh -Given the following TLDs +Given the following eTLDs au nsw.edu.au com.ac @@ -334,11 +334,11 @@ START ╚═ gov 🚩 === Symbol meanings === -🚩 : path to this node is a valid TLD +🚩 : path to this node is a valid eTLD ✅ : path to this node found in example URL host `example.nsw.edu.au` ``` -The URL host subcomponents are parsed from right-to-left until no more matching nodes can be found. In this example, the path of matching nodes are `au -> edu -> nsw`. Reversing the nodes gives the extracted TLD `nsw.edu.au`. +The URL host subcomponents are parsed from right-to-left until no more matching nodes can be found. In this example, the path of matching nodes is `au -> edu -> nsw`. Reversing the nodes gives the extracted eTLD `nsw.edu.au`. ## Acknowledgements diff --git a/benchmark_test.go b/benchmark_test.go index 27e33d8..38ad96a 100644 --- a/benchmark_test.go +++ b/benchmark_test.go @@ -84,7 +84,7 @@ Omitted modules github.com/M507/tlde | Almost exactly the same as github.com/joeguo/tldextract -github.com/ImVexed/fasturl | Fast, but cannot extract TLDs +github.com/ImVexed/fasturl | Fast, but cannot extract eTLDs github.com/weppos/publicsuffix-go | Cannot handle full URLs with scheme (i.e. https:// ftp:// etc.)
diff --git a/cmd/fasttld/root.go b/cmd/fasttld/root.go index d555905..1dea133 100644 --- a/cmd/fasttld/root.go +++ b/cmd/fasttld/root.go @@ -12,8 +12,8 @@ var version string = "" var rootCmd = &cobra.Command{ Use: "fasttld", Version: version, - Short: `fasttld is a high performance top level domains (TLD) extraction module.`, - Long: `fasttld is a high performance top level domains (TLD) extraction module.`, + Short: `fasttld is a high performance effective top level domains (eTLD) extraction module.`, + Long: `fasttld is a high performance effective top level domains (eTLD) extraction module.`, Run: func(cmd *cobra.Command, args []string) {}, } diff --git a/fasttld.go b/fasttld.go index 2bdd337..6b4c759 100644 --- a/fasttld.go +++ b/fasttld.go @@ -1,4 +1,4 @@ -// Package fasttld is a high performance top level domains (TLD) +// Package fasttld is a high performance effective top level domains (eTLD) // extraction module implemented with compressed tries. // // This module is a port of the Python fasttld module, @@ -72,7 +72,7 @@ type URLParams struct { } // trie is a node of the compressed trie -// used to store Public Suffix List TLDs. +// used to store Public Suffix List eTLDs. type trie struct { matches hashmap.Map[string, *trie] end bool @@ -115,7 +115,7 @@ func nestedDict(dic *trie, keys []string) { } } -// trieConstruct constructs a compressed trie to store Public Suffix List TLDs split at "." in reverse-order. +// trieConstruct constructs a compressed trie to store Public Suffix List eTLDs split at "." in reverse-order. // // For example: "us.gov.pl" will be stored in the order {"pl", "gov", "us"}. 
func trieConstruct(includePrivateSuffix bool, cacheFilePath string) (*trie, error) { @@ -300,7 +300,7 @@ func (f *FastTLD) Extract(e URLParams) (ExtractResult, error) { return urlParts, err } - // Check for TLD Suffix + // Check for eTLD Suffix node := f.tldTrie var ( @@ -341,7 +341,7 @@ func (f *FastTLD) Extract(e URLParams) (ExtractResult, error) { break } - // check if label is part of a TLD + // check if label is part of an eTLD label, _ = url.QueryUnescape(label) if val, ok := node.matches.Get(label); ok { suffixStartIdx = sepIdx diff --git a/fasttld_test.go b/fasttld_test.go index db40a3a..7ef42ea 100644 --- a/fasttld_test.go +++ b/fasttld_test.go @@ -259,12 +259,12 @@ var schemeTests = []extractTest{ var noSchemeTests = []extractTest{ {urlParams: URLParams{URL: "localhost"}, expected: ExtractResult{Domain: "localhost", HostType: HostName}, description: "localhost"}, {urlParams: URLParams{URL: "16777215"}, expected: ExtractResult{Domain: "16777215", HostType: HostName}, description: "Number >= 0xFFFFFF"}, - {urlParams: URLParams{URL: "org"}, expected: ExtractResult{Suffix: "org"}, err: errs[9], description: "Single TLD | Suffix Only"}, - {urlParams: URLParams{URL: "org."}, expected: ExtractResult{Suffix: "org"}, err: errs[9], description: "Single TLD | Suffix Only with single trailing dot"}, // RFC 1034 - allow single trailing dot - {urlParams: URLParams{URL: "org.."}, expected: ExtractResult{}, err: errs[8], description: "Single TLD | Suffix Only with 2 trailing dots"}, - {urlParams: URLParams{URL: "co.th"}, expected: ExtractResult{Suffix: "co.th"}, err: errs[9], description: "Double TLD | Suffix Only"}, - {urlParams: URLParams{URL: "co.th."}, expected: ExtractResult{Suffix: "co.th"}, err: errs[9], description: "Double TLD | Suffix Only with single trailing dot"}, // RFC 1034 - allow single trailing dot - {urlParams: URLParams{URL: "co.th.."}, expected: ExtractResult{}, err: errs[8], description: "Double TLD | Suffix Only with 2 trailing dots"}, + 
{urlParams: URLParams{URL: "org"}, expected: ExtractResult{Suffix: "org"}, err: errs[9], description: "Single eTLD | Suffix Only"}, + {urlParams: URLParams{URL: "org."}, expected: ExtractResult{Suffix: "org"}, err: errs[9], description: "Single eTLD | Suffix Only with single trailing dot"}, // RFC 1034 - allow single trailing dot + {urlParams: URLParams{URL: "org.."}, expected: ExtractResult{}, err: errs[8], description: "Single eTLD | Suffix Only with 2 trailing dots"}, + {urlParams: URLParams{URL: "co.th"}, expected: ExtractResult{Suffix: "co.th"}, err: errs[9], description: "Double eTLD | Suffix Only"}, + {urlParams: URLParams{URL: "co.th."}, expected: ExtractResult{Suffix: "co.th"}, err: errs[9], description: "Double eTLD | Suffix Only with single trailing dot"}, // RFC 1034 - allow single trailing dot + {urlParams: URLParams{URL: "co.th.."}, expected: ExtractResult{}, err: errs[8], description: "Double eTLD | Suffix Only with 2 trailing dots"}, {urlParams: URLParams{URL: "users@example.com"}, expected: ExtractResult{UserInfo: "users", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "UserInfo + Domain | No Scheme"}, {urlParams: URLParams{URL: "mailto:users@example.com"}, expected: ExtractResult{UserInfo: "mailto:users", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "Mailto | No Scheme"}, {urlParams: URLParams{URL: "example.com:999"}, expected: ExtractResult{Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Port: "999", HostType: HostName}, description: "Domain + Port | No Scheme"}, @@ -445,15 +445,15 @@ var invalidTests = []extractTest{ {urlParams: URLParams{URL: "https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61\u3002fk"}, expected: ExtractResult{Scheme: "https://"}, err: errs[6], description: "Consecutive label separators within Suffix", }, - {urlParams: URLParams{URL: ".\u3002a\uff61fk"}, expected: ExtractResult{}, 
err: errs[8], description: "TLD only, multiple leading label separators"}, + {urlParams: URLParams{URL: ".\u3002a\uff61fk"}, expected: ExtractResult{}, err: errs[8], description: "eTLD only, multiple leading label separators"}, {urlParams: URLParams{URL: "https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe.\u3002a\uff61fk"}, expected: ExtractResult{Scheme: "https://"}, err: errs[8], description: "Consecutive label separators between Domain and Suffix"}, {urlParams: URLParams{URL: "https://brb\u002ei\u3002am\uff0egoing\uff61to.\uff0ebe\u3002a\uff61fk"}, expected: ExtractResult{Scheme: "https://"}, err: errs[8], description: "Consecutive label separators between SubDomain and Domain"}, {urlParams: URLParams{URL: "https://brb\u002ei\u3002.am.\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk"}, expected: ExtractResult{Scheme: "https://"}, err: errs[8], description: "Consecutive label separators within SubDomain"}, {urlParams: URLParams{URL: "https://\uff0eexample.com"}, expected: ExtractResult{Scheme: "https://"}, err: errs[8], description: "Hostname starting with label separator"}, {urlParams: URLParams{URL: "//server.example.com/path"}, expected: ExtractResult{Scheme: "//", SubDomain: "server", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/path", HostType: HostName}, description: "Double-slash only Scheme with subdomain"}, - {urlParams: URLParams{URL: "http://temasek"}, expected: ExtractResult{Scheme: "http://", Suffix: "temasek"}, err: errs[9], description: "Basic URL with TLD only"}, - {urlParams: URLParams{URL: "http://temasek.this-tld-cannot-be-real"}, expected: ExtractResult{Scheme: "http://", SubDomain: "temasek", Domain: "this-tld-cannot-be-real", HostType: HostName}, description: "Basic URL with bad TLD"}, - {urlParams: URLParams{URL: "http://temasek.temasek.this-tld-cannot-be-real"}, expected: ExtractResult{Scheme: "http://", SubDomain: "temasek.temasek", Domain: "this-tld-cannot-be-real", HostType: HostName}, description: "Basic URL 
with subdomain and bad TLD"}, + {urlParams: URLParams{URL: "http://temasek"}, expected: ExtractResult{Scheme: "http://", Suffix: "temasek"}, err: errs[9], description: "Basic URL with eTLD only"}, + {urlParams: URLParams{URL: "http://temasek.this-tld-cannot-be-real"}, expected: ExtractResult{Scheme: "http://", SubDomain: "temasek", Domain: "this-tld-cannot-be-real", HostType: HostName}, description: "Basic URL with bad eTLD"}, + {urlParams: URLParams{URL: "http://temasek.temasek.this-tld-cannot-be-real"}, expected: ExtractResult{Scheme: "http://", SubDomain: "temasek.temasek", Domain: "this-tld-cannot-be-real", HostType: HostName}, description: "Basic URL with subdomain and bad eTLD"}, {urlParams: URLParams{URL: "http://127.0.0.256"}, expected: ExtractResult{Scheme: "http://", SubDomain: "127.0.0", Domain: "256", HostType: HostName}, description: "Basic IPv4 Address URL with bad IP"}, {urlParams: URLParams{URL: "http://127\uff0e0\u30020\uff61256:5000"}, expected: ExtractResult{Scheme: "http://", SubDomain: "127\uff0e0\u30020", Port: "5000", @@ -538,16 +538,16 @@ var invalidTests = []extractTest{ } var internationalTLDTests = []extractTest{ {urlParams: URLParams{URL: "https://𝖊𝖝𝖆𝖒𝖕𝖑𝖊.𝖈𝖔𝖒.𝖘𝖌", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "https://", Domain: "example", Suffix: "com.sg", RegisteredDomain: "example.com.sg", HostType: HostName}}, - {urlParams: URLParams{URL: "http://example.敎育.hk/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--lcvr32d.hk", RegisteredDomain: "example.xn--lcvr32d.hk", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with mixed international TLD (result in punycode)"}, - {urlParams: URLParams{URL: "http://example.обр.срб/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac", Path: "/地图/A/b/C?编号=42", 
HostType: HostName}, description: "Basic URL with full international TLD (result in punycode)"}, - {urlParams: URLParams{URL: "http://example.敎育.hk/地图/A/b/C?编号=42"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "敎育.hk", RegisteredDomain: "example.敎育.hk", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with mixed international TLD (result in unicode)"}, - {urlParams: URLParams{URL: "http://example.обр.срб/地图/A/b/C?编号=42"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "обр.срб", RegisteredDomain: "example.обр.срб", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with full international TLD (result in unicode)"}, - {urlParams: URLParams{URL: "http://example.xn--ciqpn.hk/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--ciqpn.hk", RegisteredDomain: "example.xn--ciqpn.hk", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with mixed punycode international TLD (result in punycode)"}, - {urlParams: URLParams{URL: "http://example.xn--90azh.xn--90a3ac/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with full punycode international TLD (result in punycode)"}, - {urlParams: URLParams{URL: "http://example.xn--ciqpn.hk"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--ciqpn.hk", RegisteredDomain: "example.xn--ciqpn.hk", HostType: HostName}, description: "Basic URL with mixed punycode international TLD (no further conversion to punycode)"}, - {urlParams: URLParams{URL: "http://example.xn--90azh.xn--90a3ac"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac", HostType: 
HostName}, description: "Basic URL with full punycode international TLD (no further conversion to punycode)"}, - {urlParams: URLParams{URL: "http://xN--h1alffa9f.xn--90azh.xn--90a3ac"}, expected: ExtractResult{Scheme: "http://", Domain: "xN--h1alffa9f", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "xN--h1alffa9f.xn--90azh.xn--90a3ac", HostType: HostName}, description: "Mixed case Punycode Domain with full punycode international TLD (no further conversion to punycode) See: https://github.com/golang/go/issues/48778"}, - {urlParams: URLParams{URL: "http://xN--h1alffa9f.xn--90azh.xn--90a3ac", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "xn--h1alffa9f", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "xn--h1alffa9f.xn--90azh.xn--90a3ac", HostType: HostName}, description: "Mixed case Punycode Domain with full punycode international TLD (with further conversion to punycode)"}, + {urlParams: URLParams{URL: "http://example.敎育.hk/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--lcvr32d.hk", RegisteredDomain: "example.xn--lcvr32d.hk", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with mixed international eTLD (result in punycode)"}, + {urlParams: URLParams{URL: "http://example.обр.срб/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with full international eTLD (result in punycode)"}, + {urlParams: URLParams{URL: "http://example.敎育.hk/地图/A/b/C?编号=42"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "敎育.hk", RegisteredDomain: "example.敎育.hk", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with mixed international eTLD (result in unicode)"}, + {urlParams: URLParams{URL: 
"http://example.обр.срб/地图/A/b/C?编号=42"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "обр.срб", RegisteredDomain: "example.обр.срб", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with full international eTLD (result in unicode)"}, + {urlParams: URLParams{URL: "http://example.xn--ciqpn.hk/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--ciqpn.hk", RegisteredDomain: "example.xn--ciqpn.hk", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with mixed punycode international eTLD (result in punycode)"}, + {urlParams: URLParams{URL: "http://example.xn--90azh.xn--90a3ac/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with full punycode international eTLD (result in punycode)"}, + {urlParams: URLParams{URL: "http://example.xn--ciqpn.hk"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--ciqpn.hk", RegisteredDomain: "example.xn--ciqpn.hk", HostType: HostName}, description: "Basic URL with mixed punycode international eTLD (no further conversion to punycode)"}, + {urlParams: URLParams{URL: "http://example.xn--90azh.xn--90a3ac"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac", HostType: HostName}, description: "Basic URL with full punycode international eTLD (no further conversion to punycode)"}, + {urlParams: URLParams{URL: "http://xN--h1alffa9f.xn--90azh.xn--90a3ac"}, expected: ExtractResult{Scheme: "http://", Domain: "xN--h1alffa9f", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "xN--h1alffa9f.xn--90azh.xn--90a3ac", HostType: HostName}, description: "Mixed case Punycode Domain with full punycode 
international eTLD (no further conversion to punycode) See: https://github.com/golang/go/issues/48778"}, + {urlParams: URLParams{URL: "http://xN--h1alffa9f.xn--90azh.xn--90a3ac", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "xn--h1alffa9f", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "xn--h1alffa9f.xn--90azh.xn--90a3ac", HostType: HostName}, description: "Mixed case Punycode Domain with full punycode international eTLD (with further conversion to punycode)"}, } var domainOnlySingleTLDTests = []extractTest{ {urlParams: URLParams{URL: "https://example.ai/en"}, expected: ExtractResult{Scheme: "https://", Domain: "example", Suffix: "ai", RegisteredDomain: "example.ai", Path: "/en", HostType: HostName}, description: "Domain only + ai"},