golang: speedup using TRIE
Most patterns are simple search strings (they contain no special regexp
symbols). Some use ^ and $, which can be emulated in a plaintext search by
adding these characters to the text itself, where they match as regular
characters. Additionally, some patterns involve (xx|yy) or [xY] constructs,
which expand to several plaintexts. Rare patterns require real regexp
matching.
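
A minimal sketch of the anchor-emulation idea (illustrative only; the example
patterns are taken from the tables in the diff below):

package main

import (
	"fmt"
	"strings"
)

func main() {
	userAgent := "curl/8.5.0"
	// Wrap the text in literal "^" and "$" so anchors become plain characters.
	text := "^" + userAgent + "$" // "^curl/8.5.0$"

	// The regexp `^curl` becomes the plain substring "^curl".
	fmt.Println(strings.Contains(text, "^curl")) // true

	// The regexp `(^| )sentry\/` expands to two plaintexts.
	fmt.Println(strings.Contains(text, "^sentry/") ||
		strings.Contains(text, " sentry/")) // false
}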

I've applied these simplifications and modifications. There are two tables
(pattern2literals and pattern2mainLiteral in the diff below): the first
replaces a pattern with a list of possible search strings, while the second
maps rare patterns that do require a regexp to a specific string indicating
their possible presence in the text. The specific string is needed to know
when to run the regexp.

Each search string is replaced with a random hex string of length 16 (to
prevent spontaneous or intentional matches with anything else), followed by
a label ("-" for simple search strings, "*" for rare cases requiring a
regexp) and a number encoded in "%05d" format.

All replacements are performed by a strings.Replacer, which uses a trie
internally and is therefore very fast. The random hex string is then searched
for in the output of the replacement. If it is absent, there is no match. If
it is present, it is either a definite match (for simple search-string labels)
or a potential match (for regexp patterns). In the latter case, the
corresponding regexp is executed on the text to verify the match.
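
A minimal sketch of the whole replace-then-scan flow, with a toy two-entry
table (the token and entries are illustrative; the real tables are built from
Crawlers in the diff below):

package main

import (
	"fmt"
	"strings"
)

func main() {
	token := "a1b2c3d4e5f60718" // illustrative fixed token
	r := strings.NewReplacer(
		"^curl", " "+token+"-00000 ",         // simple literal: label '-'
		"AdsBot-Google", " "+token+"*00001 ", // regexp needed: label '*'
	)

	replaced := r.Replace("^curl/8.5.0$")
	if pos := strings.Index(replaced, token); pos != -1 {
		label := replaced[pos+len(token)]
		num := replaced[pos+len(token)+1:][:5]
		// '-' means a definite match; '*' would mean "run regexp num to confirm".
		fmt.Printf("label %c, pattern number %s\n", label, num)
	}
}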

Benchmark comparison:

$ benchstat old.txt new.txt
goos: linux
goarch: amd64
pkg: github.com/monperrus/crawler-user-agents
cpu: Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
                           │    old.txt    │               new.txt               │
                           │    sec/op     │   sec/op     vs base                │
IsCrawlerPositive-2          71.384µ ±  7%   1.535µ ± 3%  -97.85% (p=0.000 n=10)
MatchingCrawlersPositive-2   70.597µ ±  2%   1.586µ ± 1%  -97.75% (p=0.000 n=10)
IsCrawlerNegative-2          71.072µ ± 11%   1.747µ ± 4%  -97.54% (p=0.000 n=10)
MatchingCrawlersNegative-2   67.978µ ±  1%   1.723µ ± 2%  -97.47% (p=0.000 n=10)
geomean                       70.24µ         1.645µ       -97.66%

                           │    old.txt    │                 new.txt                 │
                           │      B/s      │      B/s       vs base                  │
IsCrawlerPositive-2          2.112Mi ±  7%   98.205Mi ± 3%  +4548.98% (p=0.000 n=10)
MatchingCrawlersPositive-2   2.131Mi ±  2%   95.029Mi ± 1%  +4358.39% (p=0.000 n=10)
IsCrawlerNegative-2          2.055Mi ± 10%   83.528Mi ± 4%  +3964.27% (p=0.000 n=10)
MatchingCrawlersNegative-2   2.146Mi ±  1%   84.710Mi ± 2%  +3847.78% (p=0.000 n=10)
geomean                      2.111Mi          90.14Mi       +4170.39%

The new implementation is more than 40 times faster!
starius committed Apr 13, 2024
1 parent 0ef518e commit cf7b3da
Showing 1 changed file: validate.go, with 187 additions and 9 deletions.
@@ -2,9 +2,13 @@ package agents

import (
_ "embed"
"encoding/hex"
"encoding/json"
"fmt"
"hash/maphash"
"regexp"
"strconv"
"strings"
"time"
)

@@ -80,31 +84,205 @@ var Crawlers = func() []Crawler {
return crawlers
}()

var pattern2literals = map[string][]string{
`[wW]get`: {`wget`, `Wget`},
`Ahrefs(Bot|SiteAudit)`: {`AhrefsBot`, `AhrefsSiteAudit`},
`S[eE][mM]rushBot`: {`SemrushBot`, `SeMrushBot`, `SEmrushBot`, `SEMrushBot`},
`Livelap[bB]ot`: {`Livelapbot`, `LivelapBot`},
`[pP]ingdom`: {`pingdom`, `Pingdom`},
`Bark[rR]owler`: {`Barkrowler`, `BarkRowler`},
`^Apache-HttpClient`: {`^Apache-HttpClient`},
`^LCC `: {`^LCC `},
`(^| )sentry\/`: {`^sentry/`, ` sentry/`},
`^curl`: {`^curl`},
`[Cc]urebot`: {`Curebot`, `curebot`},
`^PHP-Curl-Class`: {`^PHP-Curl-Class`},
`(^| )PTST\/`: {`^PTST/`, ` PTST/`},
`^BW\/`: {`^BW/`},
}

var pattern2mainLiteral = map[string]string{
`AdsBot-Google([^-]|$)`: `AdsBot-Google`,
`BlogTraffic\/\d\.\d+ Feed-Fetcher`: `BlogTraffic/`,
}

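// analyzePattern converts a pattern into a list of plain search strings
// (olds). If the pattern cannot be covered by literals alone, it also
// returns the compiled regexp; the literal then only signals a potential
// match that the regexp must confirm.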
func analyzePattern(pattern string) (olds []string, re *regexp.Regexp) {
literals, has := pattern2literals[pattern]
if has {
return literals, nil
}

re = regexp.MustCompile(pattern)
prefix, complete := re.LiteralPrefix()
if complete {
return []string{prefix}, nil
}

mainLiteral, has := pattern2mainLiteral[pattern]
if !has {
panic("don't know what to do with pattern: " + pattern)
}
return []string{mainLiteral}, re
}

type regexpPattern struct {
re *regexp.Regexp
index int
}

type matcher struct {
replacer *strings.Replacer
regexps []regexpPattern
}

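// uniqueToken is 16 random hex characters. A zero maphash.Hash is seeded
// randomly, so the token differs on every program start.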
var uniqueToken = hex.EncodeToString((&maphash.Hash{}).Sum(nil))

const (
uniqueTokenLen = 2 * 8
numLen = 5
literalLabel = '-'
regexpLabel = '*'
)

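// m is built once at package initialization: it bundles the trie-based
// replacer with the fallback regexps.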
var m = func() matcher {
if len(uniqueToken) != uniqueTokenLen {
panic("len(uniqueToken) != uniqueTokenLen")
}

regexps := []regexpPattern{}
oldnew := make([]string, 0, len(Crawlers)*2)

// Put re-based patterns to the end to prevent AdsBot-Google from
// shadowing AdsBot-Google-Mobile.
var oldnew2 []string

for i, crawler := range Crawlers {
literals, re := analyzePattern(crawler.Pattern)

label := literalLabel
num := i
if re != nil {
label = regexpLabel
num = len(regexps)
regexps = append(regexps, regexpPattern{
re: re,
index: i,
})
}

replaceWith := fmt.Sprintf(" %s%c%0*d ", uniqueToken, label, numLen, num)

for _, literal := range literals {
if re != nil {
oldnew2 = append(oldnew2, literal, replaceWith)
} else {
oldnew = append(oldnew, literal, replaceWith)
}
}
}
oldnew = append(oldnew, oldnew2...)

// Copy the regexps into a slice of exactly the needed size to save memory.
regexps2 := make([]regexpPattern, len(regexps))
copy(regexps2, regexps)

r := strings.NewReplacer(oldnew...)
r.Replace("") // To cause internal build process.

return matcher{
replacer: r,
regexps: regexps2,
}
}()

// IsCrawler returns whether the User Agent string matches any of the crawler patterns.
func IsCrawler(userAgent string) bool {
// This code mostly duplicates MatchingCrawlers, but adds early-exit
// logic, so it runs a bit faster.

text := "^" + userAgent + "$"
replaced := m.replacer.Replace(text)
if replaced == text {
return false
}

for {
uniquePos := strings.Index(replaced, uniqueToken)
if uniquePos == -1 {
break
}

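// start points just past the token and its label byte; the zero-padded
// pattern number occupies the next numLen bytes.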
start := uniquePos + uniqueTokenLen + 1
if start+numLen >= len(replaced) {
panic("corrupt replaced: " + replaced)
}

label := replaced[start-1]
switch label {
case literalLabel:
return true
case regexpLabel:
// Rare case. Run regexp to confirm the match.
indexStr := replaced[start : start+numLen]
index, err := strconv.Atoi(indexStr)
if err != nil {
panic("corrupt replaced: " + replaced)
}
rp := m.regexps[index]
if rp.re.MatchString(userAgent) {
return true
}
default:
panic("corrupt replaced: " + replaced)
}

replaced = replaced[start+numLen:]
}

return false
}

// Finds all crawlers matching the User Agent and returns the list of their indices in Crawlers.
func MatchingCrawlers(userAgent string) []int {
text := "^" + userAgent + "$"
replaced := m.replacer.Replace(text)
if replaced == text {
return []int{}
}

indices := []int{}
for {
uniquePos := strings.Index(replaced, uniqueToken)
if uniquePos == -1 {
break
}

start := uniquePos + uniqueTokenLen + 1
if start+numLen >= len(replaced) {
panic("corrupt replaced: " + replaced)
}
indexStr := replaced[start : start+numLen]
index, err := strconv.Atoi(indexStr)
if err != nil {
panic("corrupt replaced: " + replaced)
}

label := replaced[start-1]
switch label {
case literalLabel:
indices = append(indices, index)
case regexpLabel:
// Rare case. Run regexp to confirm the match.
rp := m.regexps[index]
if rp.re.MatchString(userAgent) {
indices = append(indices, rp.index)
}
default:
panic("corrupt replaced: " + replaced)
}

replaced = replaced[start+numLen:]
}

return indices
}
