From 9d88509780b308cd857e273872649799b3c7cff7 Mon Sep 17 00:00:00 2001 From: Shane St Savage Date: Fri, 4 Oct 2024 16:50:15 -0700 Subject: [PATCH] Initial commit --- LICENSE | 23 +++++ README.md | 68 +++++++++++++ agent-allowlist.txt | 12 +++ cleaner.go | 164 ++++++++++++++++++++++++++++++ cleaner_test.go | 53 ++++++++++ go.mod | 5 + go.sum | 2 + testdata/expected-crawler.log | 4 + testdata/expected-error.log | 1 + testdata/expected-non-crawler.log | 3 + testdata/extra-crawler-agents.txt | 2 + testdata/web.log | 8 ++ 12 files changed, 345 insertions(+) create mode 100644 LICENSE create mode 100644 README.md create mode 100644 agent-allowlist.txt create mode 100644 cleaner.go create mode 100644 cleaner_test.go create mode 100644 go.mod create mode 100644 go.sum create mode 100644 testdata/expected-crawler.log create mode 100644 testdata/expected-error.log create mode 100644 testdata/expected-non-crawler.log create mode 100644 testdata/extra-crawler-agents.txt create mode 100644 testdata/web.log diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6c40a95 --- /dev/null +++ b/LICENSE @@ -0,0 +1,23 @@ +The MIT License (MIT) +===================== + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the “Software”), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..621d972 --- /dev/null +++ b/README.md @@ -0,0 +1,68 @@ +# crawler-cleaner 🕷️🧹✨ + +Processes json web log input from stdin (one json log object per line), +removing any entries whose user agent is determined to be a crawler/bot/scraper. + +The project uses the +[crawler-user-agents](https://github.com/monperrus/crawler-user-agents) +project as the user agent database. + +By default crawler-cleaner looks for the user agent in the top-level +`http_user_agent` field in each json log. This may be configured using the +`-user-agent-key` flag (but the key must be top level). + +Detected crawler logs can be discarded (default) or written to a separate +file/stream. JSON parse errors (error messages between json logs, etc.) +can also be written to a separate file. 
The following strings have +special meaning for the output files: + +* `0`, `/dev/null`, `null` - discard output +* `-`, `/dev/stdout`, `stdout` - write output to stdout +* `+`, `/dev/stderr`, `stderr` - write output to stderr + +## Example usage + +``` +$ ./crawler-cleaner -help +Usage of ./crawler-cleaner: + -crawler-output string + File to write crawler output to (default "/dev/null") + -error-output string + File to write unparsable json iput to (default "/dev/null") + -extra-crawler-agents-file string + File containing additional crawler user agent patterns, one per line + -non-crawler-output string + File to write non-crawler output to (default "/dev/stdout") + -user-agent-key string + Json key for user agent (default "http_user_agent") +``` + +``` +$ cat web.log | ./crawler-cleaner -crawler-output ./crawlers.log \ + -non-crawler-output ./legit.log -error-output errors.log +``` + +or + +``` +$ ./crawler-cleaner -crawler-output stderr < web.log > ./legit.log 2> ./crawlers.log +``` + +## Reviewing results + +After running, it's useful to examine the user agents in both non-crawler +and crawler outputs to identify any adjustments needed. 
Example command +to view counts of user agents using [`jq`](https://jqlang.github.io/jq/): + +``` + 0 { + if _, err := os.Stat(extraCrawlerAgentsFile); err == nil { + extraAgents, err := os.Open(extraCrawlerAgentsFile) + if err != nil { + fmt.Println(err) + } + defer extraAgents.Close() + + addExtraCrawlerAgents(extraAgents) + } else { + fmt.Println("Error loading extra agents file", extraCrawlerAgentsFile, err) + } + } + + nonCrawlerWriter := getWriter(nonCrawlerOutput) + defer nonCrawlerWriter.Close() + crawlerWriter := getWriter(crawlerOutput) + defer crawlerWriter.Close() + errorWriter := getWriter(errorOutput) + defer errorWriter.Close() + + cleanCrawlers(userAgentKeyConfig, os.Stdin, nonCrawlerWriter, crawlerWriter, errorWriter) +} diff --git a/cleaner_test.go b/cleaner_test.go new file mode 100644 index 0000000..e1c2d26 --- /dev/null +++ b/cleaner_test.go @@ -0,0 +1,53 @@ +package main + +/* +To generate test data: + +