From 2ddf864ec5fe90a17a90498ed6b5b210294717d7 Mon Sep 17 00:00:00 2001 From: Tal Einat Date: Mon, 4 May 2020 17:14:23 +0300 Subject: [PATCH] normalize to "copyright" and "trademark" rather than single-char variants This improves matching correctness when trying to match a license file to the text of a license which includes "copyright" verbatim in the body of the license text, since the same normalization is not applied to the original license texts. This includes common licenses such as the 2- and 3-clause BSD licenses. Signed-off-by: Tal Einat --- licensedb/internal/normalize/normalize.go | 12 +++++++----- licensedb/internal/normalize/normalize_test.go | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/licensedb/internal/normalize/normalize.go b/licensedb/internal/normalize/normalize.go index 7f34814..478daab 100644 --- a/licensedb/internal/normalize/normalize.go +++ b/licensedb/internal/normalize/normalize.go @@ -75,13 +75,13 @@ var ( ) // 9.1.1 "©", "(c)", or "Copyright" should be considered equivalent and interchangeable. - copyrightRe = regexp.MustCompile("copyright|\\(c\\)") - trademarkRe = regexp.MustCompile("trademark(s?)|\\(tm\\)") + copyrightRe = regexp.MustCompile("©|\\(c\\)") + trademarkRe = regexp.MustCompile("trademarks|\\(tm\\)|™") // extra cleanup brokenLinkRe = regexp.MustCompile("http s ://") urlCleanupRe = regexp.MustCompile("[<(](http(s?)://[^\\s]+)[)>]") - copyrightLineRe = regexp.MustCompile("(?m)^((©.*)|(all rights reserved(\\.)?)|(li[cs]en[cs]e))\n") + copyrightLineRe = regexp.MustCompile("(?m)^((copyright.*)|(all rights reserved(\\.)?)|(li[cs]en[cs]e))\n") nonAlphaNumRe = regexp.MustCompile("[^- \\na-z0-9]") // used in Split() @@ -128,8 +128,8 @@ func LicenseText(text string, strictness Strictness) string { text = wordReplacer.Replace(text) // 9. Copyright Symbol - text = copyrightRe.ReplaceAllString(text, "©") - text = trademarkRe.ReplaceAllString(text, "™") + text = copyrightRe.ReplaceAllString(text, "copyright") + text = trademarkRe.ReplaceAllString(text, "trademark") // fix broken URLs in SPDX source texts text = brokenLinkRe.ReplaceAllString(text, "https://") @@ -155,7 +155,9 @@ func LicenseText(text string, strictness Strictness) string { // there are common mismatches because of trailing dots text = strings.Replace(text, ".", "", -1) // usually copyright lines are custom and occur multiple times + text = strings.Replace(text, "copyright notice", "PLACEHOLDER", -1) text = copyrightLineRe.ReplaceAllString(text, "") + text = strings.Replace(text, "PLACEHOLDER", "copyright notice", -1) } if strictness > Moderate { diff --git a/licensedb/internal/normalize/normalize_test.go b/licensedb/internal/normalize/normalize_test.go index 8d24cde..4cf5f1c 100644 --- a/licensedb/internal/normalize/normalize_test.go +++ b/licensedb/internal/normalize/normalize_test.go @@ -23,6 +23,7 @@ permissions granted by this license.`}, {"punctuation", "a-‒–—―⁓⸺⸻~˗‐‑⁃⁻₋−∼⎯⏤─➖𐆑֊﹘﹣-", "a-"}, {"bullet", "-\n*\n✱\n﹡\n•\n●\n⚫\n⏺\n🞄\n∙\n⋅\n", ""}, {"license", "", ""}, + {"copyright notice", "copyright notice", "copyright notice"}, } for _, tc := range tt {