Fix panic out of range when index map gets out of sync with normalization text (#27)

* Create IndexMap after ToLower(). Ensure this and initialize() are only done once. Protect from out of range. Index out of range errors can happen when the Normalized Text gets longer than the IndexMap. This was happening when ToLower() was increasing the size of NormalizedText after the IndexMap creation. Initialize was also resetting NormalizedText when called after normalization shrunk the text to "". * ToLower() is now part of initialize() * initialize is now only called once, as needed, protected by sync. * The out of range exposures are also now protected. * ToLower() is now before some matchers that were using unnecessary case-insensitive matching. Those are now optimized to assume lower. Fixes #26 Signed-off-by: Mark Sturdevant <[email protected]> * Remove unused var Signed-off-by: Mark Sturdevant <[email protected]> Signed-off-by: Mark Sturdevant <[email protected]>
IBM · Nov 10, 2022 · f0f5a86 · f0f5a86
1 parent eceeacf
commit f0f5a86
Show file tree

Hide file tree

Showing 2 changed files with 62 additions and 42 deletions.
diff --git a/normalizer/normalizer.go b/normalizer/normalizer.go
@@ -12,23 +12,24 @@ import (
 	"fmt"
 	"regexp"
 	"strings"
+	"sync"
 
 	"github.com/mrutkows/sbom-utility/log"
 	"golang.org/x/exp/slices"
 )
 
 const (
-	NoteTagPattern          = `(?i)<<note[:=].+?>>`
-	WildcardMatchingPattern = `(?i)<<match=\.\+>>`
+	NoteTagPattern          = `<<note[:=].+?>>`
+	WildcardMatchingPattern = `<<match=\.\+>>`
 
 	OptionalWildcardMatchingPattern = `<<match=\.\*>>`
 
-	ReplaceableTextPattern     = `(?i)<<(?:var;(?:name=(.+?);)?(?:original=(.*?);)?)?match=(.+?)>>`
-	BeginOptionalLinePattern   = `(?im)^<<beginOptional(?:;name=.*?)?>>`
-	BeginOptionalPattern       = `(?i)<<beginOptional(?:;name=.*?)?>>`
+	ReplaceableTextPattern     = `<<(?:var;(?:name=(.+?);)?(?:original=(.*?);)?)?match=(.+?)>>`
+	BeginOptionalLinePattern   = `(?m)^<<beginoptional(?:;name=.*?)?>>` // (?m) lets ^ match at the start of every line; tag is lower-case because the text is lower-cased first
+	BeginOptionalPattern       = `<<beginoptional(?:;name=.*?)?>>`
 	OmitableLine               = "<<omitable>>\n"
 	Omitable                   = "<<omitable>>"
-	EndOptionalPattern         = `(?i)<<endOptional>>`
+	EndOptionalPattern         = `<<endoptional>>`
 	ReplaceEndPattern          = `<</omitable>>`
 	CommentBlockOutsidePattern = `(?m)^\s*(?:/\*|-{2,3}\[=*\[)|(?:\*/|]=*])\s*$`
 	CommentBlockInsidePattern  = `(?m)^\s*[*#]{1,6}|\*{1,6}$`
@@ -112,6 +113,7 @@ type NormalizationData struct {
 	CaptureGroups  []*CaptureGroup
 	Hash           Digest
 	IsTemplate     bool
+	initializeOnce sync.Once
 }
 
 type CaptureGroup struct {
@@ -136,7 +138,6 @@ func NewNormalizationData(originalText string, isTemplate bool) *NormalizationDa
 		OriginalText: originalText,
 		IsTemplate:   isTemplate,
 	}
-	nd.initialize()
 	return &nd
 }
 
@@ -154,10 +155,6 @@ func (n *NormalizationData) NormalizeText() error {
 		return fmt.Errorf("failed to normalize data: invalid input text with control characters")
 	}
 
-	// TODO: remove excessive whitespace, prior to generating the index map.
-	// TODO: keep track of the removed sections while stripping off excessive whitespace
-	// TODO: TBD ^^^ Why bother when we will replace all whitespace with single " " later on?
-
 	// remove note tags
 	n.removeNoteTags()
 
@@ -173,9 +170,6 @@ func (n *NormalizationData) NormalizeText() error {
 	// Replace the optional tags with <<omitable>> and <</omitable>>. (Guideline 2.1.4)
 	n.standardizeOmitableTags()
 
-	// Convert the input text to all lower case. (Guideline 4.1.1)
-	n.NormalizedText = strings.ToLower(n.NormalizedText)
-
 	// remove odd characters, such as TM, replacement character ?, etc
 	// NOTE! Remove these before any use of regexp2 because rune chars throw off the index map
 	n.removeOddCharacters()
@@ -246,19 +240,19 @@ func (n *NormalizationData) NormalizeText() error {
 
 // initialize lower-cases OriginalText into NormalizedText and builds the initial index map; the work is done at most once per NormalizationData.
 func (n *NormalizationData) initialize() {
-	// initialize the normalized text with the original text
-	if len(n.NormalizedText) == 0 {
-		n.NormalizedText = n.OriginalText
-	}
+	n.initializeOnce.Do(func() { // sync.Once: this closure runs only on the first call, so later calls cannot reset NormalizedText or IndexMap
+		// Convert the input text to all lower case. (Guideline 4.1.1)
+		// Note: the regex patterns assume ToLower() was already done, so they do not need case-insensitive matching.
+		n.NormalizedText = strings.ToLower(n.OriginalText)
 
-	// generate an index map, to map the normalized text indices back to the respective index in the original text
-	if len(n.IndexMap) == 0 {
+		// Build an identity index map (IndexMap[i] = i), used to map normalized text indices back to the respective index in the original text.
+		// Note: ToLower() must be done before creating IndexMap because some chars change in length. NOTE(review): where that happens these offsets are relative to the lower-cased text, not OriginalText — confirm consumers expect that.
 		l := len(n.NormalizedText)
 		n.IndexMap = make([]int, l)
 		for i := 0; i < l; i++ {
 			n.IndexMap[i] = i
 		}
-	}
+	})
 }
 
 func (n *NormalizationData) removeNoteTags() {
@@ -522,6 +516,14 @@ func substr(text string, from int, to int) string {
 	return text[from:to]
 }
 
+// subset returns thing[from:to], or thing[from:] when to >= len(thing), so an oversized `to` is clamped instead of panicking.
+func subset[T any](thing []T, from int, to int) []T { // NOTE(review): `from` is not clamped; from > len(thing) or from > to still panics, so callers must guard it
+	if to >= len(thing) {
+		return thing[from:] // end is at or beyond the slice length: take everything from `from` onward
+	}
+	return thing[from:to]
+}
+
 // replaceMatchesWithStringAndUpdateIndexMap iterates over matches to:
 // * remove or replace the matched text
 // * build an updated index map
@@ -542,26 +544,26 @@ func (n *NormalizationData) replaceMatchesWithStringsAndUpdateIndexMap(allSubmat
 		replacement := replacements[i]
 
 		// copy the text and index map before (and in between) matches
-		if firstIndex > prev {
-			newText += n.NormalizedText[prev:firstIndex]
-			newIndex = append(newIndex, n.IndexMap[prev:firstIndex]...)
+		if prev < len(n.IndexMap) && firstIndex > prev {
+			newText += substr(n.NormalizedText, prev, firstIndex)
+			newIndex = append(newIndex, subset(n.IndexMap, prev, firstIndex)...)
 		}
 
-		if len(replacement) > 0 {
+		replacementLen := len(replacement)
+		if replacementLen > 0 {
 			// If a replacement string is being inserted, then we'll also insert indexes as follows:
 			// * The first element should be the first index in the replaced section.
 			// * The last element should be the last index in the replaced section. (Unless there is only a single char)
 			// * Middle elements should be -1, for 'replaced'. A match starting/ending on these indices is invalid.
-			replacementIndex := make([]int, len(replacement))
-			for c := range replacement {
-				switch c {
-				case 0:
-					replacementIndex[c] = n.IndexMap[firstIndex]
-				case len(replacement) - 1:
-					replacementIndex[c] = n.IndexMap[lastIndex-1]
-				default:
-					replacementIndex[c] = -1
-				}
+			replacementIndex := make([]int, replacementLen)
+			for i := 0; i < cap(replacementIndex); i++ {
+				replacementIndex[i] = -1
+			}
+			if firstIndex < len(n.IndexMap) {
+				replacementIndex[0] = n.IndexMap[firstIndex]
+			}
+			if replacementLen > 1 && lastIndex-1 < len(n.IndexMap) {
+				replacementIndex[replacementLen-1] = n.IndexMap[lastIndex-1]
 			}
 
 			// Append the replacement text and indexes

diff --git a/normalizer/normalizer_test.go b/normalizer/normalizer_test.go
@@ -84,6 +84,24 @@ func TestNormalizationData_NormalizeText(t *testing.T) {
 				NormalizedText: "runes in commissariat à l'énergie atomique then htmltag <<omitable>>x♢z<</omitable>> .",
 			},
 		},
+		{
+			name: "Character that changes length should not cause out-of-bounds with indexMap",
+			n: &NormalizationData{
+				OriginalText: "\n\xfe\nx\n",
+			},
+			e: &NormalizationData{
+				NormalizedText: "x",
+			},
+		},
+		{
+			name: "Character that changes length should not cause out-of-bounds with indexMap (char 0)",
+			n: &NormalizationData{
+				OriginalText: "\xfe\nx\n",
+			},
+			e: &NormalizationData{
+				NormalizedText: "x",
+			},
+		},
 	}
 
 	for _, tc := range tcs {
@@ -109,7 +127,7 @@ func TestNormalizationData_NormalizeText_removeNoteTag(t *testing.T) {
 			OriginalText: "Something to note about <<note: Please be careful with this license>>",
 		},
 		e: &NormalizationData{
-			NormalizedText: "Something to note about  ",
+			NormalizedText: "something to note about  ",
 		},
 	}}
 
@@ -186,7 +204,7 @@ func TestNormalizationData_NormalizeText_CaptureReplaceableTextSections(t *testi
 		e: &NormalizationData{
 			CaptureGroups: []*CaptureGroup{{
 				GroupNumber: 1,
-				Name:        "replaceableSection",
+				Name:        "replaceablesection",
 				Original:    "some text",
 				Matches:     ".+?",
 			}},
@@ -622,7 +640,7 @@ func TestNormalizationData_NormalizeText_removeOddCharacters(t *testing.T) {
 			OriginalText: fmt.Sprintf("Trademark \u0099  Not sign ¬"),
 		},
 		e: &NormalizationData{
-			NormalizedText: fmt.Sprintf("Trademark    Not sign  "),
+			NormalizedText: fmt.Sprintf("trademark    not sign  "),
 		},
 	}}
 
@@ -646,7 +664,7 @@ func TestNormalizationData_NormalizeText_replaceWhitespace(t *testing.T) {
 			OriginalText: fmt.Sprintf("\nThis text   has \tsome \nwhitespace.\n"),
 		},
 		e: &NormalizationData{
-			NormalizedText: fmt.Sprintf("This text has some whitespace."),
+			NormalizedText: fmt.Sprintf("this text has some whitespace."),
 		},
 	}}
 
@@ -672,23 +690,23 @@ func TestNormalizationData_NormalizeText_Replacement_Words(t *testing.T) {
 				OriginalText: "This licence license organisation organisation to redistributions redistribution",
 			},
 			e: &NormalizationData{
-				NormalizedText: "This license license organization organization to redistribution redistribution",
+				NormalizedText: "this license license organization organization to redistribution redistribution",
 			},
 		},
 		{
 			n: &NormalizationData{
 				OriginalText: "This license organisation to redistribution",
 			},
 			e: &NormalizationData{
-				NormalizedText: "This license organization to redistribution",
+				NormalizedText: "this license organization to redistribution",
 			},
 		},
 		{
 			n: &NormalizationData{
 				OriginalText: "This licence organization to redistributions ",
 			},
 			e: &NormalizationData{
-				NormalizedText: "This license organization to redistribution ",
+				NormalizedText: "this license organization to redistribution ",
 			},
 		},
 	}