Skip to content
This repository has been archived by the owner on Aug 29, 2023. It is now read-only.

Commit

Permalink
Fix panic out of range when index map gets out of sync with normalization text (#27)
Browse files Browse the repository at this point in the history

* Create IndexMap after ToLower(). Ensure this and initialize() are only done once. Protect from out of range.

Index out of range errors can happen when the Normalized Text gets longer than the IndexMap.
This was happening when ToLower() was increasing the size of NormalizedText after the IndexMap creation.
Initialize was also resetting NormalizedText when called after normalization shrunk the text to "".

* ToLower() is now part of initialize().
* initialize() is now only called once, as needed, protected by sync.Once.
* The out-of-range exposures are also now protected.
* ToLower() now runs before some matchers that were using unnecessary case-insensitive matching. Those patterns are now optimized to assume lowercase input.

Fixes #26

Signed-off-by: Mark Sturdevant <[email protected]>

* Remove unused var

Signed-off-by: Mark Sturdevant <[email protected]>

Signed-off-by: Mark Sturdevant <[email protected]>
  • Loading branch information
markstur authored Nov 10, 2022
1 parent eceeacf commit f0f5a86
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 42 deletions.
72 changes: 37 additions & 35 deletions normalizer/normalizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,24 @@ import (
"fmt"
"regexp"
"strings"
"sync"

"github.com/mrutkows/sbom-utility/log"
"golang.org/x/exp/slices"
)

const (
NoteTagPattern = `(?i)<<note[:=].+?>>`
WildcardMatchingPattern = `(?i)<<match=\.\+>>`
NoteTagPattern = `<<note[:=].+?>>`
WildcardMatchingPattern = `<<match=\.\+>>`

OptionalWildcardMatchingPattern = `<<match=\.\*>>`

ReplaceableTextPattern = `(?i)<<(?:var;(?:name=(.+?);)?(?:original=(.*?);)?)?match=(.+?)>>`
BeginOptionalLinePattern = `(?im)^<<beginOptional(?:;name=.*?)?>>`
BeginOptionalPattern = `(?i)<<beginOptional(?:;name=.*?)?>>`
ReplaceableTextPattern = `<<(?:var;(?:name=(.+?);)?(?:original=(.*?);)?)?match=(.+?)>>`
// BeginOptionalLinePattern matches a <<beginoptional>> tag (optionally with ;name=...) at the start of a line.
// The (?m) flag makes ^ match at every line start; the text is assumed to be lower-cased already.
BeginOptionalLinePattern = `(?m)^<<beginoptional(?:;name=.*?)?>>`
BeginOptionalPattern = `<<beginoptional(?:;name=.*?)?>>`
OmitableLine = "<<omitable>>\n"
Omitable = "<<omitable>>"
EndOptionalPattern = `(?i)<<endOptional>>`
EndOptionalPattern = `<<endoptional>>`
ReplaceEndPattern = `<</omitable>>`
CommentBlockOutsidePattern = `(?m)^\s*(?:/\*|-{2,3}\[=*\[)|(?:\*/|]=*])\s*$`
CommentBlockInsidePattern = `(?m)^\s*[*#]{1,6}|\*{1,6}$`
Expand Down Expand Up @@ -112,6 +113,7 @@ type NormalizationData struct {
CaptureGroups []*CaptureGroup
Hash Digest
IsTemplate bool
initializeOnce sync.Once
}

type CaptureGroup struct {
Expand All @@ -136,7 +138,6 @@ func NewNormalizationData(originalText string, isTemplate bool) *NormalizationDa
OriginalText: originalText,
IsTemplate: isTemplate,
}
nd.initialize()
return &nd
}

Expand All @@ -154,10 +155,6 @@ func (n *NormalizationData) NormalizeText() error {
return fmt.Errorf("failed to normalize data: invalid input text with control characters")
}

// TODO: remove excessive whitespace, prior to generating the index map.
// TODO: keep track of the removed sections while stripping off excessive whitespace
// TODO: TBD ^^^ Why bother when we will replace all whitespace with single " " later on?

// remove note tags
n.removeNoteTags()

Expand All @@ -173,9 +170,6 @@ func (n *NormalizationData) NormalizeText() error {
// Replace the optional tags with <<omitable>> and <</omitable>>. (Guideline 2.1.4)
n.standardizeOmitableTags()

// Convert the input text to all lower case. (Guideline 4.1.1)
n.NormalizedText = strings.ToLower(n.NormalizedText)

// remove odd characters, such as TM, replacement character ?, etc
// NOTE! Remove these before any use of regexp2 because rune chars throw off the index map
n.removeOddCharacters()
Expand Down Expand Up @@ -246,19 +240,19 @@ func (n *NormalizationData) NormalizeText() error {

// initialize prepares the data for normalization: it seeds NormalizedText from
// OriginalText and builds IndexMap as an identity mapping (IndexMap[i] = i)
// over the bytes of NormalizedText. It uses initializeOnce (a sync.Once).
func (n *NormalizationData) initialize() {
// initialize the normalized text with the original text
if len(n.NormalizedText) == 0 {
n.NormalizedText = n.OriginalText
}
n.initializeOnce.Do(func() {
// Convert the input text to all lower case. (Guideline 4.1.1)
// Note: Regex patterns also assume ToLower() was already done to avoid needing case-insensitive match.
n.NormalizedText = strings.ToLower(n.OriginalText)

// generate an index map, to map the normalized text indices back to the respective index in the original text
if len(n.IndexMap) == 0 {
// generate an index map, to map the normalized text indices back to the respective index in the original text
// Note: ToLower() must be done before creating IndexMap because some chars change in length.
l := len(n.NormalizedText)
n.IndexMap = make([]int, l)
for i := 0; i < l; i++ {
n.IndexMap[i] = i
}
}
})
}

func (n *NormalizationData) removeNoteTags() {
Expand Down Expand Up @@ -522,6 +516,14 @@ func substr(text string, from int, to int) string {
return text[from:to]
}

// subset returns thing[from:to] with both bounds clamped to the valid range,
// so it never panics on out-of-range indices:
//   - from is clamped to [0, len(thing)];
//   - to is clamped to [from, len(thing)], so a to past the end behaves like thing[from:].
//
// The result shares its backing array with thing. When the clamped range is
// empty, a zero-length slice is returned.
func subset[T any](thing []T, from int, to int) []T {
	n := len(thing)
	if from < 0 {
		from = 0
	}
	if from > n {
		from = n
	}
	if to > n {
		to = n
	}
	// An inverted range (to < from) yields an empty slice instead of a panic.
	if to < from {
		to = from
	}
	return thing[from:to]
}

// replaceMatchesWithStringAndUpdateIndexMap iterates over matches to:
// * remove or replace the matched text
// * build an updated index map
Expand All @@ -542,26 +544,26 @@ func (n *NormalizationData) replaceMatchesWithStringsAndUpdateIndexMap(allSubmat
replacement := replacements[i]

// copy the text and index map before (and in between) matches
if firstIndex > prev {
newText += n.NormalizedText[prev:firstIndex]
newIndex = append(newIndex, n.IndexMap[prev:firstIndex]...)
if prev < len(n.IndexMap) && firstIndex > prev {
newText += substr(n.NormalizedText, prev, firstIndex)
newIndex = append(newIndex, subset(n.IndexMap, prev, firstIndex)...)
}

if len(replacement) > 0 {
replacementLen := len(replacement)
if replacementLen > 0 {
// If a replacement string is being inserted, then we'll also insert indexes as follows:
// * The first element should be the first index in the replaced section.
// * The last element should be the last index in the replaced section. (Unless there is only a single char)
// * Middle elements should be -1, for 'replaced'. A match starting/ending on these indices is invalid.
replacementIndex := make([]int, len(replacement))
for c := range replacement {
switch c {
case 0:
replacementIndex[c] = n.IndexMap[firstIndex]
case len(replacement) - 1:
replacementIndex[c] = n.IndexMap[lastIndex-1]
default:
replacementIndex[c] = -1
}
replacementIndex := make([]int, replacementLen)
for i := 0; i < cap(replacementIndex); i++ {
replacementIndex[i] = -1
}
if firstIndex < len(n.IndexMap) {
replacementIndex[0] = n.IndexMap[firstIndex]
}
if replacementLen > 1 && lastIndex-1 < len(n.IndexMap) {
replacementIndex[replacementLen-1] = n.IndexMap[lastIndex-1]
}

// Append the replacement text and indexes
Expand Down
32 changes: 25 additions & 7 deletions normalizer/normalizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,24 @@ func TestNormalizationData_NormalizeText(t *testing.T) {
NormalizedText: "runes in commissariat à l'énergie atomique then htmltag <<omitable>>x♢z<</omitable>> .",
},
},
{
name: "Character that changes length should not cause out-of-bounds with indexMap",
n: &NormalizationData{
OriginalText: "\n\xfe\nx\n",
},
e: &NormalizationData{
NormalizedText: "x",
},
},
{
name: "Character that changes length should not cause out-of-bounds with indexMap (char 0)",
n: &NormalizationData{
OriginalText: "\xfe\nx\n",
},
e: &NormalizationData{
NormalizedText: "x",
},
},
}

for _, tc := range tcs {
Expand All @@ -109,7 +127,7 @@ func TestNormalizationData_NormalizeText_removeNoteTag(t *testing.T) {
OriginalText: "Something to note about <<note: Please be careful with this license>>",
},
e: &NormalizationData{
NormalizedText: "Something to note about ",
NormalizedText: "something to note about ",
},
}}

Expand Down Expand Up @@ -186,7 +204,7 @@ func TestNormalizationData_NormalizeText_CaptureReplaceableTextSections(t *testi
e: &NormalizationData{
CaptureGroups: []*CaptureGroup{{
GroupNumber: 1,
Name: "replaceableSection",
Name: "replaceablesection",
Original: "some text",
Matches: ".+?",
}},
Expand Down Expand Up @@ -622,7 +640,7 @@ func TestNormalizationData_NormalizeText_removeOddCharacters(t *testing.T) {
OriginalText: fmt.Sprintf("Trademark \u0099 Not sign ¬"),
},
e: &NormalizationData{
NormalizedText: fmt.Sprintf("Trademark Not sign "),
NormalizedText: fmt.Sprintf("trademark not sign "),
},
}}

Expand All @@ -646,7 +664,7 @@ func TestNormalizationData_NormalizeText_replaceWhitespace(t *testing.T) {
OriginalText: fmt.Sprintf("\nThis text has \tsome \nwhitespace.\n"),
},
e: &NormalizationData{
NormalizedText: fmt.Sprintf("This text has some whitespace."),
NormalizedText: fmt.Sprintf("this text has some whitespace."),
},
}}

Expand All @@ -672,23 +690,23 @@ func TestNormalizationData_NormalizeText_Replacement_Words(t *testing.T) {
OriginalText: "This licence license organisation organisation to redistributions redistribution",
},
e: &NormalizationData{
NormalizedText: "This license license organization organization to redistribution redistribution",
NormalizedText: "this license license organization organization to redistribution redistribution",
},
},
{
n: &NormalizationData{
OriginalText: "This license organisation to redistribution",
},
e: &NormalizationData{
NormalizedText: "This license organization to redistribution",
NormalizedText: "this license organization to redistribution",
},
},
{
n: &NormalizationData{
OriginalText: "This licence organization to redistributions ",
},
e: &NormalizationData{
NormalizedText: "This license organization to redistribution ",
NormalizedText: "this license organization to redistribution ",
},
},
}
Expand Down

0 comments on commit f0f5a86

Please sign in to comment.