-
Notifications
You must be signed in to change notification settings - Fork 25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve parsing of SPL2 modules for statement names to handle strings, fields, functions, comments. #131
Improve parsing of SPL2 modules for statement names to handle strings, fields, functions, comments. #131
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
/** | ||
* This helper function retrieves the names of all module-level search statements | ||
* | ||
* @param spl2Module module contents | ||
* @returns array of regex matches of statements capturing names of each statement | ||
*/ | ||
export function getModuleStatements(spl2Module: string): string[] { | ||
// Remove anything within comments, field literals, string | ||
// literals, or between braces { .. } which will eliminate | ||
// function/lambda params like `$it -> { $p = 1 }` | ||
// and commented-out statements like /* $out = from [{}] */ | ||
let inBlockComment = false; // /* .. */ | ||
let inField = false; // ' .. ' | ||
let inString = false; // " .. " | ||
let inLineComment = false; // // .. <EOL> | ||
let braceLevel = 0; // { .. } | ||
|
||
let newModule = ''; | ||
let prev = ''; | ||
while (spl2Module.length > 0) { | ||
let indx = 0; | ||
let next = spl2Module.charAt(indx++); | ||
let peeked = peek(spl2Module); | ||
let crlf = (next === '\r' && peeked === '\n'); | ||
let newLine = crlf || (next === '\n'); | ||
if (inBlockComment) { | ||
if (next === '*' && peeked === '/') { | ||
inBlockComment = false; // exit block comment | ||
indx++; // move past */ | ||
} | ||
} else if (inField) { | ||
if (next === '\'' && prev !== '\\') { // ignore \' | ||
inField = false; // exit field literal | ||
} | ||
} else if (inString) { | ||
if (newLine || (next === '"' && prev !== '\\')) { // ignore \" | ||
inString = false; // exit string literal | ||
if (crlf) { | ||
indx++; // move past \r\n | ||
} | ||
} | ||
} else if (inLineComment) { | ||
if (newLine) { | ||
inLineComment = false; // exit line comment | ||
if (crlf) { | ||
indx++; // move past \r\n | ||
} | ||
} | ||
} else if (braceLevel > 0) { | ||
if (next === '{') { | ||
braceLevel++; | ||
} else if (next === '}') { | ||
braceLevel--; | ||
} | ||
if (braceLevel === 0) { | ||
// insert newlines after blocks like function and dataset declarations | ||
// to start new statements/declarations on new lines when possible | ||
newModule += '\n'; | ||
} | ||
} else { | ||
// Check for entering new block | ||
switch (next) { | ||
case '/': | ||
if (peeked === '/') { | ||
inLineComment = true; | ||
indx++; // move past // | ||
} else if (peeked === '*') { | ||
inBlockComment = true; | ||
indx++; // move past /* | ||
} | ||
break; | ||
case '\'': | ||
inField = true; | ||
break; | ||
case '"': | ||
inString = true; | ||
break; | ||
case '{': | ||
braceLevel++; | ||
break; | ||
} | ||
// if we're not in one of the blocks above, write to cleaned module | ||
if (!inBlockComment && !inField && !inString && !inLineComment && braceLevel === 0) { | ||
newModule += next; | ||
} | ||
} | ||
spl2Module = spl2Module.substring(indx, spl2Module.length); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This might be a tall ask and is out of scope for this, but are the repeated substring calls suboptimal here? Would it be best to traverse the string via an index like so:
Doing so might fare better for larger modules since we don't need to repeatedly substringify the module. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, good suggestion |
||
} | ||
|
||
// Match anything that looks like `$statement_1 = ...` and return the statement names | ||
return [...newModule.matchAll(/^\s*\$([a-zA-Z0-9_]+)[\s]*=/gm)] | ||
.map(group => (group.length > 1) ? group[1] : null) | ||
.filter(val => (val !== null)); | ||
} | ||
|
||
function peek(str: string): string { | ||
return (str.length > 1) ? str.charAt(1) : ""; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
const { assert } = require('chai'); | ||
const { getModuleStatements } = require("../out/notebooks/utils/parsing"); | ||
|
||
describe('splunk', () => { | ||
describe('getModuleStatements()', () => { | ||
it('should find a single statement', () => { | ||
const module = ` | ||
$out = from a; | ||
`; | ||
const statements = getModuleStatements(module); | ||
assert.equal(statements.length, 1); | ||
assert.equal(statements[0], 'out'); | ||
}); | ||
it('should find each statement when several specified', () => { | ||
const module = ` | ||
$out1 = from a; | ||
$out2 = from b; | ||
$out3 = from c; | ||
`; | ||
const statements = getModuleStatements(module); | ||
assert.equal(statements.length, 3); | ||
assert.equal(statements[0], 'out1'); | ||
assert.equal(statements[1], 'out2'); | ||
assert.equal(statements[2], 'out3'); | ||
}); | ||
it('should ignore single line comments', () => { | ||
const module = ` | ||
//$out1 = from a; | ||
$out2 = from b; // $out3 = from c; | ||
// $out4 = from c; | ||
`; | ||
const statements = getModuleStatements(module); | ||
assert.equal(statements.length, 1); | ||
assert.equal(statements[0], 'out2'); | ||
}); | ||
it('should ignore block comments', () => { | ||
const module = ` | ||
/*$out1 = from a; | ||
*/$out2 /* * */= from b; | ||
/* $out3 = from c;*/ | ||
`; | ||
const statements = getModuleStatements(module); | ||
assert.equal(statements.length, 1); | ||
assert.equal(statements[0], 'out2'); | ||
}); | ||
it('should handle complex comment, field, and function scenarios', () => { | ||
const module = ` | ||
$out1 = from [{s:1}] | ||
| eval foo = map([1,2], $it -> { | ||
$lp1 = 1; | ||
return $f; | ||
}); | ||
function func1() | ||
dataset ds1 { | ||
' | ||
$dsfield = ': "value" | ||
} | ||
function func2() { | ||
$p1 = 1; | ||
$p2 = $p1 + 1; | ||
return $p2 | ||
} $out2 = from [{s:2}] | where '$foo=bar'=2; | ||
$out3 /* $f1 = 1; | ||
$f2 = 2 | ||
*/ = from [{s:3}]; | ||
$out4 = from [{' | ||
$fieldval = ': "error"}];`; | ||
const statements = getModuleStatements(module); | ||
assert.equal(statements.length, 4); | ||
assert.equal(statements[0], 'out1'); | ||
assert.equal(statements[1], 'out2'); | ||
assert.equal(statements[2], 'out3'); | ||
assert.equal(statements[3], 'out4'); | ||
}); | ||
}); | ||
}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Where are we updating
prev
? I don't see any assignments toprev
, specifically nothing likeprev = next
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
great catch, it looks like I neglected to add a test that includes string and field escape chars too so let me do that