From 45c5ebc2cebbfd4832dce1b864d774628dc6f6bb Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 19 Dec 2023 10:04:34 -0500 Subject: [PATCH] match/matchx -> strmatch/strmatchx --- docs/src/manpage.md | 23 +++++++--- docs/src/manpage.txt | 23 +++++++--- docs/src/reference-dsl-builtin-functions.md | 18 +++++++- .../src/reference-main-regular-expressions.md | 42 +++++++++++++++++++ man/manpage.txt | 23 +++++++--- man/mlr.1 | 35 +++++++++++++--- pkg/bifs/regex.go | 16 +++---- pkg/dsl/cst/builtin_function_manager.go | 38 ++++++++--------- pkg/lib/regex.go | 4 +- test/cases/dsl-match/0001/mlr | 2 +- test/cases/dsl-match/0002/mlr | 2 +- 11 files changed, 170 insertions(+), 56 deletions(-) diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 283d6dd976..67e07093fa 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -226,12 +226,13 @@ MILLER(1) MILLER(1) roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime - strfntime_local strftime strftime_local string strip strlen strpntime - strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2 - sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate - typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement - urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - . - .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ + strfntime_local strftime strftime_local string strip strlen strmatch strmatchx + strpntime strpntime_local strptime strptime_local sub substr substr0 substr1 + sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper + truncate typeof unflatten unformat unformatx upntime uptime urand urand32 + urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % & + && * ** + - . 
.* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | + || ~ 1mCOMMENTS-IN-DATA FLAGS0m Miller lets you put comments in your data, such as @@ -2996,6 +2997,16 @@ MILLER(1) MILLER(1) 1mstrlen0m (class=string #args=1) String length. + 1mstrmatch0m + (class=string #args=2) TODO: WRITE ME + Example: + TODO: WRITE ME + + 1mstrmatchx0m + (class=string #args=2) TODO: WRITE ME + Example: + TODO: WRITE ME + 1mstrpntime0m (class=time #args=2) strpntime: Parses timestamp as integer nanoseconds since the epoch. See also strpntime_local. Examples: diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index b79cc6bca0..abcb77db81 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -205,12 +205,13 @@ MILLER(1) MILLER(1) roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime - strfntime_local strftime strftime_local string strip strlen strpntime - strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2 - sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate - typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement - urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - . - .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ + strfntime_local strftime strftime_local string strip strlen strmatch strmatchx + strpntime strpntime_local strptime strptime_local sub substr substr0 substr1 + sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper + truncate typeof unflatten unformat unformatx upntime uptime urand urand32 + urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % & + && * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? 
^ ^^ | + || ~ 1mCOMMENTS-IN-DATA FLAGS0m Miller lets you put comments in your data, such as @@ -2975,6 +2976,16 @@ MILLER(1) MILLER(1) 1mstrlen0m (class=string #args=1) String length. + 1mstrmatch0m + (class=string #args=2) TODO: WRITE ME + Example: + TODO: WRITE ME + + 1mstrmatchx0m + (class=string #args=2) TODO: WRITE ME + Example: + TODO: WRITE ME + 1mstrpntime0m (class=time #args=2) strpntime: Parses timestamp as integer nanoseconds since the epoch. See also strpntime_local. Examples: diff --git a/docs/src/reference-dsl-builtin-functions.md b/docs/src/reference-dsl-builtin-functions.md index 8c3b496407..4a9d45d3f2 100644 --- a/docs/src/reference-dsl-builtin-functions.md +++ b/docs/src/reference-dsl-builtin-functions.md @@ -75,7 +75,7 @@ is 2. Unary operators such as `!` and `~` show argument-count of 1; the ternary * [**Higher-order-functions functions**](#higher-order-functions-functions): [any](#any), [apply](#apply), [every](#every), [fold](#fold), [reduce](#reduce), [select](#select), [sort](#sort). * [**Math functions**](#math-functions): [abs](#abs), [acos](#acos), [acosh](#acosh), [asin](#asin), [asinh](#asinh), [atan](#atan), [atan2](#atan2), [atanh](#atanh), [cbrt](#cbrt), [ceil](#ceil), [cos](#cos), [cosh](#cosh), [erf](#erf), [erfc](#erfc), [exp](#exp), [expm1](#expm1), [floor](#floor), [invqnorm](#invqnorm), [log](#log), [log10](#log10), [log1p](#log1p), [logifit](#logifit), [max](#max), [min](#min), [qnorm](#qnorm), [round](#round), [roundm](#roundm), [sgn](#sgn), [sin](#sin), [sinh](#sinh), [sqrt](#sqrt), [tan](#tan), [tanh](#tanh), [urand](#urand), [urand32](#urand32), [urandelement](#urandelement), [urandint](#urandint), [urandrange](#urandrange). 
* [**Stats functions**](#stats-functions): [antimode](#antimode), [count](#count), [distinct_count](#distinct_count), [kurtosis](#kurtosis), [maxlen](#maxlen), [mean](#mean), [meaneb](#meaneb), [median](#median), [minlen](#minlen), [mode](#mode), [null_count](#null_count), [percentile](#percentile), [percentiles](#percentiles), [skewness](#skewness), [sort_collection](#sort_collection), [stddev](#stddev), [sum](#sum), [sum2](#sum2), [sum3](#sum3), [sum4](#sum4), [variance](#variance). -* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [contains](#contains), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot). +* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [contains](#contains), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [strmatch](#strmatch), [strmatchx](#strmatchx), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot). 
* [**System functions**](#system-functions): [exec](#exec), [hostname](#hostname), [os](#os), [system](#system), [version](#version). * [**Time functions**](#time-functions): [dhms2fsec](#dhms2fsec), [dhms2sec](#dhms2sec), [fsec2dhms](#fsec2dhms), [fsec2hms](#fsec2hms), [gmt2localtime](#gmt2localtime), [gmt2nsec](#gmt2nsec), [gmt2sec](#gmt2sec), [hms2fsec](#hms2fsec), [hms2sec](#hms2sec), [localtime2gmt](#localtime2gmt), [localtime2nsec](#localtime2nsec), [localtime2sec](#localtime2sec), [nsec2gmt](#nsec2gmt), [nsec2gmtdate](#nsec2gmtdate), [nsec2localdate](#nsec2localdate), [nsec2localtime](#nsec2localtime), [sec2dhms](#sec2dhms), [sec2gmt](#sec2gmt), [sec2gmtdate](#sec2gmtdate), [sec2hms](#sec2hms), [sec2localdate](#sec2localdate), [sec2localtime](#sec2localtime), [strfntime](#strfntime), [strfntime_local](#strfntime_local), [strftime](#strftime), [strftime_local](#strftime_local), [strpntime](#strpntime), [strpntime_local](#strpntime_local), [strptime](#strptime), [strptime_local](#strptime_local), [sysntime](#sysntime), [systime](#systime), [systimeint](#systimeint), [upntime](#upntime), [uptime](#uptime). 
* [**Typing functions**](#typing-functions): [asserting_absent](#asserting_absent), [asserting_array](#asserting_array), [asserting_bool](#asserting_bool), [asserting_boolean](#asserting_boolean), [asserting_empty](#asserting_empty), [asserting_empty_map](#asserting_empty_map), [asserting_error](#asserting_error), [asserting_float](#asserting_float), [asserting_int](#asserting_int), [asserting_map](#asserting_map), [asserting_nonempty_map](#asserting_nonempty_map), [asserting_not_array](#asserting_not_array), [asserting_not_empty](#asserting_not_empty), [asserting_not_map](#asserting_not_map), [asserting_not_null](#asserting_not_null), [asserting_null](#asserting_null), [asserting_numeric](#asserting_numeric), [asserting_present](#asserting_present), [asserting_string](#asserting_string), [is_absent](#is_absent), [is_array](#is_array), [is_bool](#is_bool), [is_boolean](#is_boolean), [is_empty](#is_empty), [is_empty_map](#is_empty_map), [is_error](#is_error), [is_float](#is_float), [is_int](#is_int), [is_map](#is_map), [is_nan](#is_nan), [is_nonempty_map](#is_nonempty_map), [is_not_array](#is_not_array), [is_not_empty](#is_not_empty), [is_not_map](#is_not_map), [is_not_null](#is_not_null), [is_null](#is_null), [is_numeric](#is_numeric), [is_present](#is_present), [is_string](#is_string), [typeof](#typeof). @@ -1350,6 +1350,22 @@ strlen (class=string #args=1) String length. +### strmatch +
+strmatch  (class=string #args=2) TODO: WRITE ME
+Example:
+TODO: WRITE ME
+
+ + +### strmatchx +
+strmatchx  (class=string #args=2) TODO: WRITE ME
+Example:
+TODO: WRITE ME
+
+ + ### sub
 sub  (class=string #args=3) '$name = sub($name, "old", "new")': replace once (first match, if there are multiple matches), with support for regular expressions. Capture groups \1 through \9 in the new part are matched from (...) in the old part, and must be used within the same call to sub -- they don't persist for subsequent DSL statements. See also =~ and regextract. See also "Regular expressions" at https://miller.readthedocs.io.
diff --git a/docs/src/reference-main-regular-expressions.md b/docs/src/reference-main-regular-expressions.md
index c221c48dec..ba6d955ff7 100644
--- a/docs/src/reference-main-regular-expressions.md
+++ b/docs/src/reference-main-regular-expressions.md
@@ -103,6 +103,48 @@ Regex captures of the form `\0` through `\9` are supported as follows:
 
 * Up to nine matches are supported: `\1` through `\9`, while `\0` is the entire match string; `\15` is treated as `\1` followed by an unrelated `5`.
 
+## Resetting captures
+
+If you use `(...)` in your regular expression, then up to 9 matches are supported for the `=~`
+operator, and an arbitrary number of matches are supported for the `strmatchx` DSL function.
+
+* Before any match is done, `"\1"` etc. in a string evaluate to themselves.
+* After a successful match is done, `"\1"` etc. in a string evaluate to the matched substring.
+* After an unsuccessful match is done, `"\1"` etc. in a string evaluate to the empty string.
+* You can match against `null` to reset to the original state.
+
+
+mlr repl
+
+
+
+[mlr] "\1:\2"
+"\1:\2"
+
+[mlr] "abc" =~ "..."
+true
+
+[mlr] "\1:\2"
+":"
+
+[mlr] "abc" =~ "(.).(.)"
+true
+
+[mlr] "\1:\2"
+"a:c"
+
+[mlr] "abc" =~ "(.)x(.)"
+false
+
+[mlr] "\1:\2"
+":"
+
+[mlr] "abc" =~ null
+
+[mlr] "\1:\2"
+"\1:\2"
+
+ ## More information Regular expressions are those supported by the [Go regexp package](https://pkg.go.dev/regexp), which in turn are of type [RE2](https://github.com/google/re2/wiki/Syntax) except for `\C`: diff --git a/man/manpage.txt b/man/manpage.txt index b79cc6bca0..abcb77db81 100644 --- a/man/manpage.txt +++ b/man/manpage.txt @@ -205,12 +205,13 @@ MILLER(1) MILLER(1) roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime - strfntime_local strftime strftime_local string strip strlen strpntime - strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2 - sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate - typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement - urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - . - .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ + strfntime_local strftime strftime_local string strip strlen strmatch strmatchx + strpntime strpntime_local strptime strptime_local sub substr substr0 substr1 + sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper + truncate typeof unflatten unformat unformatx upntime uptime urand urand32 + urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % & + && * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | + || ~ 1mCOMMENTS-IN-DATA FLAGS0m Miller lets you put comments in your data, such as @@ -2975,6 +2976,16 @@ MILLER(1) MILLER(1) 1mstrlen0m (class=string #args=1) String length. + 1mstrmatch0m + (class=string #args=2) TODO: WRITE ME + Example: + TODO: WRITE ME + + 1mstrmatchx0m + (class=string #args=2) TODO: WRITE ME + Example: + TODO: WRITE ME + 1mstrpntime0m (class=time #args=2) strpntime: Parses timestamp as integer nanoseconds since the epoch. 
See also strpntime_local. Examples: diff --git a/man/mlr.1 b/man/mlr.1 index fd05c9f8d2..8aa70a52e4 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -252,12 +252,13 @@ percentiles pow qnorm reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime -strfntime_local strftime strftime_local string strip strlen strpntime -strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2 -sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate -typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement -urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - . -\&.* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ +strfntime_local strftime strftime_local string strip strlen strmatch strmatchx +strpntime strpntime_local strptime strptime_local sub substr substr0 substr1 +sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper +truncate typeof unflatten unformat unformatx upntime uptime urand urand32 +urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % & +&& * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? 
^ ^^ | +|| ~ .fi .if n \{\ .RE @@ -4650,6 +4651,28 @@ strftime_local(1440768801.7, "%Y-%m-%d %H:%M:%3S %z", "Asia/Istanbul") = "2015-0 .fi .if n \{\ .RE +.SS "strmatch" +.if n \{\ +.RS 0 +.\} +.nf + (class=string #args=2) TODO: WRITE ME +Example: +TODO: WRITE ME +.fi +.if n \{\ +.RE +.SS "strmatchx" +.if n \{\ +.RS 0 +.\} +.nf + (class=string #args=2) TODO: WRITE ME +Example: +TODO: WRITE ME +.fi +.if n \{\ +.RE .SS "strpntime" .if n \{\ .RS 0 diff --git a/pkg/bifs/regex.go b/pkg/bifs/regex.go index 7470c441a7..211d6ded8d 100644 --- a/pkg/bifs/regex.go +++ b/pkg/bifs/regex.go @@ -115,16 +115,16 @@ func BIF_gsub(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval { return mlrval.FromString(stringOutput) } -func BIF_match(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { +func BIF_strmatch(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { if !input1.IsLegit() { - return mlrval.FromNotStringError("match", input1) // TODO: CHANGE FLAVOR + return mlrval.FromNotStringError("strmatch", input1) // TODO: CHANGE FLAVOR } if !input2.IsLegit() { - return mlrval.FromNotStringError("match", input2) // TODO: CHANGE FLAVOR + return mlrval.FromNotStringError("strmatch", input2) // TODO: CHANGE FLAVOR } input1string := input1.String() if !input2.IsStringOrVoid() { - return mlrval.FromNotStringError("match", input2) + return mlrval.FromNotStringError("strmatch", input2) } boolOutput := lib.RegexStringMatchSimple(input1string, input2.AcquireStringValue()) @@ -132,16 +132,16 @@ func BIF_match(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { return mlrval.FromBool(boolOutput) } -func BIF_matchx(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { +func BIF_strmatchx(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { if !input1.IsLegit() { - return mlrval.FromNotStringError("matchx", input1) // TODO: CHANGE FLAVOR + return mlrval.FromNotStringError("strmatchx", input1) // TODO: CHANGE FLAVOR } if !input2.IsLegit() { - return mlrval.FromNotStringError("matchx", input2) // TODO: CHANGE FLAVOR + 
return mlrval.FromNotStringError("strmatchx", input2) // TODO: CHANGE FLAVOR } input1string := input1.String() if !input2.IsStringOrVoid() { - return mlrval.FromNotStringError("matchx", input2) + return mlrval.FromNotStringError("strmatchx", input2) } boolOutput, captures, starts, ends := lib.RegexStringMatchWithMapResults(input1string, input2.AcquireStringValue()) diff --git a/pkg/dsl/cst/builtin_function_manager.go b/pkg/dsl/cst/builtin_function_manager.go index 02a2fdf77e..c9bae293bb 100644 --- a/pkg/dsl/cst/builtin_function_manager.go +++ b/pkg/dsl/cst/builtin_function_manager.go @@ -338,25 +338,25 @@ used within subsequent DSL statements. See also "Regular expressions" at ` + lib regexCaptureBinaryFunc: bifs.BIF_string_does_not_match_regexp, }, - { - name: "match", - class: FUNC_CLASS_STRING, - help: `TODO: WRITE ME`, - examples: []string{ - `TODO: WRITE ME`, - }, - binaryFunc: bifs.BIF_match, - }, - - { - name: "matchx", - class: FUNC_CLASS_STRING, - help: `TODO: WRITE ME`, - examples: []string{ - `TODO: WRITE ME`, - }, - binaryFunc: bifs.BIF_matchx, - }, + { + name: "strmatch", + class: FUNC_CLASS_STRING, + help: `TODO: WRITE ME`, + examples: []string{ + `TODO: WRITE ME`, + }, + binaryFunc: bifs.BIF_strmatch, + }, + + { + name: "strmatchx", + class: FUNC_CLASS_STRING, + help: `TODO: WRITE ME`, + examples: []string{ + `TODO: WRITE ME`, + }, + binaryFunc: bifs.BIF_strmatchx, + }, { name: "&&", diff --git a/pkg/lib/regex.go b/pkg/lib/regex.go index 56baa21081..af0a188059 100644 --- a/pkg/lib/regex.go +++ b/pkg/lib/regex.go @@ -15,7 +15,7 @@ // where the '=~' sets the captures and the "\2:\1" uses them. (Note that // https://github.com/johnkerl/miller/issues/388 has a better suggestion which would make the // captures explicit as variables, rather than implicit within CST state: this is implemented by -// the `match` and `matchx` DSL functions. Regardless, the `=~` syntax will still be supported +// the `strmatch` and `strmatchx` DSL functions. 
Regardless, the `=~` syntax will still be supported // for backward compatibility and so is here to stay.) Here we make use of Go regexp-library // functions to write to, and then later interpolate from, a captures array which is stored within // CST state. (See the `runtime.State` object.) @@ -293,7 +293,7 @@ func RegexCompiledMatchSimple( return regex.Match([]byte(input)) } -// RegexStringMatchWithMapResults implements much of the `matchx` DSL function. This returns +// RegexStringMatchWithMapResults implements much of the `strmatchx` DSL function. This returns // captures via return values. This is distinct from RegexStringMatchWithCaptures which is for the // `=~` DSL operator. func RegexStringMatchWithMapResults( diff --git a/test/cases/dsl-match/0001/mlr b/test/cases/dsl-match/0001/mlr index f7c6a94416..9b015fdb78 100644 --- a/test/cases/dsl-match/0001/mlr +++ b/test/cases/dsl-match/0001/mlr @@ -1 +1 @@ -$z = match($x, $y) +$z = strmatch($x, $y) diff --git a/test/cases/dsl-match/0002/mlr b/test/cases/dsl-match/0002/mlr index 0cf77e39e0..184b3e286e 100644 --- a/test/cases/dsl-match/0002/mlr +++ b/test/cases/dsl-match/0002/mlr @@ -1 +1 @@ -$z = matchx($x, $y) +$z = strmatchx($x, $y)