From 5237aa4846009608290a5a97c3e373d4bf2ad17d Mon Sep 17 00:00:00 2001 From: Rudi Grinberg Date: Wed, 11 Sep 2024 15:47:17 +0100 Subject: [PATCH] feature: add streaming interface This allows us to partially match a string and then resume a match from where we've ended. --- lib/compile.ml | 245 +++++++++++++++++++++++++++++++++ lib/compile.mli | 29 ++++ lib/core.ml | 1 + lib/core.mli | 30 +++- lib_test/expect/dune | 1 + lib_test/expect/test_stream.ml | 203 +++++++++++++++++++++++++++ 6 files changed, 508 insertions(+), 1 deletion(-) create mode 100644 lib_test/expect/test_stream.ml diff --git a/lib/compile.ml b/lib/compile.ml index 915d5b2a..22458582 100644 --- a/lib/compile.ml +++ b/lib/compile.ml @@ -371,6 +371,251 @@ let make_match_str re positions ~len ~groups ~partial s ~pos = else final_boundary_check re positions ~last ~slen s state_info ~groups ;; +module Stream = struct + type nonrec t = + { state : State.t + ; re : re + } + + type 'a feed = + | Ok of 'a + | No_match + + let create re = + let category = Category.(search_boundary ++ inexistant) in + let state = find_initial_state re category in + { state; re } + ;; + + let feed t s ~pos ~len = + (* TODO bound checks? *) + let last = pos + len in + let state = loop_no_mark t.re ~colors:t.re.colors s ~last ~pos t.state t.state in + let info = State.get_info state in + if Idx.is_break info.idx + && + match Automata.State.status info.desc with + | Failed -> true + | Match _ | Running -> false + then No_match + else Ok { t with state } + ;; + + let finalize t s ~pos ~len = + (* TODO bound checks? *) + let last = pos + len in + let state = scan_str t.re Positions.empty s t.state ~last ~pos ~groups:false in + let info = State.get_info state in + match + let _idx, res = + let final_cat = Category.(search_boundary ++ inexistant) in + final t.re Positions.empty info final_cat + in + res + with + | Running | Failed -> false + | Match _ -> true + ;; + + module Group = struct + type slice = + { s : string + ; pos : int + ; len : int + } + + module Slices = struct + type t = slice list + + let get_substring slices ~start ~stop = + if stop = start + then "" + else ( + let slices = + let rec drop slices remains = + if remains = 0 + then slices + else ( + match slices with + | [] -> assert false + | ({ s = _; pos; len } as slice) :: xs -> + let remains' = remains - len in + if remains' >= 0 + then drop xs remains' + else ( + let pos = pos + remains in + let len = len - remains in + { slice with pos; len } :: xs)) + in + drop slices start + in + let buf = Buffer.create (stop - start) in + let rec take slices remains = + if remains > 0 + then ( + match slices with + | [] -> assert false + | { s; pos; len } :: xs -> + let remains' = remains - len in + if remains' > 0 + then ( + Buffer.add_substring buf s pos len; + take xs remains') + else Buffer.add_substring buf s pos remains) + in + take slices (stop - start); + Buffer.contents buf) + ;; + + let rec drop t remains = + if remains = 0 + then t + else ( + match t with + | [] -> [] + | ({ s = _; pos; len } as slice) :: t -> + if remains >= len + then drop t (remains - len) + else ( + let delta = len - remains in + { slice with pos = pos + delta; len = len - delta } :: t)) + ;; + + let drop_rev t remains = + (* TODO Use a proper functional queue *) + if remains = 0 then t else List.rev (drop (List.rev t) remains) + ;; + end + + type nonrec t = + { t : t + ; positions : Positions.t + ; slices : Slices.t + ; abs_pos : int + ; first_match_pos : int + } + + let no_match_starts_before t = t.first_match_pos + + let create t = + { t + ; positions = Positions.make ~groups:true t.re + ; slices = [] + ; abs_pos = 0 + ; first_match_pos = 0 + } + ;; + + module Match = struct + type t = + { pmarks : Pmark.Set.t + ; slices : slice list + ; marks : Mark_infos.t + ; positions : int array + ; start_pos : int + } + + let test_mark t mark = Pmark.Set.mem mark t.pmarks + + let get t i = + Mark_infos.offset t.marks i + |> Option.map (fun (start, stop) -> + let start = t.positions.(start) - t.start_pos in + let stop = t.positions.(stop) - t.start_pos in + Slices.get_substring t.slices ~start ~stop) + ;; + + let make ~start_pos ~pmarks ~slices ~marks ~positions = + let positions = Positions.all positions in + { pmarks; slices; positions; marks; start_pos } + ;; + end + + let rec loop re ~abs_pos ~colors ~positions s ~pos ~last st0 st = + if pos < last + then ( + let st' = next colors st s pos in + let idx = (State.get_info st').idx in + if Idx.is_idx idx + then ( + Positions.set positions (Idx.idx idx) (abs_pos + pos); + loop re ~abs_pos ~colors ~positions s ~pos:(pos + 1) ~last st' st') + else if Idx.is_break idx + then ( + Positions.set positions (Idx.break_idx idx) (abs_pos + pos); + st') + else ( + (* Unknown *) + validate re positions s ~pos st0; + loop re ~abs_pos ~colors ~positions s ~pos ~last st0 st0)) + else st + ;; + + let feed ({ t; positions; slices; abs_pos; first_match_pos = _ } as tt) s ~pos ~len = + let state = + (* TODO bound checks? *) + let last = pos + len in + loop t.re ~abs_pos ~colors:t.re.colors s ~positions ~last ~pos t.state t.state + in + let info = State.get_info state in + if Idx.is_break info.idx + && + match Automata.State.status info.desc with + | Failed -> true + | Match _ | Running -> false + then No_match + else ( + let t = { t with state } in + let slices = { s; pos; len } :: slices in + let first_match_pos = Positions.first positions in + let slices = Slices.drop_rev slices (first_match_pos - tt.first_match_pos) in + let abs_pos = abs_pos + len in + Ok { tt with t; slices; abs_pos; first_match_pos }) + ;; + + let finalize + ({ t; positions; slices; abs_pos; first_match_pos = _ } as tt) + s + ~pos + ~len + : Match.t feed + = + (* TODO bound checks? *) + let last = pos + len in + let info = + let state = + loop t.re ~abs_pos ~colors:t.re.colors s ~positions ~last ~pos t.state t.state + in + State.get_info state + in + match + match Automata.State.status info.desc with + | (Match _ | Failed) as s -> s + | Running -> + let idx, res = + let final_cat = Category.(search_boundary ++ inexistant) in + final t.re positions info final_cat + in + (match res with + | Running | Failed -> () + | Match _ -> Positions.set positions (Automata.Idx.to_int idx) (abs_pos + last)); + res + with + | Running | Failed -> No_match + | Match (marks, pmarks) -> + let first_match_position = Positions.first positions in + let slices = + let slices = + let slices = { s; pos; len } :: slices in + Slices.drop_rev slices (first_match_position - tt.first_match_pos) + in + List.rev slices + in + Ok (Match.make ~start_pos:first_match_position ~pmarks ~marks ~slices ~positions) + ;; + end +end + let match_str_no_bounds ~groups ~partial re s ~pos ~len = let positions = Positions.make ~groups re in match make_match_str re positions ~len ~groups ~partial s ~pos with diff --git a/lib/compile.mli b/lib/compile.mli index 2b77a994..5e1bde3c 100644 --- a/lib/compile.mli +++ b/lib/compile.mli @@ -1,5 +1,34 @@ type re +module Stream : sig + type t + + type 'a feed = + | Ok of 'a + | No_match + + val create : re -> t + val feed : t -> string -> pos:int -> len:int -> t feed + val finalize : t -> string -> pos:int -> len:int -> bool + + module Group : sig + type stream := t + type t + + module Match : sig + type t + + val get : t -> int -> string option + val test_mark : t -> Pmark.t -> bool + end + + val create : stream -> t + val feed : t -> string -> pos:int -> len:int -> t feed + val finalize : t -> string -> pos:int -> len:int -> Match.t feed + val no_match_starts_before : t -> int + end +end + type match_info = | Match of Group.t | Failed diff --git a/lib/core.ml b/lib/core.ml index b92b9ddb..fd114c2e 100644 --- a/lib/core.ml +++ b/lib/core.ml @@ -170,3 +170,4 @@ include struct end module Seq = Search +module Stream = Compile.Stream diff --git a/lib/core.mli b/lib/core.mli index 94ee7754..ec4164a9 100644 --- a/lib/core.mli +++ b/lib/core.mli @@ -215,7 +215,7 @@ val exec_partial_detailed (** Marks *) module Mark : sig (** Mark id *) - type t + type t = Pmark.t (** Tell if a mark was matched. *) val test : Group.t -> t -> bool @@ -229,6 +229,34 @@ module Mark : sig val compare : t -> t -> int end +module Stream : sig + type t + + type 'a feed = + | Ok of 'a + | No_match + + val create : re -> t + val feed : t -> string -> pos:int -> len:int -> t feed + val finalize : t -> string -> pos:int -> len:int -> bool + + module Group : sig + type stream := t + type t + + module Match : sig + type t + + val get : t -> int -> string option + val test_mark : t -> Pmark.t -> bool + end + + val create : stream -> t + val feed : t -> string -> pos:int -> len:int -> t feed + val finalize : t -> string -> pos:int -> len:int -> Match.t feed + end +end + (** {2 High Level Operations} *) type split_token = diff --git a/lib_test/expect/dune b/lib_test/expect/dune index 6edf846a..b2d4bf4a 100644 --- a/lib_test/expect/dune +++ b/lib_test/expect/dune @@ -1,5 +1,6 @@ (library (name re_tests) + (modules import test_stream) (libraries re_private ;; This is because of the (implicit_transitive_deps false) diff --git a/lib_test/expect/test_stream.ml b/lib_test/expect/test_stream.ml new file mode 100644 index 00000000..beb68fd3 --- /dev/null +++ b/lib_test/expect/test_stream.ml @@ -0,0 +1,203 @@ +open Import +module Stream = Re.Stream + +let feed t str = + let res = Stream.feed t str ~pos:0 ~len:(String.length str) in + let () = + match res with + | No_match -> Printf.printf "%S did not match\n" str + | Ok s -> + let status = + match Stream.finalize s "" ~pos:0 ~len:0 with + | true -> "matched" + | false -> "unmatched" + in + Printf.printf "%S not matched (status = %s)\n" str status + in + res +;; + +let%expect_test "out out of bounds" = + let stream = Re.any |> Re.compile |> Stream.create in + invalid_argument (fun () -> ignore (Stream.feed stream "foo" ~pos:2 ~len:3)); + [%expect {| Invalid_argument "index out of bounds" |}]; + invalid_argument (fun () -> ignore (Stream.finalize stream "foo" ~pos:2 ~len:3)); + [%expect {| Invalid_argument "index out of bounds" |}]; + let stream = Stream.Group.create stream in + invalid_argument (fun () -> ignore (Stream.Group.feed stream "foo" ~pos:2 ~len:3)); + [%expect {| Invalid_argument "index out of bounds" |}]; + invalid_argument (fun () -> ignore (Stream.Group.finalize stream "foo" ~pos:2 ~len:3)); + [%expect {| Invalid_argument "index out of bounds" |}] +;; + +let%expect_test "basic" = + let s = [ Re.bos; Re.str "abab" ] |> Re.seq |> Re.compile |> Stream.create in + ignore (feed s "x"); + [%expect {| "x" did not match |}]; + let suffix = "ab" in + let s = + match feed s suffix with + | Ok s -> s + | No_match -> assert false + in + [%expect {| + "ab" not matched (status = unmatched) |}]; + (let (_ : _ Stream.feed) = feed s "ab" in + [%expect {| + "ab" not matched (status = matched) |}]); + let (_ : _ Stream.feed) = feed s "xy" in + [%expect {| + "xy" did not match |}] +;; + +let%expect_test "eos" = + let s = [ Re.str "zzz"; Re.eos ] |> Re.seq |> Re.compile |> Stream.create in + ignore (feed s "zzz"); + [%expect {| "zzz" not matched (status = matched) |}]; + let s = + match feed s "z" with + | Ok s -> s + | No_match -> assert false + in + [%expect {| "z" not matched (status = unmatched) |}]; + (let str = "zz" in + match Stream.finalize s str ~pos:0 ~len:(String.length str) with + | true -> () + | false -> assert false); + [%expect {||}] +;; + +let%expect_test "finalize empty" = + let s = "abde" in + let stream = + let stream = Re.str s |> Re.whole_string |> Re.compile |> Stream.create in + match feed stream s with + | Ok s -> s + | No_match -> assert false + in + assert (Stream.finalize stream "" ~pos:0 ~len:0); + [%expect {| "abde" not matched (status = matched) |}] +;; + +let%expect_test "group - basic" = + let s = + let open Re in + str "foo" |> whole_string |> group |> compile |> Stream.create + in + let g = Stream.Group.create s in + let g = + match Stream.Group.feed g "f" ~pos:0 ~len:1 with + | No_match -> assert false + | Ok s -> s + in + (match Stream.Group.finalize g "oo" ~pos:0 ~len:2 with + | Ok _ -> () + | No_match -> assert false); + [%expect {| |}] +;; + +let pmarks set m = + Printf.printf "mark present %b\n" (Re.Stream.Group.Match.test_mark set m) +;; + +let%expect_test "group - mark entire string must match" = + let m1, f = Re.(mark (char 'f')) in + let m2, oo = Re.(mark (str "oo")) in + let re = + let open Re in + [ f; oo ] |> seq |> compile + in + let s = Stream.create re in + let g = Stream.Group.create s in + let g = + match Stream.Group.feed g "f" ~pos:0 ~len:1 with + | No_match -> assert false + | Ok s -> s + in + let g = + match Stream.Group.finalize g "oo" ~pos:0 ~len:2 with + | Ok g -> g + | No_match -> assert false + in + pmarks g m1; + [%expect {| mark present true |}]; + pmarks g m2; + [%expect {| mark present true |}] +;; + +let%expect_test "group - partial mark match" = + let m, foo = Re.(mark (str "foo")) in + let re = Re.compile foo in + let s = Stream.create re in + let g = Stream.Group.create s in + let g = + match Stream.Group.feed g "xx" ~pos:0 ~len:2 with + | No_match -> assert false + | Ok g -> g + in + let g = + match Stream.Group.feed g "foo" ~pos:0 ~len:3 with + | Ok g -> g + | No_match -> assert false + in + let g = + match Stream.Group.finalize g "garb" ~pos:0 ~len:4 with + | Ok g -> g + | No_match -> assert false + in + pmarks g m; + [%expect {| mark present true |}] +;; + +let print_match match_ n = + match Stream.Group.Match.get match_ n with + | None -> Printf.printf "match %d: \n" n + | Some s -> Printf.printf "match %d: %s\n" n s +;; + +let%expect_test "group - match group" = + let stream = + let re = Re.Pcre.re "_([a-z]+)_" |> Re.whole_string |> Re.compile in + Stream.Group.create (Stream.create re) + in + let s = "_abc_" in + let () = + match Stream.Group.finalize stream s ~pos:0 ~len:(String.length s) with + | No_match -> assert false + | Ok m -> + for i = 0 to 1 do + print_match m i + done + in + [%expect {| + match 0: _abc_ + match 1: abc + |}] +;; + +let%expect_test "group - match group" = + let stream = + let re = Re.Pcre.re "_([a-z]+)__([a-z]+)_" |> Re.whole_string |> Re.compile in + Stream.Group.create (Stream.create re) + in + let s = "_abc_" in + let stream = + match Stream.Group.feed stream s ~pos:0 ~len:(String.length s) with + | No_match -> assert false + | Ok m -> m + in + let s = "_de_" in + let () = + match Stream.Group.finalize stream s ~pos:0 ~len:(String.length s) with + | No_match -> assert false + | Ok m -> + for i = 0 to 2 do + print_match m i + done + in + [%expect {| + match 0: _abc__de_ + match 1: abc + match 2: de + |}] +;;