Skip to content

Commit f96f418

Browse files
committed
feature: add streaming interface
This allows us to partially match a string and then resume a match from where we've ended.
1 parent 0c89c14 commit f96f418

File tree

6 files changed

+481
-1
lines changed

6 files changed

+481
-1
lines changed

lib/compile.ml

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,238 @@ let make_match_str re positions ~len ~groups ~partial s ~pos =
371371
else final_boundary_check re positions ~last ~slen s state_info ~groups
372372
;;
373373

374+
module Stream = struct
375+
type nonrec t =
376+
{ state : State.t
377+
; re : re
378+
}
379+
380+
type 'a feed =
381+
| Ok of 'a
382+
| No_match
383+
384+
let create re =
385+
let category = Category.(search_boundary ++ inexistant) in
386+
let state = find_initial_state re category in
387+
{ state; re }
388+
;;
389+
390+
let feed t s ~pos ~len =
391+
let last = pos + len in
392+
let state = loop_no_mark t.re ~colors:t.re.colors s ~last ~pos t.state t.state in
393+
let info = State.get_info state in
394+
if Idx.is_break info.idx
395+
&&
396+
match Automata.State.status info.desc with
397+
| Failed -> true
398+
| Match _ | Running -> false
399+
then No_match
400+
else Ok { t with state }
401+
;;
402+
403+
let finalize t s ~pos ~len =
404+
let last = pos + len in
405+
let state = scan_str t.re Positions.empty s t.state ~last ~pos ~groups:false in
406+
let info = State.get_info state in
407+
match
408+
let _idx, res =
409+
let final_cat = Category.(search_boundary ++ inexistant) in
410+
final t.re Positions.empty info final_cat
411+
in
412+
res
413+
with
414+
| Running | Failed -> false
415+
| Match _ -> true
416+
;;
417+
418+
module Group = struct
419+
type slice =
420+
{ s : string
421+
; pos : int
422+
; len : int
423+
}
424+
425+
module Slices = struct
426+
type t = slice list
427+
428+
let get_substring slices ~start ~stop =
429+
if stop = start
430+
then ""
431+
else (
432+
let slices =
433+
let rec drop slices remains =
434+
if remains = 0
435+
then slices
436+
else (
437+
match slices with
438+
| [] -> assert false
439+
| ({ s = _; pos; len } as slice) :: xs ->
440+
let remains' = remains - len in
441+
if remains' >= 0
442+
then drop xs remains'
443+
else (
444+
let pos = pos + remains in
445+
let len = len - remains in
446+
{ slice with pos; len } :: xs))
447+
in
448+
drop slices start
449+
in
450+
let buf = Buffer.create (stop - start) in
451+
let rec take slices remains =
452+
if remains > 0
453+
then (
454+
match slices with
455+
| [] -> assert false
456+
| { s; pos; len } :: xs ->
457+
let remains' = remains - len in
458+
if remains' > 0
459+
then (
460+
Buffer.add_substring buf s pos len;
461+
take xs remains')
462+
else Buffer.add_substring buf s pos remains)
463+
in
464+
take slices (stop - start);
465+
Buffer.contents buf)
466+
;;
467+
468+
let rec drop t remains =
469+
if remains = 0
470+
then t
471+
else (
472+
match t with
473+
| [] -> []
474+
| ({ s = _; pos; len } as slice) :: t ->
475+
if remains >= len
476+
then drop t (remains - len)
477+
else (
478+
let delta = len - remains in
479+
{ slice with pos = pos + delta; len = len - delta } :: t))
480+
;;
481+
482+
let drop_rev t remains =
483+
if remains = 0 then t else List.rev (drop (List.rev t) remains)
484+
;;
485+
end
486+
487+
type nonrec t =
488+
{ t : t
489+
; positions : Positions.t
490+
; slices : Slices.t
491+
; abs_pos : int
492+
; first_match_pos : int
493+
}
494+
495+
let create t =
496+
{ t
497+
; positions = Positions.make ~groups:true t.re
498+
; slices = []
499+
; abs_pos = 0
500+
; first_match_pos = 0
501+
}
502+
;;
503+
504+
module Match = struct
505+
type t =
506+
{ pmarks : Pmark.Set.t
507+
; slices : slice list
508+
; marks : Mark_infos.t
509+
; positions : int array
510+
; start_pos : int
511+
}
512+
513+
let test_mark t mark = Pmark.Set.mem mark t.pmarks
514+
515+
let get t i =
516+
Mark_infos.offset t.marks i
517+
|> Option.map (fun (start, stop) ->
518+
let start = t.positions.(start) - t.start_pos in
519+
let stop = t.positions.(stop) - t.start_pos in
520+
Slices.get_substring t.slices ~start ~stop)
521+
;;
522+
523+
let make ~start_pos ~pmarks ~slices ~marks ~positions =
524+
let positions = Positions.all positions in
525+
{ pmarks; slices; positions; marks; start_pos }
526+
;;
527+
end
528+
529+
let rec loop re ~abs_pos ~colors ~positions s ~pos ~last st0 st =
530+
if pos < last
531+
then (
532+
let st' = next colors st s pos in
533+
let idx = (State.get_info st').idx in
534+
if Idx.is_idx idx
535+
then (
536+
Positions.set positions (Idx.idx idx) (abs_pos + pos);
537+
loop re ~abs_pos ~colors ~positions s ~pos:(pos + 1) ~last st' st')
538+
else if Idx.is_break idx
539+
then (
540+
Positions.set positions (Idx.break_idx idx) (abs_pos + pos);
541+
st')
542+
else (
543+
(* Unknown *)
544+
validate re positions s ~pos st0;
545+
loop re ~abs_pos ~colors ~positions s ~pos ~last st0 st0))
546+
else st
547+
;;
548+
549+
let feed ({ t; positions; slices; abs_pos; first_match_pos = _ } as tt) s ~pos ~len =
550+
let last = pos + len in
551+
let state =
552+
loop t.re ~abs_pos ~colors:t.re.colors s ~positions ~last ~pos t.state t.state
553+
in
554+
let info = State.get_info state in
555+
if Idx.is_break info.idx
556+
&&
557+
match Automata.State.status info.desc with
558+
| Failed -> true
559+
| Match _ | Running -> false
560+
then No_match
561+
else (
562+
let t = { t with state } in
563+
let slices = { s; pos; len } :: slices in
564+
let first_match_pos = Positions.first positions in
565+
let slices = Slices.drop_rev slices (first_match_pos - tt.first_match_pos) in
566+
let abs_pos = abs_pos + len in
567+
Ok { tt with t; slices; abs_pos; first_match_pos })
568+
;;
569+
570+
let finalize
571+
({ t; positions; slices; abs_pos; first_match_pos = _ } as tt)
572+
s
573+
~pos
574+
~len
575+
: Match.t feed
576+
=
577+
let last = pos + len in
578+
let state =
579+
loop t.re ~abs_pos ~colors:t.re.colors s ~positions ~last ~pos t.state t.state
580+
in
581+
let info = State.get_info state in
582+
match
583+
match Automata.State.status info.desc with
584+
| (Match _ | Failed) as s -> s
585+
| Running ->
586+
let idx, res =
587+
let final_cat = Category.(search_boundary ++ inexistant) in
588+
final t.re positions info final_cat
589+
in
590+
(match res with
591+
| Running | Failed -> ()
592+
| Match _ -> Positions.set positions (Automata.Idx.to_int idx) (abs_pos + last));
593+
res
594+
with
595+
| Running | Failed -> No_match
596+
| Match (marks, pmarks) ->
597+
let first_match_position = Positions.first positions in
598+
let slices = { s; pos; len } :: slices in
599+
let slices = Slices.drop_rev slices (first_match_position - tt.first_match_pos) in
600+
let slices = List.rev slices in
601+
Ok (Match.make ~start_pos:first_match_position ~pmarks ~marks ~slices ~positions)
602+
;;
603+
end
604+
end
605+
374606
let match_str_no_bounds ~groups ~partial re s ~pos ~len =
375607
let positions = Positions.make ~groups re in
376608
match make_match_str re positions ~len ~groups ~partial s ~pos with

lib/compile.mli

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,33 @@
11
type re
22

3+
module Stream : sig
4+
type t
5+
6+
type 'a feed =
7+
| Ok of 'a
8+
| No_match
9+
10+
val create : re -> t
11+
val feed : t -> string -> pos:int -> len:int -> t feed
12+
val finalize : t -> string -> pos:int -> len:int -> bool
13+
14+
module Group : sig
15+
type stream := t
16+
type t
17+
18+
module Match : sig
19+
type t
20+
21+
val get : t -> int -> string option
22+
val test_mark : t -> Pmark.t -> bool
23+
end
24+
25+
val create : stream -> t
26+
val feed : t -> string -> pos:int -> len:int -> t feed
27+
val finalize : t -> string -> pos:int -> len:int -> Match.t feed
28+
end
29+
end
30+
331
type match_info =
432
| Match of Group.t
533
| Failed

lib/core.ml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,3 +170,4 @@ include struct
170170
end
171171

172172
module Seq = Search
173+
module Stream = Compile.Stream

lib/core.mli

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ val exec_partial_detailed
215215
(** Marks *)
216216
module Mark : sig
217217
(** Mark id *)
218-
type t
218+
type t = Pmark.t
219219

220220
(** Tell if a mark was matched. *)
221221
val test : Group.t -> t -> bool
@@ -229,6 +229,34 @@ module Mark : sig
229229
val compare : t -> t -> int
230230
end
231231

232+
module Stream : sig
233+
type t
234+
235+
type 'a feed =
236+
| Ok of 'a
237+
| No_match
238+
239+
val create : re -> t
240+
val feed : t -> string -> pos:int -> len:int -> t feed
241+
val finalize : t -> string -> pos:int -> len:int -> bool
242+
243+
module Group : sig
244+
type stream := t
245+
type t
246+
247+
module Match : sig
248+
type t
249+
250+
val get : t -> int -> string option
251+
val test_mark : t -> Pmark.t -> bool
252+
end
253+
254+
val create : stream -> t
255+
val feed : t -> string -> pos:int -> len:int -> t feed
256+
val finalize : t -> string -> pos:int -> len:int -> Match.t feed
257+
end
258+
end
259+
232260
(** {2 High Level Operations} *)
233261

234262
type split_token =

lib_test/expect/dune

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
(library
22
(name re_tests)
3+
(modules import test_stream)
34
(libraries
45
re_private
56
;; This is because of the (implicit_transitive_deps false)

0 commit comments

Comments
 (0)