Skip to content

Commit

Permalink
Merge pull request #233 from ocaml/empty-matches
Browse files Browse the repository at this point in the history
Replacement and splitting functions: fixes and improvement
  • Loading branch information
rgrinberg authored Mar 25, 2024
2 parents 55f8e7d + 2cef6a0 commit 5a9c9bf
Show file tree
Hide file tree
Showing 6 changed files with 253 additions and 59 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
Unreleased
----------
* Add `Re.split_delim` (#233)
* Fix handling of empty matches in splitting and substitution functions (#233)

1.11.0 (19-Aug-2023)
--------------------

Expand Down
87 changes: 57 additions & 30 deletions lib/core.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1002,20 +1002,24 @@ module Rseq = struct
in
(* iterate on matches. When a match is found, search for the next
one just after its end *)
let rec aux pos () =
if pos >= limit
let rec aux pos on_match () =
if pos > limit
then Seq.Nil (* no more matches *)
else
match match_str ~groups:true ~partial:false re s
~pos ~len:(limit - pos) with
| Match substr ->
let p1, p2 = Group.offset substr 0 in
let pos = if p1=p2 then p2+1 else p2 in
Seq.Cons (substr, aux pos)
if on_match && p1 = pos && p1 = p2 then
(* skip empty match right after a match *)
aux (pos + 1) false ()
else
let pos = if p1=p2 then p2+1 else p2 in
Seq.Cons (substr, aux pos (p1 <> p2))
| Running _
| Failed -> Seq.Nil
in
aux pos
aux pos false

let matches ?pos ?len re s : _ Seq.t =
all ?pos ?len re s
Expand All @@ -1034,11 +1038,10 @@ module Rseq = struct
limit: first index we ignore (!pos < limit is an invariant) *)
let pos0 = pos in
let rec aux state i pos () = match state with
| `Idle when pos >= limit ->
if i < limit then (
let sub = String.sub s i (limit - i) in
Seq.Cons (`Text sub, aux state (i+1) pos)
) else Seq.Nil
| `Idle when pos > limit ->
(* We had an empty match at the end of the string *)
assert (i = limit);
Seq.Nil
| `Idle ->
begin match match_str ~groups:true ~partial:false re s ~pos
~len:(limit - pos) with
Expand All @@ -1047,7 +1050,10 @@ module Rseq = struct
let pos = if p1=p2 then p2+1 else p2 in
let old_i = i in
let i = p2 in
if p1 > pos0 then (
if old_i = p1 && p1 = p2 && p1 > pos0 then
(* Skip empty match right after a delimiter *)
aux state i pos ()
else if p1 > pos0 then (
(* string does not start by a delimiter *)
let text = String.sub s old_i (p1 - old_i) in
let state = `Yield (`Delim substr) in
Expand Down Expand Up @@ -1075,6 +1081,16 @@ module Rseq = struct
| Seq.Cons (`Delim _, tl) -> filter tl ()
| Seq.Cons (`Text s,tl) -> Seq.Cons (s, filter tl)
in filter seq

let split_delim ?pos ?len re s : _ Seq.t =
let seq = split_full ?pos ?len re s in
let rec filter ~delim seq () = match seq () with
| Seq.Nil -> if delim then Seq.Cons ("", fun () -> Seq.Nil) else Seq.Nil
| Seq.Cons (`Delim _, tl) ->
if delim then Seq.Cons ("", fun () -> filter ~delim:true tl ())
else filter ~delim:true tl ()
| Seq.Cons (`Text s,tl) -> Seq.Cons (s, filter ~delim:false tl)
in filter ~delim:true seq
end

module Rlist = struct
Expand All @@ -1088,6 +1104,9 @@ module Rlist = struct
let split_full ?pos ?len re s = Rseq.split_full ?pos ?len re s |> list_of_seq

let split ?pos ?len re s = Rseq.split ?pos ?len re s |> list_of_seq

let split_delim ?pos ?len re s =
Rseq.split_delim ?pos ?len re s |> list_of_seq
end

module Gen = struct
Expand Down Expand Up @@ -1122,33 +1141,41 @@ let replace ?(pos=0) ?len ?(all=true) re ~f s =
(* buffer into which we write the result *)
let buf = Buffer.create (String.length s) in
(* iterate on matched substrings. *)
let rec iter pos =
if pos < limit
let rec iter pos on_match =
if pos <= limit
then
match match_str ~groups:true ~partial:false re s ~pos ~len:(limit-pos) with
| Match substr ->
let p1, p2 = Group.offset substr 0 in
(* add string between previous match and current match *)
Buffer.add_substring buf s pos (p1-pos);
(* what should we replace the matched group with? *)
let replacing = f substr in
Buffer.add_string buf replacing;
if all then
(* if we matched a non-char e.g. ^ we must manually advance by 1 *)
iter (
if p1=p2 then (
(* a non char could be past the end of string. e.g. $ *)
if p2 < limit then Buffer.add_char buf s.[p2];
p2+1
) else
p2)
else
Buffer.add_substring buf s p2 (limit-p2)
if pos = p1 && p1 = p2 && on_match then begin
(* if we matched an empty string right after a match,
we must manually advance by 1 *)
if p2 < limit then Buffer.add_char buf s.[p2];
iter (p2 + 1) false
end else begin
(* add string between previous match and current match *)
Buffer.add_substring buf s pos (p1-pos);
(* what should we replace the matched group with? *)
let replacing = f substr in
Buffer.add_string buf replacing;
if all then
(* if we matched an empty string, we must manually advance by 1 *)
iter (
if p1=p2 then (
(* a non char could be past the end of string. e.g. $ *)
if p2 < limit then Buffer.add_char buf s.[p2];
p2+1
) else
p2)
(p1 <> p2)
else
Buffer.add_substring buf s p2 (limit-p2)
end
| Running _ -> ()
| Failed ->
Buffer.add_substring buf s pos (limit-pos)
in
iter pos;
iter pos false;
Buffer.contents buf

let replace_string ?pos ?len ?all re ~by s =
Expand Down
49 changes: 47 additions & 2 deletions lib/core.mli
Original file line number Diff line number Diff line change
Expand Up @@ -282,8 +282,9 @@ val matches_seq : ?pos:int -> ?len:int -> re -> string -> string Seq.t
(** @deprecated Use {!module-Seq.matches} instead. *)

val split : ?pos:int -> ?len:int -> re -> string -> string list
(** [split re s] splits [s] into chunks separated by [re]. It yields the chunks
themselves, not the separator.
(** [split re s] splits [s] into chunks separated by [re]. It yields
the chunks themselves, not the separator. An occurence of the
separator at the beginning or the end of the string is ignoring.
{5 Examples:}
{[
Expand All @@ -296,6 +297,34 @@ val split : ?pos:int -> ?len:int -> re -> string -> string list
# Re.split regex "No commas in this sentence.";;
- : string list = ["No commas in this sentence."]
# Re.split regex ",1,2,";;
- : string list = ["1"; "2"]
# Re.split ~pos:3 regex "1,2,3,4. Commas go brrr.";;
- : string list = ["3"; "4. Commas go brrr."]
]}
*)

val split_delim : ?pos:int -> ?len:int -> re -> string -> string list
(** [split_delim re s] splits [s] into chunks separated by [re]. It
yields the chunks themselves, not the separator. Occurences of the
separator at the beginning or the end of the string will produce
empty chunks.
{5 Examples:}
{[
# let regex = Re.compile (Re.char ',');;
val regex : re = <abstr>
# Re.split regex "Re,Ocaml,Jerome Vouillon";;
- : string list = ["Re"; "Ocaml"; "Jerome Vouillon"]
# Re.split regex "No commas in this sentence.";;
- : string list = ["No commas in this sentence."]
# Re.split regex ",1,2,";;
- : string list = [""; "1"; "2"; ""]
# Re.split ~pos:3 regex "1,2,3,4. Commas go brrr.";;
- : string list = ["3"; "4. Commas go brrr."]
]}
Expand Down Expand Up @@ -390,6 +419,22 @@ module Seq : sig
]}
@since 1.10.0 *)

val split_delim :
?pos:int -> (** Default: 0 *)
?len:int ->
re -> string -> string Seq.t
(** Same as {!module-Re.val-split_delim} but returns an iterator.
{5 Example:}
{[
# let regex = Re.compile (Re.char ',');;
val regex : re = <abstr>
# Re.Seq.split regex "Re,Ocaml,Jerome Vouillon";;
- : string Seq.t = <fun>
]}
@since 1.11.1 *)

val split_full :
?pos:int -> (** Default: 0 *)
?len:int ->
Expand Down
76 changes: 52 additions & 24 deletions lib/pcre.ml
Original file line number Diff line number Diff line change
Expand Up @@ -60,37 +60,65 @@ let pmatch ~rex s =

let substitute ~rex ~subst str =
let b = Buffer.create 1024 in
let rec loop pos =
if pos >= String.length str then
Buffer.contents b
else if Re.execp ~pos rex str then (
let rec loop pos on_match =
if Re.execp ~pos rex str then (
let ss = Re.exec ~pos rex str in
let start, fin = Re.Group.offset ss 0 in
let pat = Re.Group.get ss 0 in
Buffer.add_substring b str pos (start - pos);
Buffer.add_string b (subst pat);
loop fin
) else (
Buffer.add_substring b str pos (String.length str - pos);
loop (String.length str)
)
if on_match && start = pos && start = fin then (
(* Empty match following a match *)
if pos < String.length str then (
Buffer.add_char b str.[pos];
loop (pos + 1) false
)
) else (
let pat = Re.Group.get ss 0 in
Buffer.add_substring b str pos (start - pos);
Buffer.add_string b (subst pat);
if start = fin then (
(* Manually advance by one after an empty match *)
if fin < String.length str then (
Buffer.add_char b str.[fin];
loop (fin + 1) false
)
) else
loop fin true
)
) else
Buffer.add_substring b str pos (String.length str - pos)
in
loop 0
loop 0 false;
Buffer.contents b

let split ~rex str =
let rec loop accu pos =
if pos >= String.length str then
List.rev accu
else if Re.execp ~pos rex str then (
let finish str last accu =
let accu = String.sub str last (String.length str - last) :: accu in
List.rev accu
in
let rec loop accu last pos on_match =
if Re.execp ~pos rex str then (
let ss = Re.exec ~pos rex str in
let start, fin = Re.Group.offset ss 0 in
let s = String.sub str pos (start - pos) in
loop (s :: accu) fin
) else (
let s = String.sub str pos (String.length str - pos) in
loop (s :: accu) (String.length str)
) in
loop [] 0
if on_match && start = pos && start = fin then (
(* Empty match following a match *)
if pos = String.length str then
finish str last accu
else
loop accu last (pos + 1) false
) else (
let accu = String.sub str last (start - last) :: accu in
if start = fin then (
(* Manually advance by one after an empty match *)
if fin = String.length str then
finish str fin accu
else
loop accu fin (fin + 1) false
) else
loop accu fin fin true
)
) else
finish str last accu
in
loop [] 0 0 false

(* From PCRE *)
let string_unsafe_sub s ofs len =
Expand Down
48 changes: 46 additions & 2 deletions lib_test/test_easy.ml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ let test_iter () =
assert_equal ~printer:pp_list
["ab"; "abab"] (Re.matches ~pos:2 ~len:7 re "abab ababab");
assert_equal ~printer:pp_list
[""; ""] (Re.matches re_empty "ab");
[""; ""; ""] (Re.matches re_empty "ab");
assert_equal ~printer:pp_list
[""; "a"; ""] (Re.matches (Re.compile (Re.rep (Re.char 'a'))) "cat");
()

let test_split () =
Expand All @@ -41,6 +43,32 @@ let test_split () =
["a "; "b"] (Re.split re_bow "a b");
assert_equal ~printer:pp_list
["a"; " b"] (Re.split re_eow "a b");
assert_equal ~printer:pp_list
[] (Re.split re_whitespace "");
assert_equal ~printer:pp_list
[] (Re.split re_empty "");
()

let test_split_delim () =
assert_equal ~printer:pp_list
["aa"; "bb"; "c"; "d"; ""] (Re.split_delim re_whitespace "aa bb c d ");
assert_equal ~printer:pp_list
["a"; "b"; ""] (Re.split_delim ~pos:1 ~len:4 re_whitespace "aa b c d");
assert_equal ~printer:pp_list
[""; "a"; "full_word"; "bc"; ""]
(Re.split_delim re_whitespace " a full_word bc ");
assert_equal ~printer:pp_list
[""; "a"; "b"; "c"; "d"; ""] (Re.split_delim re_empty "abcd");
assert_equal ~printer:pp_list
["a"; "\nb"; ""] (Re.split_delim re_eol "a\nb");
assert_equal ~printer:pp_list
[""; "a "; "b"] (Re.split_delim re_bow "a b");
assert_equal ~printer:pp_list
["a"; " b"; ""] (Re.split_delim re_eow "a b");
assert_equal ~printer:pp_list
[""] (Re.split_delim re_whitespace "");
assert_equal ~printer:pp_list
[""; ""] (Re.split_delim re_empty "");
()

let map_split_delim =
Expand Down Expand Up @@ -69,8 +97,19 @@ let test_split_full () =
[`D " "; `T "a"; `D " "; `T "full_word"; `D " "; `T "bc"; `D " "]
(Re.split_full re_whitespace " a full_word bc " |> map_split_delim);
assert_equal ~printer:pp_list'
[`D ""; `T "a"; `D ""; `T "b"] (* XXX: not trivial *)
[] (Re.split_full re_whitespace "" |> map_split_delim);
assert_equal ~printer:pp_list'
[`D ""] (Re.split_full re_empty "" |> map_split_delim);
assert_equal ~printer:pp_list'
[`D ""; `T "a"; `D ""; `T "b"; `D ""] (* XXX: not trivial *)
(Re.split_full re_empty "ab" |> map_split_delim);
assert_equal ~printer:pp_list'
[] (Re.split_full re_whitespace "" |> map_split_delim);
assert_equal ~printer:pp_list'
[`D ""] (Re.split_full re_empty "" |> map_split_delim);
assert_equal ~printer:pp_list'
[`D ""; `T "c"; `D "a"; `T "t"; `D ""]
(Re.split_full (Re.compile (Re.rep (Re.char 'a'))) "cat" |> map_split_delim);
()

let test_replace () =
Expand All @@ -80,6 +119,10 @@ let test_replace () =
(Re.replace re ~f " hello world; I love chips!");
assert_equal ~printer:pp_str " Allo maman, bobo"
(Re.replace ~all:false re ~f " allo maman, bobo");
assert_equal ~printer:pp_str "a"
(Re.replace re_empty ~f:(fun _ -> "a") "");
assert_equal ~printer:pp_str "*c*t*"
(Re.replace (Re.compile (Re.rep (Re.char 'a'))) ~f:(fun _ -> "*") "cat");
()

let test_replace_string () =
Expand All @@ -101,6 +144,7 @@ let test_bug_55 () =
let suite = "easy" >:::
[ "iter" >:: test_iter
; "split" >:: test_split
; "split_delim" >:: test_split_delim
; "split_full" >:: test_split_full
; "replace" >:: test_replace
; "replace_string" >:: test_replace_string
Expand Down
Loading

0 comments on commit 5a9c9bf

Please sign in to comment.