diff --git a/CHANGES.md b/CHANGES.md index 83611065..a215ec20 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,8 @@ +Unreleased +---------- +* Add `Re.split_delim` (#233) +* Fix handling of empty matches in splitting and substitution functions (#233) + 1.11.0 (19-Aug-2023) -------------------- diff --git a/lib/core.ml b/lib/core.ml index 6fcf05d4..ecc3989c 100644 --- a/lib/core.ml +++ b/lib/core.ml @@ -1002,20 +1002,24 @@ module Rseq = struct in (* iterate on matches. When a match is found, search for the next one just after its end *) - let rec aux pos () = - if pos >= limit + let rec aux pos on_match () = + if pos > limit then Seq.Nil (* no more matches *) else match match_str ~groups:true ~partial:false re s ~pos ~len:(limit - pos) with | Match substr -> let p1, p2 = Group.offset substr 0 in - let pos = if p1=p2 then p2+1 else p2 in - Seq.Cons (substr, aux pos) + if on_match && p1 = pos && p1 = p2 then + (* skip empty match right after a match *) + aux (pos + 1) false () + else + let pos = if p1=p2 then p2+1 else p2 in + Seq.Cons (substr, aux pos (p1 <> p2)) | Running _ | Failed -> Seq.Nil in - aux pos + aux pos false let matches ?pos ?len re s : _ Seq.t = all ?pos ?len re s @@ -1034,11 +1038,10 @@ module Rseq = struct limit: first index we ignore (!pos < limit is an invariant) *) let pos0 = pos in let rec aux state i pos () = match state with - | `Idle when pos >= limit -> - if i < limit then ( - let sub = String.sub s i (limit - i) in - Seq.Cons (`Text sub, aux state (i+1) pos) - ) else Seq.Nil + | `Idle when pos > limit -> + (* We had an empty match at the end of the string *) + assert (i = limit); + Seq.Nil | `Idle -> begin match match_str ~groups:true ~partial:false re s ~pos ~len:(limit - pos) with @@ -1047,7 +1050,10 @@ module Rseq = struct let pos = if p1=p2 then p2+1 else p2 in let old_i = i in let i = p2 in - if p1 > pos0 then ( + if old_i = p1 && p1 = p2 && p1 > pos0 then + (* Skip empty match right after a delimiter *) + aux state i pos () + else if p1 > pos0 then ( (* string does not start by a delimiter *) let text = String.sub s old_i (p1 - old_i) in let state = `Yield (`Delim substr) in @@ -1075,6 +1081,16 @@ module Rseq = struct | Seq.Cons (`Delim _, tl) -> filter tl () | Seq.Cons (`Text s,tl) -> Seq.Cons (s, filter tl) in filter seq + + let split_delim ?pos ?len re s : _ Seq.t = + let seq = split_full ?pos ?len re s in + let rec filter ~delim seq () = match seq () with + | Seq.Nil -> if delim then Seq.Cons ("", fun () -> Seq.Nil) else Seq.Nil + | Seq.Cons (`Delim _, tl) -> + if delim then Seq.Cons ("", fun () -> filter ~delim:true tl ()) + else filter ~delim:true tl () + | Seq.Cons (`Text s,tl) -> Seq.Cons (s, filter ~delim:false tl) + in filter ~delim:true seq end module Rlist = struct @@ -1088,6 +1104,9 @@ module Rlist = struct let split_full ?pos ?len re s = Rseq.split_full ?pos ?len re s |> list_of_seq let split ?pos ?len re s = Rseq.split ?pos ?len re s |> list_of_seq + + let split_delim ?pos ?len re s = + Rseq.split_delim ?pos ?len re s |> list_of_seq end module Gen = struct @@ -1122,33 +1141,41 @@ let replace ?(pos=0) ?len ?(all=true) re ~f s = (* buffer into which we write the result *) let buf = Buffer.create (String.length s) in (* iterate on matched substrings. *) - let rec iter pos = - if pos < limit + let rec iter pos on_match = + if pos <= limit then match match_str ~groups:true ~partial:false re s ~pos ~len:(limit-pos) with | Match substr -> let p1, p2 = Group.offset substr 0 in - (* add string between previous match and current match *) - Buffer.add_substring buf s pos (p1-pos); - (* what should we replace the matched group with? *) - let replacing = f substr in - Buffer.add_string buf replacing; - if all then - (* if we matched a non-char e.g. ^ we must manually advance by 1 *) - iter ( - if p1=p2 then ( - (* a non char could be past the end of string. e.g. $ *) - if p2 < limit then Buffer.add_char buf s.[p2]; - p2+1 - ) else - p2) - else - Buffer.add_substring buf s p2 (limit-p2) + if pos = p1 && p1 = p2 && on_match then begin + (* if we matched an empty string right after a match, + we must manually advance by 1 *) + if p2 < limit then Buffer.add_char buf s.[p2]; + iter (p2 + 1) false + end else begin + (* add string between previous match and current match *) + Buffer.add_substring buf s pos (p1-pos); + (* what should we replace the matched group with? *) + let replacing = f substr in + Buffer.add_string buf replacing; + if all then + (* if we matched an empty string, we must manually advance by 1 *) + iter ( + if p1=p2 then ( + (* a non char could be past the end of string. e.g. $ *) + if p2 < limit then Buffer.add_char buf s.[p2]; + p2+1 + ) else + p2) + (p1 <> p2) + else + Buffer.add_substring buf s p2 (limit-p2) + end | Running _ -> () | Failed -> Buffer.add_substring buf s pos (limit-pos) in - iter pos; + iter pos false; Buffer.contents buf let replace_string ?pos ?len ?all re ~by s = diff --git a/lib/core.mli b/lib/core.mli index 1b703561..bc08c618 100644 --- a/lib/core.mli +++ b/lib/core.mli @@ -282,8 +282,9 @@ val matches_seq : ?pos:int -> ?len:int -> re -> string -> string Seq.t (** @deprecated Use {!module-Seq.matches} instead. *) val split : ?pos:int -> ?len:int -> re -> string -> string list -(** [split re s] splits [s] into chunks separated by [re]. It yields the chunks - themselves, not the separator. +(** [split re s] splits [s] into chunks separated by [re]. It yields + the chunks themselves, not the separator. An occurence of the + separator at the beginning or the end of the string is ignoring. {5 Examples:} {[ @@ -296,6 +297,34 @@ val split : ?pos:int -> ?len:int -> re -> string -> string list # Re.split regex "No commas in this sentence.";; - : string list = ["No commas in this sentence."] + # Re.split regex ",1,2,";; + - : string list = ["1"; "2"] + + # Re.split ~pos:3 regex "1,2,3,4. Commas go brrr.";; + - : string list = ["3"; "4. Commas go brrr."] + ]} +*) + +val split_delim : ?pos:int -> ?len:int -> re -> string -> string list +(** [split_delim re s] splits [s] into chunks separated by [re]. It + yields the chunks themselves, not the separator. Occurences of the + separator at the beginning or the end of the string will produce + empty chunks. + + {5 Examples:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.split regex "Re,Ocaml,Jerome Vouillon";; + - : string list = ["Re"; "Ocaml"; "Jerome Vouillon"] + + # Re.split regex "No commas in this sentence.";; + - : string list = ["No commas in this sentence."] + + # Re.split regex ",1,2,";; + - : string list = [""; "1"; "2"; ""] + # Re.split ~pos:3 regex "1,2,3,4. Commas go brrr.";; - : string list = ["3"; "4. Commas go brrr."] ]} @@ -390,6 +419,22 @@ module Seq : sig ]} @since 1.10.0 *) + val split_delim : + ?pos:int -> (** Default: 0 *) + ?len:int -> + re -> string -> string Seq.t + (** Same as {!module-Re.val-split_delim} but returns an iterator. + + {5 Example:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.Seq.split regex "Re,Ocaml,Jerome Vouillon";; + - : string Seq.t = + ]} + @since 1.11.1 *) + val split_full : ?pos:int -> (** Default: 0 *) ?len:int -> diff --git a/lib/pcre.ml b/lib/pcre.ml index f93805c2..ebc97055 100644 --- a/lib/pcre.ml +++ b/lib/pcre.ml @@ -60,37 +60,65 @@ let pmatch ~rex s = let substitute ~rex ~subst str = let b = Buffer.create 1024 in - let rec loop pos = - if pos >= String.length str then - Buffer.contents b - else if Re.execp ~pos rex str then ( + let rec loop pos on_match = + if Re.execp ~pos rex str then ( let ss = Re.exec ~pos rex str in let start, fin = Re.Group.offset ss 0 in - let pat = Re.Group.get ss 0 in - Buffer.add_substring b str pos (start - pos); - Buffer.add_string b (subst pat); - loop fin - ) else ( - Buffer.add_substring b str pos (String.length str - pos); - loop (String.length str) - ) + if on_match && start = pos && start = fin then ( + (* Empty match following a match *) + if pos < String.length str then ( + Buffer.add_char b str.[pos]; + loop (pos + 1) false + ) + ) else ( + let pat = Re.Group.get ss 0 in + Buffer.add_substring b str pos (start - pos); + Buffer.add_string b (subst pat); + if start = fin then ( + (* Manually advance by one after an empty match *) + if fin < String.length str then ( + Buffer.add_char b str.[fin]; + loop (fin + 1) false + ) + ) else + loop fin true + ) + ) else + Buffer.add_substring b str pos (String.length str - pos) in - loop 0 + loop 0 false; + Buffer.contents b let split ~rex str = - let rec loop accu pos = - if pos >= String.length str then - List.rev accu - else if Re.execp ~pos rex str then ( + let finish str last accu = + let accu = String.sub str last (String.length str - last) :: accu in + List.rev accu + in + let rec loop accu last pos on_match = + if Re.execp ~pos rex str then ( let ss = Re.exec ~pos rex str in let start, fin = Re.Group.offset ss 0 in - let s = String.sub str pos (start - pos) in - loop (s :: accu) fin - ) else ( - let s = String.sub str pos (String.length str - pos) in - loop (s :: accu) (String.length str) - ) in - loop [] 0 + if on_match && start = pos && start = fin then ( + (* Empty match following a match *) + if pos = String.length str then + finish str last accu + else + loop accu last (pos + 1) false + ) else ( + let accu = String.sub str last (start - last) :: accu in + if start = fin then ( + (* Manually advance by one after an empty match *) + if fin = String.length str then + finish str fin accu + else + loop accu fin (fin + 1) false + ) else + loop accu fin fin true + ) + ) else + finish str last accu + in + loop [] 0 0 false (* From PCRE *) let string_unsafe_sub s ofs len = diff --git a/lib_test/test_easy.ml b/lib_test/test_easy.ml index f0136f9a..74419d5a 100644 --- a/lib_test/test_easy.ml +++ b/lib_test/test_easy.ml @@ -23,7 +23,9 @@ let test_iter () = assert_equal ~printer:pp_list ["ab"; "abab"] (Re.matches ~pos:2 ~len:7 re "abab ababab"); assert_equal ~printer:pp_list - [""; ""] (Re.matches re_empty "ab"); + [""; ""; ""] (Re.matches re_empty "ab"); + assert_equal ~printer:pp_list + [""; "a"; ""] (Re.matches (Re.compile (Re.rep (Re.char 'a'))) "cat"); () let test_split () = @@ -41,6 +43,32 @@ let test_split () = ["a "; "b"] (Re.split re_bow "a b"); assert_equal ~printer:pp_list ["a"; " b"] (Re.split re_eow "a b"); + assert_equal ~printer:pp_list + [] (Re.split re_whitespace ""); + assert_equal ~printer:pp_list + [] (Re.split re_empty ""); + () + +let test_split_delim () = + assert_equal ~printer:pp_list + ["aa"; "bb"; "c"; "d"; ""] (Re.split_delim re_whitespace "aa bb c d "); + assert_equal ~printer:pp_list + ["a"; "b"; ""] (Re.split_delim ~pos:1 ~len:4 re_whitespace "aa b c d"); + assert_equal ~printer:pp_list + [""; "a"; "full_word"; "bc"; ""] + (Re.split_delim re_whitespace " a full_word bc "); + assert_equal ~printer:pp_list + [""; "a"; "b"; "c"; "d"; ""] (Re.split_delim re_empty "abcd"); + assert_equal ~printer:pp_list + ["a"; "\nb"; ""] (Re.split_delim re_eol "a\nb"); + assert_equal ~printer:pp_list + [""; "a "; "b"] (Re.split_delim re_bow "a b"); + assert_equal ~printer:pp_list + ["a"; " b"; ""] (Re.split_delim re_eow "a b"); + assert_equal ~printer:pp_list + [""] (Re.split_delim re_whitespace ""); + assert_equal ~printer:pp_list + [""; ""] (Re.split_delim re_empty ""); () let map_split_delim = @@ -69,8 +97,19 @@ let test_split_full () = [`D " "; `T "a"; `D " "; `T "full_word"; `D " "; `T "bc"; `D " "] (Re.split_full re_whitespace " a full_word bc " |> map_split_delim); assert_equal ~printer:pp_list' - [`D ""; `T "a"; `D ""; `T "b"] (* XXX: not trivial *) + [] (Re.split_full re_whitespace "" |> map_split_delim); + assert_equal ~printer:pp_list' + [`D ""] (Re.split_full re_empty "" |> map_split_delim); + assert_equal ~printer:pp_list' + [`D ""; `T "a"; `D ""; `T "b"; `D ""] (* XXX: not trivial *) (Re.split_full re_empty "ab" |> map_split_delim); + assert_equal ~printer:pp_list' + [] (Re.split_full re_whitespace "" |> map_split_delim); + assert_equal ~printer:pp_list' + [`D ""] (Re.split_full re_empty "" |> map_split_delim); + assert_equal ~printer:pp_list' + [`D ""; `T "c"; `D "a"; `T "t"; `D ""] + (Re.split_full (Re.compile (Re.rep (Re.char 'a'))) "cat" |> map_split_delim); () let test_replace () = @@ -80,6 +119,10 @@ let test_replace () = (Re.replace re ~f " hello world; I love chips!"); assert_equal ~printer:pp_str " Allo maman, bobo" (Re.replace ~all:false re ~f " allo maman, bobo"); + assert_equal ~printer:pp_str "a" + (Re.replace re_empty ~f:(fun _ -> "a") ""); + assert_equal ~printer:pp_str "*c*t*" + (Re.replace (Re.compile (Re.rep (Re.char 'a'))) ~f:(fun _ -> "*") "cat"); () let test_replace_string () = @@ -101,6 +144,7 @@ let test_bug_55 () = let suite = "easy" >::: [ "iter" >:: test_iter ; "split" >:: test_split + ; "split_delim" >:: test_split_delim ; "split_full" >:: test_split_full ; "replace" >:: test_replace ; "replace_string" >:: test_replace_string diff --git a/lib_test/test_pcre.ml b/lib_test/test_pcre.ml index d18966aa..cf89393f 100644 --- a/lib_test/test_pcre.ml +++ b/lib_test/test_pcre.ml @@ -47,6 +47,50 @@ let named_groups _ = let s = exec ~rex "testxxxyyy" in assert_equal (get_named_substring rex "many_x" s) "xxx" +let quote = Printf.sprintf "'%s'" +let pp_str x = x +let pp_list l = + l + |> List.map quote + |> String.concat ", " + |> Printf.sprintf "[ %s ]" + +let re_whitespace = regexp "[\t ]+" +let re_empty = regexp "" +let re_eol = Re.compile Re.eol +let re_bow = Re.compile Re.bow +let re_eow = Re.compile Re.eow + +let test_split () = + assert_equal ~printer:pp_list + ["aa"; "bb"; "c"; "d"; ""] (split ~rex:re_whitespace "aa bb c d "); + assert_equal ~printer:pp_list + [""; "a"; "full_word"; "bc"; ""] + (split ~rex:re_whitespace " a full_word bc "); + assert_equal ~printer:pp_list + [""; "a"; "b"; "c"; "d"; ""] (split ~rex:re_empty "abcd"); + assert_equal ~printer:pp_list + ["a"; "\nb"; ""] (split ~rex:re_eol "a\nb"); + assert_equal ~printer:pp_list + [""; "a "; "b"] (split ~rex:re_bow "a b"); + assert_equal ~printer:pp_list + ["a"; " b"; ""] (split ~rex:re_eow "a b"); + let rex = regexp "" in + assert_equal ~printer:pp_list (split ~rex "xx") [""; "x"; "x"; ""] + +let test_substitute () = + let rex = regexp "[a-zA-Z]+" in + let subst = String.capitalize_ascii in + assert_equal ~printer:pp_str " Hello World; I Love Chips!" + (substitute ~rex ~subst " hello world; I love chips!"); + assert_equal ~printer:pp_str "a" + (substitute ~rex:re_empty ~subst:(fun _ -> "a") ""); + assert_equal ~printer:pp_str "*c*t*" + (substitute ~rex:(regexp "a*") ~subst:(fun _ -> "*") "cat"); + let rex = regexp "^ *" in + assert_equal ~printer:pp_str + (substitute ~rex ~subst:(fun _ -> "A ") "test") "A test" + let test_fixtures = "test pcre features" >::: [ "test [:blank:] class" >:: test_blank_class @@ -55,7 +99,8 @@ let test_fixtures = ; "test group split 1" >:: group_split1 ; "test group split 2 - NoGroup" >:: group_split2 ; "test named groups" >:: named_groups + ; "test split" >:: test_split + ; "test substitute" >:: test_substitute ] let _ = run_test_tt_main test_fixtures -