Skip to content

Commit

Permalink
Named groups (#223)
Browse files Browse the repository at this point in the history
* Add support for named groups in Core

* Add support for named groups in Perl

* Add support for named groups in Pcre
  • Loading branch information
glondu authored Aug 15, 2023
1 parent b1bf00f commit b261cf1
Show file tree
Hide file tree
Showing 7 changed files with 100 additions and 32 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Unreleased
* Add `Re.exec_partial_detailed` to allow resuming searches from partial inputs
(#219)
* Add support for `DOTALL` flag in `Re.Pcre.regexp` (#225)
* Add support for named groups (#223)

1.10.4 (27-Apr-2022)
--------------------
Expand Down
70 changes: 41 additions & 29 deletions lib/core.ml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ type re =
when computing a new state *)
states : state Automata.State.Table.t;
(* States of the deterministic automata *)
group_names : (string * int) list;
(* Named groups in the regular expression *)
group_count : int
(* Number of groups in the regular expression *) }

Expand All @@ -81,6 +83,8 @@ let print_re = pp_re

let group_count re = re.group_count

let group_names re = re.group_names

(* Information used during matching *)
type info =
{ re : re;
Expand Down Expand Up @@ -324,7 +328,7 @@ let match_str ~groups ~partial re s ~pos ~len =
let no_match_starts_before = if groups then info.positions.(0) else 0 in
Running { no_match_starts_before }

let mk_re ~initial ~colors ~color_repr ~ncolor ~lnl ~group_count =
let mk_re ~initial ~colors ~color_repr ~ncolor ~lnl ~group_names ~group_count =
{ initial ;
initial_states = [];
colors;
Expand All @@ -333,6 +337,7 @@ let mk_re ~initial ~colors ~color_repr ~ncolor ~lnl ~group_count =
lnl;
tbl = Automata.create_working_area ();
states = Automata.State.Table.create 97;
group_names;
group_count }

(**** Character sets ****)
Expand Down Expand Up @@ -370,7 +375,7 @@ type regexp =
| Last_end_of_line | Start | Stop
| Sem of Automata.sem * regexp
| Sem_greedy of Automata.rep_kind * regexp
| Group of regexp | No_group of regexp | Nest of regexp
| Group of string option * regexp | No_group of regexp | Nest of regexp
| Case of regexp | No_case of regexp
| Intersection of regexp list
| Complement of regexp list
Expand All @@ -389,7 +394,7 @@ module View = struct
| Last_end_of_line | Start | Stop
| Sem of Automata.sem * regexp
| Sem_greedy of Automata.rep_kind * regexp
| Group of regexp | No_group of regexp | Nest of regexp
| Group of string option * regexp | No_group of regexp | Nest of regexp
| Case of regexp | No_case of regexp
| Intersection of regexp list
| Complement of regexp list
Expand Down Expand Up @@ -424,7 +429,8 @@ let rec pp fmt t =
sexp fmt "Sem" (pair Automata.pp_sem pp) (sem, re)
| Sem_greedy (k, re) ->
sexp fmt "Sem_greedy" (pair Automata.pp_rep_kind pp) (k, re)
| Group c -> var "Group" c
| Group (None, c) -> var "Group" c
| Group (Some n, c) -> sexp fmt "Named_group" (pair str pp) (n, c)
| No_group c -> var "No_group" c
| Nest c -> var "Nest" c
| Case c -> var "Case" c
Expand Down Expand Up @@ -479,7 +485,7 @@ let colorize c regexp =
| Last_end_of_line -> lnl := true
| Sem (_, r)
| Sem_greedy (_, r)
| Group r | No_group r
| Group (_, r) | No_group r
| Nest r | Pmark (_,r) -> colorize r
| Case _ | No_case _
| Intersection _
Expand Down Expand Up @@ -574,31 +580,31 @@ let enforce_kind ids kind kind' cr =
| _ -> cr

(* XXX should probably compute a category mask *)
let rec translate ids kind ign_group ign_case greedy pos cache c = function
let rec translate ids kind ign_group ign_case greedy pos names cache c = function
| Set s ->
(A.cst ids (trans_set cache c s), kind)
| Sequence l ->
(trans_seq ids kind ign_group ign_case greedy pos cache c l, kind)
(trans_seq ids kind ign_group ign_case greedy pos names cache c l, kind)
| Alternative l ->
begin match merge_sequences l with
[r'] ->
let (cr, kind') =
translate ids kind ign_group ign_case greedy pos cache c r' in
translate ids kind ign_group ign_case greedy pos names cache c r' in
(enforce_kind ids kind kind' cr, kind)
| merged_sequences ->
(A.alt ids
(List.map
(fun r' ->
let (cr, kind') =
translate ids kind ign_group ign_case greedy
pos cache c r' in
pos names cache c r' in
enforce_kind ids kind kind' cr)
merged_sequences),
kind)
end
| Repeat (r', i, j) ->
let (cr, kind') =
translate ids kind ign_group ign_case greedy pos cache c r' in
translate ids kind ign_group ign_case greedy pos names cache c r' in
let rem =
match j with
None ->
Expand Down Expand Up @@ -652,28 +658,33 @@ let rec translate ids kind ign_group ign_case greedy pos cache c = function
(A.before ids Category.search_boundary, kind)
| Sem (kind', r') ->
let (cr, kind'') =
translate ids kind' ign_group ign_case greedy pos cache c r' in
translate ids kind' ign_group ign_case greedy pos names cache c r' in
(enforce_kind ids kind' kind'' cr,
kind')
| Sem_greedy (greedy', r') ->
translate ids kind ign_group ign_case greedy' pos cache c r'
| Group r' ->
translate ids kind ign_group ign_case greedy' pos names cache c r'
| Group (n, r') ->
if ign_group then
translate ids kind ign_group ign_case greedy pos cache c r'
translate ids kind ign_group ign_case greedy pos names cache c r'
else
let p = !pos in
let () =
match n with
| Some name -> names := (name, p / 2) :: !names
| None -> ()
in
pos := !pos + 2;
let (cr, kind') =
translate ids kind ign_group ign_case greedy pos cache c r' in
translate ids kind ign_group ign_case greedy pos names cache c r' in
(A.seq ids `First (A.mark ids p) (
A.seq ids `First cr (A.mark ids (p + 1))),
kind')
| No_group r' ->
translate ids kind true ign_case greedy pos cache c r'
translate ids kind true ign_case greedy pos names cache c r'
| Nest r' ->
let b = !pos in
let (cr, kind') =
translate ids kind ign_group ign_case greedy pos cache c r'
translate ids kind ign_group ign_case greedy pos names cache c r'
in
let e = !pos - 1 in
if e < b then
Expand All @@ -684,21 +695,21 @@ let rec translate ids kind ign_group ign_case greedy pos cache c = function
assert false
| Pmark (i, r') ->
let (cr, kind') =
translate ids kind ign_group ign_case greedy pos cache c r' in
translate ids kind ign_group ign_case greedy pos names cache c r' in
(A.seq ids `First (A.pmark ids i) cr, kind')

and trans_seq ids kind ign_group ign_case greedy pos cache c = function
and trans_seq ids kind ign_group ign_case greedy pos names cache c = function
| [] ->
A.eps ids
| [r] ->
let (cr', kind') =
translate ids kind ign_group ign_case greedy pos cache c r in
translate ids kind ign_group ign_case greedy pos names cache c r in
enforce_kind ids kind kind' cr'
| r :: rem ->
let (cr', kind') =
translate ids kind ign_group ign_case greedy pos cache c r in
translate ids kind ign_group ign_case greedy pos names cache c r in
let cr'' =
trans_seq ids kind ign_group ign_case greedy pos cache c rem in
trans_seq ids kind ign_group ign_case greedy pos names cache c rem in
if A.is_eps cr'' then
cr'
else if A.is_eps cr' then
Expand Down Expand Up @@ -741,8 +752,8 @@ let rec handle_case ign_case = function
| Sem_greedy (k, r) ->
let r' = handle_case ign_case r in
if is_charset r' then r' else Sem_greedy (k, r')
| Group r ->
Group (handle_case ign_case r)
| Group (n, r) ->
Group (n, handle_case ign_case r)
| No_group r ->
let r' = handle_case ign_case r in
if is_charset r' then r' else No_group r'
Expand Down Expand Up @@ -777,12 +788,13 @@ let compile_1 regexp =
let ncolor = if need_lnl then ncolor + 1 else ncolor in
let ids = A.create_ids () in
let pos = ref 0 in
let names = ref [] in
let (r, kind) =
translate ids
`First false false `Greedy pos (ref Cset.CSetMap.empty) colors regexp in
`First false false `Greedy pos names (ref Cset.CSetMap.empty) colors regexp in
let r = enforce_kind ids `First kind r in
(*Format.eprintf "<%d %d>@." !ids ncol;*)
mk_re ~initial:r ~colors ~color_repr ~ncolor ~lnl ~group_count:(!pos / 2)
mk_re ~initial:r ~colors ~color_repr ~ncolor ~lnl ~group_names:(List.rev !names) ~group_count:(!pos / 2)

(****)

Expand All @@ -799,7 +811,7 @@ let rec anchored = function
false
| Beg_of_str | Start ->
true
| Sem (_, r) | Sem_greedy (_, r) | Group r | No_group r | Nest r
| Sem (_, r) | Sem_greedy (_, r) | Group (_, r) | No_group r | Nest r
| Case r | No_case r | Pmark (_, r) ->
anchored r

Expand Down Expand Up @@ -851,7 +863,7 @@ let shortest r = Sem (`Shortest, r)
let first r = Sem (`First, r)
let greedy r = Sem_greedy (`Greedy, r)
let non_greedy r = Sem_greedy (`Non_greedy, r)
let group r = Group r
let group ?name r = Group (name, r)
let no_group r = No_group r
let nest r = Nest r
let mark r = let i = Pmark.gen () in (i,Pmark (i,r))
Expand Down Expand Up @@ -1159,7 +1171,7 @@ let witness t =
| Intersection _
| Complement _
| Difference (_, _) -> assert false
| Group r
| Group (_, r)
| No_group r
| Nest r
| Sem (_, r)
Expand Down
7 changes: 5 additions & 2 deletions lib/core.mli
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ val group_count : re -> int
(** Return the number of capture groups (including the one
corresponding to the entire regexp). *)

val group_names : re -> (string * int) list
(** Return named capture groups with their index. *)

val exec :
?pos:int -> (** Default: 0 *)
?len:int -> (** Default: -1 (until end of string) *)
Expand Down Expand Up @@ -581,7 +584,7 @@ val non_greedy : t -> t

(** {2 Groups (or submatches)} *)

val group : t -> t
val group : ?name:string -> t -> t
(** Delimit a group. The group is considered as matching if it is used at least
once (it may be used multiple times if is nested inside {!rep} for
instance). If it is used multiple times, the last match is what gets
Expand Down Expand Up @@ -688,7 +691,7 @@ module View : sig
| Last_end_of_line | Start | Stop
| Sem of Automata.sem * outer
| Sem_greedy of Automata.rep_kind * outer
| Group of outer | No_group of outer | Nest of outer
| Group of string option * outer | No_group of outer | Nest of outer
| Case of outer | No_case of outer
| Intersection of outer list
| Complement of outer list
Expand Down
17 changes: 17 additions & 0 deletions lib/pcre.ml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,23 @@ let exec ~rex ?pos s =
let get_substring s i =
Re.Group.get s i

let names rex =
Re.group_names rex
|> List.map fst
|> Array.of_list

let get_named_substring rex name s =
let rec loop = function
| [] -> raise Not_found
| (n, i) :: rem when n = name ->
begin
try get_substring s i
with Not_found -> loop rem
end
| _ :: rem -> loop rem
in
loop (Re.group_names rex)

let get_substring_ofs s i =
Re.Group.offset s i

Expand Down
6 changes: 6 additions & 0 deletions lib/pcre.mli
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ val exec : rex:regexp -> ?pos:int -> string -> groups
val get_substring : groups -> int -> string
(** Equivalent to {!Core.Group.get}. *)

val names : regexp -> string array
(** Return the names of named groups. *)

val get_named_substring : regexp -> string -> groups -> string
(** Return the first matched named group, or raise [Not_found]. *)

val get_substring_ofs : groups -> int -> int * int
(** Equivalent to {!Core.Group.offset}. *)

Expand Down
21 changes: 21 additions & 0 deletions lib/perl.ml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ let parse multiline dollar_endonly dotall ungreedy s =
r
end else if accept '#' then begin
comment ()
end else if accept '<' then begin
let name = name () in
let r = regexp () in
if not (accept ')') then raise Parse_error;
Re.group ~name r
end else
raise Parse_error
end else begin
Expand Down Expand Up @@ -182,6 +187,22 @@ let parse multiline dollar_endonly dotall ungreedy s =
integer' i'
| _ ->
unget (); Some i
and name () =
if eos () then raise Parse_error else
match get () with
('_' | 'a'..'z' | 'A'..'Z') as c ->
let b = Buffer.create 32 in
Buffer.add_char b c;
name' b
| _ -> raise Parse_error
and name' b =
if eos () then raise Parse_error else
match get () with
('_' | 'a'..'z' | 'A'..'Z' | '0'..'9') as c ->
Buffer.add_char b c;
name' b
| '>' -> Buffer.contents b
| _ -> raise Parse_error
and bracket s =
if s <> [] && accept ']' then s else begin
match char () with
Expand Down
10 changes: 9 additions & 1 deletion lib_test/test_pcre.ml
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,21 @@ let group_split2 _ =
let sp = full_split ~rex "testxyyy" in
assert_equal ~printer sp [Text "test"; Delim "x"; NoGroup; Text "yyy"]

let rex = regexp "(?<many_x>x+)"

let named_groups _ =
let s = exec ~rex "testxxxyyy" in
assert_equal (get_named_substring rex "many_x" s) "xxx"

let test_fixtures =
"test pcre features" >:::
[ "test [:blank:] class" >:: test_blank_class
; "test splitting empty string" >:: split_empty
; "test split with max of 1" >:: split_max_1
; "test group split 1" >:: group_split1
; "test group split 2 - NoGroup" >:: group_split2]
; "test group split 2 - NoGroup" >:: group_split2
; "test named groups" >:: named_groups
]

let _ = run_test_tt_main test_fixtures

0 comments on commit b261cf1

Please sign in to comment.