forked from Perl/perl5
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathregcomp.c
16413 lines (14406 loc) · 655 KB
/
regcomp.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* regcomp.c
*/
/*
* 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
*
* [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
*/
/* This file contains functions for compiling a regular expression. See
* also regexec.c which funnily enough, contains functions for executing
* a regular expression.
*
* This file is also copied at build time to ext/re/re_comp.c, where
* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
* This causes the main functions to be compiled under new names and with
* debugging support added, which makes "use re 'debug'" work.
*/
/* NOTE: this is derived from Henry Spencer's regexp code, and should not be
* confused with the original package (see point 3 below). Thanks, Henry!
*/
/* Additional note: this code is very heavily munged from Henry's version
* in places. In some spots I've traded clarity for efficiency, so don't
* blame Henry for some of the lack of readability.
*/
/* The names of the functions have been changed from regcomp and
* regexec to pregcomp and pregexec in order to avoid conflicts
* with the POSIX routines of the same names.
*/
/*
* pregcomp and pregexec -- regsub and regerror are not used in perl
*
* Copyright (c) 1986 by University of Toronto.
* Written by Henry Spencer. Not derived from licensed software.
*
* Permission is granted to anyone to use this software for any
* purpose on any computer system, and to redistribute it freely,
* subject to the following restrictions:
*
* 1. The author is not responsible for the consequences of use of
* this software, no matter how awful, even if they arise
* from defects in it.
*
* 2. The origin of this software must not be misrepresented, either
* by explicit claim or by omission.
*
* 3. Altered versions must be plainly marked as such, and must not
* be misrepresented as being the original software.
*
*
**** Alterations to Henry's code are...
****
**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
**** by Larry Wall and others
****
**** You may distribute under the terms of either the GNU General Public
**** License or the Artistic License, as specified in the README file.
*
* Beware that some of this code is subtly aware of the way operator
* precedence is structured in regular expressions. Serious changes in
* regular-expression syntax might require a total rethink.
*/
/* Note on debug output:
*
* This is set up so that -Dr turns on debugging like all other flags that are
* enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
* all regular expressions encountered in a program, and gives a huge amount of
* output for all but the shortest programs.
*
* The ability to output pattern debugging information lexically, and with much
* finer grained control was added, with 'use re qw(Debug ....);' available even
* in non-DEBUGGING builds. This is accomplished by copying the contents of
* regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
* Those files are compiled and linked into the perl executable, and they are
* compiled essentially as if DEBUGGING were enabled, and controlled by calls
* to re.pm.
*
* That would normally mean linking errors when two functions of the same name
* are attempted to be placed into the same executable. That is solved in one
* of four ways:
* 1) Static functions aren't known outside the file they are in, so for the
* many functions of that type in this file, it just isn't a problem.
* 2) Most externally known functions are enclosed in
* #ifndef PERL_IN_XSUB_RE
* ...
* #endif
* blocks, so there is only one definition for them in the whole
* executable, the one in regcomp.c (or regexec.c). The implication of
* that is any debugging info that comes from them is controlled only by
* -Dr. Further, any static function they call will also be the version
* in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
* 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
* have different names, so that what gets loaded in the executable is
* 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
* from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
* in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
* versions and their callees are under control of re.pm. The catch is
* that references to all these go through the regexp_engine structure,
* which is initialized in regcomp.h to the Perl_foo versions, and
* substituted out in lexical scopes where 'use re' is in effect to the
* 'my_foo' ones. That structure is public API, so it would be a hard
* sell to add any additional members.
* 4) For functions in regcomp.c and re_comp.c that are called only from,
* respectively, regexec.c and re_exec.c, they can have two different
* names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
* embed.fnc.
*
* The bottom line is that if you add code to one of the public functions
* listed in ext/re/re_top.h, debugging automagically works. But if you write
* a new function that needs to do debugging or there is a chain of calls from
* it that need to do debugging, all functions in the chain should use options
* 2) or 4) above.
*
* A function may have to be split so that debugging stuff is static, but it
* calls out to some other function that only gets compiled in regcomp.c to
* access data that we don't want to duplicate.
*/
#ifdef PERL_EXT_RE_BUILD
#include "re_top.h"
#endif
#include "EXTERN.h"
#define PERL_IN_REGEX_ENGINE
#define PERL_IN_REGCOMP_ANY
#define PERL_IN_REGCOMP_C
#include "perl.h"
#ifdef PERL_IN_XSUB_RE
# include "re_comp.h"
EXTERN_C const struct regexp_engine my_reg_engine;
EXTERN_C const struct regexp_engine wild_reg_engine;
#else
# include "regcomp.h"
#endif
#include "invlist_inline.h"
#include "unicode_constants.h"
#include "regcomp_internal.h"
/* =========================================================
* BEGIN edit_distance stuff.
*
* This calculates how many single character changes of any type are needed to
* transform a string into another one. It is taken from version 3.1 of
*
* https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
*/
/* Our unsorted dictionary linked list. */
/* Note we use UVs, not chars. */
struct dictionary{
UV key;
UV value;
struct dictionary* next;
};
typedef struct dictionary item;
PERL_STATIC_INLINE item*
push(UV key, item* curr)
{
item* head;
Newx(head, 1, item);
head->key = key;
head->value = 0;
head->next = curr;
return head;
}
PERL_STATIC_INLINE item*
find(item* head, UV key)
{
item* iterator = head;
while (iterator){
if (iterator->key == key){
return iterator;
}
iterator = iterator->next;
}
return NULL;
}
PERL_STATIC_INLINE item*
uniquePush(item* head, UV key)
{
item* iterator = head;
while (iterator){
if (iterator->key == key) {
return head;
}
iterator = iterator->next;
}
return push(key, head);
}
PERL_STATIC_INLINE void
dict_free(item* head)
{
item* iterator = head;
while (iterator) {
item* temp = iterator;
iterator = iterator->next;
Safefree(temp);
}
head = NULL;
}
/* End of Dictionary Stuff */
/* All calculations/work are done here */
STATIC int
S_edit_distance(const UV* src,
const UV* tgt,
const STRLEN x, /* length of src[] */
const STRLEN y, /* length of tgt[] */
const SSize_t maxDistance
)
{
item *head = NULL;
UV swapCount, swapScore, targetCharCount, i, j;
UV *scores;
UV score_ceil = x + y;
PERL_ARGS_ASSERT_EDIT_DISTANCE;
/* initialize matrix start values */
Newx(scores, ( (x + 2) * (y + 2)), UV);
scores[0] = score_ceil;
scores[1 * (y + 2) + 0] = score_ceil;
scores[0 * (y + 2) + 1] = score_ceil;
scores[1 * (y + 2) + 1] = 0;
head = uniquePush(uniquePush(head, src[0]), tgt[0]);
/* work loops */
/* i = src index */
/* j = tgt index */
for (i=1;i<=x;i++) {
if (i < x)
head = uniquePush(head, src[i]);
scores[(i+1) * (y + 2) + 1] = i;
scores[(i+1) * (y + 2) + 0] = score_ceil;
swapCount = 0;
for (j=1;j<=y;j++) {
if (i == 1) {
if(j < y)
head = uniquePush(head, tgt[j]);
scores[1 * (y + 2) + (j + 1)] = j;
scores[0 * (y + 2) + (j + 1)] = score_ceil;
}
targetCharCount = find(head, tgt[j-1])->value;
swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
if (src[i-1] != tgt[j-1]){
scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
}
else {
swapCount = j;
scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
}
}
find(head, src[i-1])->value = i;
}
{
IV score = scores[(x+1) * (y + 2) + (y + 1)];
dict_free(head);
Safefree(scores);
return (maxDistance != 0 && maxDistance < score)?(-1):score;
}
}
/* END of edit_distance() stuff
* ========================================================= */
#ifdef PERL_RE_BUILD_AUX
/* add a data member to the struct reg_data attached to this regex, it should
* always return a non-zero return. the 's' argument is the type of the items
* being added and the n is the number of items. The length of 's' should match
* the number of items. */
U32
Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
{
U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1;
PERL_ARGS_ASSERT_REG_ADD_DATA;
/* in the below expression we have (count + n - 1), the minus one is there
* because the struct that we allocate already contains a slot for 1 data
* item, so we do not need to allocate it the first time. IOW, the
* sizeof(*RExC_rxi->data) already accounts for one of the elements we need
* to allocate. See struct reg_data in regcomp.h
*/
Renewc(RExC_rxi->data,
sizeof(*RExC_rxi->data) + (sizeof(void*) * (count + n - 1)),
char, struct reg_data);
/* however in the data->what expression we use (count + n) and do not
* subtract one from the result because the data structure contains a
* pointer to an array, and does not allocate the first element as part of
* the data struct. */
if (count > 1)
Renew(RExC_rxi->data->what, (count + n), U8);
else {
/* when count == 1 it means we have not initialized anything.
* we always fill the 0 slot of the data array with a '%' entry, which
* means "zero" (all the other types are letters) which exists purely
* so the return from reg_add_data is ALWAYS true, so we can tell it apart
* from a "no value" idx=0 in places where we would return an index
* into reg_add_data. This is particularly important with the new "single
* pass, usually, but not always" strategy that we use, where the code
* will use a 0 to represent "not able to compute this yet".
*/
Newx(RExC_rxi->data->what, n+1, U8);
/* fill in the placeholder slot of 0 with a what of '%', we use
* this because it sorta looks like a zero (0/0) and it is not a letter
* like any of the other "whats", this type should never be created
* any other way but here. '%' happens to also not appear in this
* file for any other reason (at the time of writing this comment)*/
RExC_rxi->data->what[0]= '%';
RExC_rxi->data->data[0]= NULL;
}
RExC_rxi->data->count = count + n;
Copy(s, RExC_rxi->data->what + count, n, U8);
assert(count>0);
return count;
}
#endif /* PERL_RE_BUILD_AUX */
/*XXX: todo make this not included in a non debugging perl, but appears to be
* used anyway there, in 'use re' */
#ifndef PERL_IN_XSUB_RE
void
Perl_reginitcolors(pTHX)
{
const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
if (s) {
char *t = savepv(s);
int i = 0;
PL_colors[0] = t;
while (++i < 6) {
t = strchr(t, '\t');
if (t) {
*t = '\0';
PL_colors[i] = ++t;
}
else
PL_colors[i] = t = (char *)"";
}
} else {
int i = 0;
while (i < 6)
PL_colors[i++] = (char *)"";
}
PL_colorset = 1;
}
#endif
#ifdef TRIE_STUDY_OPT
/* search for "restudy" in this file for a detailed explanation */
#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
STMT_START { \
if ( \
(data.flags & SCF_TRIE_RESTUDY) \
&& ! restudied++ \
) { \
dOsomething; \
goto reStudy; \
} \
} STMT_END
#else
#define CHECK_RESTUDY_GOTO_butfirst
#endif
#ifndef PERL_IN_XSUB_RE
/* return the currently in-scope regex engine (or the default if none) */
regexp_engine const *
Perl_current_re_engine(pTHX)
{
if (IN_PERL_COMPILETIME) {
HV * const table = GvHV(PL_hintgv);
SV **ptr;
if (!table || !(PL_hints & HINT_LOCALIZE_HH))
return &PL_core_reg_engine;
ptr = hv_fetchs(table, "regcomp", FALSE);
if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
return &PL_core_reg_engine;
return INT2PTR(regexp_engine*, SvIV(*ptr));
}
else {
SV *ptr;
if (!PL_curcop->cop_hints_hash)
return &PL_core_reg_engine;
ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
return &PL_core_reg_engine;
return INT2PTR(regexp_engine*, SvIV(ptr));
}
}
/*
* pregcomp - compile a regular expression into internal code
*
* Decides which engine's compiler to call based on the hint currently in
* scope
*/
REGEXP *
Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
{
regexp_engine const *eng = current_re_engine();
DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_PREGCOMP;
/* Dispatch a request to compile a regexp to correct regexp engine. */
DEBUG_COMPILE_r({
Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
PTR2UV(eng));
});
return CALLREGCOMP_ENG(eng, pattern, flags);
}
#endif
/*
=for apidoc re_compile
Compile the regular expression pattern C<pattern>, returning a pointer to the
compiled object for later matching with the internal regex engine.
This function is typically used by a custom regexp engine C<.comp()> function
to hand off to the core regexp engine those patterns it doesn't want to handle
itself (typically passing through the same flags it was called with). In
almost all other cases, a regexp should be compiled by calling L</C<pregcomp>>
to compile using the currently active regexp engine.
If C<pattern> is already a C<REGEXP>, this function does nothing but return a
pointer to the input. Otherwise the PV is extracted and treated like a string
representing a pattern. See L<perlre>.
The possible flags for C<rx_flags> are documented in L<perlreapi>. Their names
all begin with C<RXf_>.
=cut
* public entry point for the perl core's own regex compiling code.
* It's actually a wrapper for Perl_re_op_compile that only takes an SV
* pattern rather than a list of OPs, and uses the internal engine rather
* than the current one */
REGEXP *
Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
{
SV *pat = pattern; /* defeat constness! */
PERL_ARGS_ASSERT_RE_COMPILE;
return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
#ifdef PERL_IN_XSUB_RE
&my_reg_engine,
#else
&PL_core_reg_engine,
#endif
NULL, NULL, rx_flags, 0);
}
static void
S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
{
int n;
if (--cbs->refcnt > 0)
return;
for (n = 0; n < cbs->count; n++) {
REGEXP *rx = cbs->cb[n].src_regex;
if (rx) {
cbs->cb[n].src_regex = NULL;
SvREFCNT_dec_NN(rx);
}
}
Safefree(cbs->cb);
Safefree(cbs);
}
/* Ensure that there are at least 'required' spare code block slots
* available, using a simple doubling */
static void
S_grow_code_blocks(pTHX_ struct reg_code_blocks *cbs, int required)
{
required += cbs->count;
if (required < 1)
return;
if (required < cbs->size)
return;
int new_size = cbs->size;
if (new_size < 1)
new_size = 1;
while (new_size < required)
new_size *= 2;
Renew(cbs->cb, new_size, struct reg_code_block);
cbs->size = new_size;
}
static struct reg_code_blocks *
S_alloc_code_blocks(pTHX_ int ncode)
{
struct reg_code_blocks *cbs;
Newx(cbs, 1, struct reg_code_blocks);
cbs->size = 0;
cbs->count = 0;
cbs->cb = NULL;
cbs->refcnt = 1;
SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
S_grow_code_blocks(aTHX_ cbs, ncode);
return cbs;
}
/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
* blocks, recalculate the indices. Update pat_p and plen_p in-place to
* point to the realloced string and length.
*
* This is essentially a copy of Perl_bytes_to_utf8() with the code index
* stuff added */
static void
S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
char **pat_p, STRLEN *plen_p)
{
U8 *const src = (U8*)*pat_p;
U8 *dst, *d;
int n=0;
STRLEN s = 0;
bool do_end = 0;
DECLARE_AND_GET_RE_DEBUG_FLAGS;
DEBUG_PARSE_r(Perl_re_printf( aTHX_
"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
int nblocks = 0;
if (pRExC_state->code_blocks)
nblocks = pRExC_state->code_blocks->count;
/* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
d = dst;
while (s < *plen_p) {
append_utf8_from_native_byte(src[s], &d);
if (n < nblocks) {
assert(pRExC_state->code_blocks);
if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
pRExC_state->code_blocks->cb[n].start = d - dst - 1;
assert(*(d - 1) == '(');
do_end = 1;
}
else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
pRExC_state->code_blocks->cb[n].end = d - dst - 1;
assert(*(d - 1) == ')');
do_end = 0;
n++;
}
}
s++;
}
*d = '\0';
*plen_p = d - dst;
*pat_p = (char*) dst;
SAVEFREEPV(*pat_p);
RExC_orig_utf8 = RExC_utf8 = 1;
}
/* S_concat_pat(): concatenate a list of args to the pattern string pat,
* while recording any code block indices, and handling overloading,
* nested qr// objects etc. If pat is null, it will allocate a new
* string, or just return the first arg, if there's only one.
*
* Returns the malloced/updated pat.
* patternp and pat_count is the array of SVs to be concatted;
* oplist is the optional list of ops that generated the SVs;
* recompile_p is a pointer to a boolean that will be set if
* the regex will need to be recompiled.
* delim, if non-null is an SV that will be inserted between each element
*/
static SV*
S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
SV *pat, SV ** const patternp, int pat_count,
OP *oplist, bool *recompile_p, SV *delim)
{
SV **svp;
bool use_delim = FALSE;
bool alloced = FALSE;
/* if we know we have at least two args, create an empty string,
* then concatenate args to that. For no args, return an empty string */
if (!pat && pat_count != 1) {
pat = newSVpvs("");
SAVEFREESV(pat);
alloced = TRUE;
}
for (svp = patternp; svp < patternp + pat_count; svp++) {
SV *sv;
SV *rx = NULL;
STRLEN orig_patlen = 0;
bool code = 0;
SV *msv = use_delim ? delim : *svp;
if (!msv) msv = &PL_sv_undef;
/* if we've got a delimiter, we go round the loop twice for each
* svp slot (except the last), using the delimiter the second
* time round */
if (use_delim) {
svp--;
use_delim = FALSE;
}
else if (delim)
use_delim = TRUE;
if (SvTYPE(msv) == SVt_PVAV) {
/* we've encountered an interpolated array within
* the pattern, e.g. /...@a..../. Expand the list of elements,
* then recursively append elements.
* The code in this block is based on S_pushav() */
AV *const av = (AV*)msv;
const SSize_t maxarg = AvFILL(av) + 1;
SV **array;
if (oplist) {
assert(oplist->op_type == OP_PADAV
|| oplist->op_type == OP_RV2AV);
oplist = OpSIBLING(oplist);
}
if (SvRMAGICAL(av)) {
SSize_t i;
Newx(array, maxarg, SV*);
SAVEFREEPV(array);
for (i=0; i < maxarg; i++) {
SV ** const svp = av_fetch(av, i, FALSE);
array[i] = svp ? *svp : &PL_sv_undef;
}
}
else
array = AvARRAY(av);
if (maxarg > 0) {
pat = S_concat_pat(aTHX_ pRExC_state, pat,
array, maxarg, NULL, recompile_p,
/* $" */
GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
}
else if (!pat) {
pat = newSVpvs_flags("", SVs_TEMP);
}
continue;
}
/* we make the assumption here that each op in the list of
* op_siblings maps to one SV pushed onto the stack,
* except for code blocks, with have both an OP_NULL and
* an OP_CONST.
* This allows us to match up the list of SVs against the
* list of OPs to find the next code block.
*
* Note that PUSHMARK PADSV PADSV ..
* is optimised to
* PADRANGE PADSV PADSV ..
* so the alignment still works. */
if (oplist) {
if (oplist->op_type == OP_NULL
&& (oplist->op_flags & OPf_SPECIAL))
{
/* process next literal code block */
struct reg_code_blocks *cbs = pRExC_state->code_blocks;
S_grow_code_blocks(aTHX_ cbs, 1);
int n = cbs->count;
cbs->cb[n].start = pat ? SvCUR(pat) : 0;
cbs->cb[n].block = oplist;
cbs->cb[n].src_regex = NULL;
cbs->count++;
code = 1;
oplist = OpSIBLING(oplist); /* skip CONST */
assert(oplist);
}
oplist = OpSIBLING(oplist);
}
/* apply magic and QR overloading to arg */
SvGETMAGIC(msv);
if (SvROK(msv) && SvAMAGIC(msv)) {
SV *sv = AMG_CALLunary(msv, regexp_amg);
if (sv) {
if (SvROK(sv))
sv = SvRV(sv);
if (SvTYPE(sv) != SVt_REGEXP)
Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
msv = sv;
}
}
/* try concatenation overload ... */
if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
(sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
{
sv_setsv(pat, sv);
/* pat now represents the return value of overloaded
* concatenation of of two values:
* 1) all the components previously concatenated;
* 2) the current pattern element.
* Since the return value can be anything, any previously
* found code-blocks (even literal ones) should be discarded.
* For example, in:
* qr/(?{A})$obj/
* the overloaded concatenation of '(?{A})' and $obj
* could return anything, and not necessarily the literal
* code block. So throw away any previously found code blocks,
* and so any code-block bits in the returned string will be
* treated as run-time.
*/
struct reg_code_blocks *cbs = pRExC_state->code_blocks;
if (cbs) {
for (int n = 0; n < cbs->count; n++) {
SvREFCNT_dec(cbs->cb[n].src_regex);
}
cbs->count = 0;
}
}
else {
/* ... or failing that, try "" overload */
while (SvAMAGIC(msv)
&& (sv = AMG_CALLunary(msv, string_amg))
&& sv != msv
&& !( SvROK(msv)
&& SvROK(sv)
&& SvRV(msv) == SvRV(sv))
) {
msv = sv;
SvGETMAGIC(msv);
}
if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
msv = SvRV(msv);
if (pat) {
/* this is a partially unrolled
* sv_catsv_nomg(pat, msv);
* that allows us to adjust code block indices if
* needed */
STRLEN dlen;
char *dst = SvPV_force_nomg(pat, dlen);
orig_patlen = dlen;
if (SvUTF8(msv) && !SvUTF8(pat)) {
S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen);
sv_setpvn(pat, dst, dlen);
SvUTF8_on(pat);
}
sv_catsv_nomg(pat, msv);
rx = msv;
}
else {
/* We have only one SV to process, but we need to verify
* it is properly null terminated or we will fail asserts
* later. In theory we probably shouldn't get such SV's,
* but if we do we should handle it gracefully. */
if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
/* not a string, or a string with a trailing null */
pat = msv;
} else {
/* a string with no trailing null, we need to copy it
* so it has a trailing null */
pat = sv_2mortal(newSVsv(msv));
}
}
/* was this pattern element a literal code block? */
if (code) {
struct reg_code_blocks *cbs = pRExC_state->code_blocks;
cbs->cb[cbs->count - 1].end = SvCUR(pat) - 1;
}
}
/* extract any code blocks within any embedded qr//'s */
if (rx && SvTYPE(rx) == SVt_REGEXP
&& RX_ENGINE((REGEXP*)rx)->op_comp)
{
RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
if (ri->code_blocks && ri->code_blocks->count) {
int i;
/* the presence of an embedded qr// with code means
* we should always recompile: the text of the
* qr// may not have changed, but it may be a
* different closure than last time */
*recompile_p = 1;
if (pRExC_state->code_blocks)
S_grow_code_blocks(aTHX_ pRExC_state->code_blocks,
ri->code_blocks->count);
else
pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
ri->code_blocks->count);
for (i=0; i < ri->code_blocks->count; i++) {
struct reg_code_block *src, *dst;
STRLEN offset = orig_patlen
+ ReANY((REGEXP *)rx)->pre_prefix;
src = &ri->code_blocks->cb[i];
dst = &pRExC_state->code_blocks->cb[
pRExC_state->code_blocks->count++];
dst->start = src->start + offset;
dst->end = src->end + offset;
dst->block = src->block;
dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
src->src_regex
? src->src_regex
: (REGEXP*)rx);
}
}
}
} /* for (patternp) */
/* avoid calling magic multiple times on a single element e.g. =~ $qr */
if (alloced)
SvSETMAGIC(pat);
return pat;
}
/* see if there are any run-time code blocks in the pattern.
* False positives are allowed */
static bool
S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
char *pat, STRLEN plen)
{
int n = 0;
STRLEN s;
PERL_UNUSED_CONTEXT;
for (s = 0; s < plen; s++) {
if ( pRExC_state->code_blocks
&& n < pRExC_state->code_blocks->count
&& s == pRExC_state->code_blocks->cb[n].start)
{
s = pRExC_state->code_blocks->cb[n].end;
n++;
continue;
}
/* TODO ideally should handle [..], (#..), /#.../x to reduce false
* positives here */
if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
(pat[s+2] == '{'
|| (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
)
return 1;
}
return 0;
}
/* Handle run-time code blocks. We will already have compiled any direct
* or indirect literal code blocks. Now, take the pattern 'pat' and make a
* copy of it, but with any literal code blocks blanked out and
* appropriate chars escaped; then feed it into
*
* eval "qr'modified_pattern'"
*
* For example,
*
* a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
*
* becomes
*
* qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
*
* After eval_sv()-ing that, grab any new code blocks from the returned qr
* and merge them with any code blocks of the original regexp.
*
* If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
* instead, just save the qr and return FALSE; this tells our caller that
* the original pattern needs upgrading to utf8.
*/
static bool
S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
char *pat, STRLEN plen)
{
SV *qr;
DECLARE_AND_GET_RE_DEBUG_FLAGS;
if (pRExC_state->runtime_code_qr) {
/* this is the second time we've been called; this should
* only happen if the main pattern got upgraded to utf8
* during compilation; re-use the qr we compiled first time
* round (which should be utf8 too)
*/
qr = pRExC_state->runtime_code_qr;
pRExC_state->runtime_code_qr = NULL;
assert(RExC_utf8 && SvUTF8(qr));
}
else {
int n = 0;
STRLEN s;
char *p, *newpat;
int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
SV *sv, *qr_ref;
dSP;
/* determine how many extra chars we need for ' and \ escaping */
for (s = 0; s < plen; s++) {
if (pat[s] == '\'' || pat[s] == '\\')
newlen++;
}
Newx(newpat, newlen, char);
p = newpat;
*p++ = 'q'; *p++ = 'r'; *p++ = '\'';
for (s = 0; s < plen; s++) {
if ( pRExC_state->code_blocks
&& n < pRExC_state->code_blocks->count
&& s == pRExC_state->code_blocks->cb[n].start)
{
/* blank out literal code block so that they aren't
* recompiled: eg change from/to:
* /(?{xyz})/
* /(?=====)/
* and
* /(??{xyz})/
* /(?======)/
* and
* /(?(?{xyz}))/
* /(?(?=====))/
*/
assert(pat[s] == '(');
assert(pat[s+1] == '?');
*p++ = '(';
*p++ = '?';
s += 2;
while (s < pRExC_state->code_blocks->cb[n].end) {
*p++ = '=';
s++;
}
*p++ = ')';
n++;
continue;
}
if (pat[s] == '\'' || pat[s] == '\\')
*p++ = '\\';
*p++ = pat[s];
}
*p++ = '\'';
if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
*p++ = 'x';
if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
*p++ = 'x';
}
}