ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/svn/ircd-hybrid/lib/pcre/pcre_exec.c
Revision: 32
Committed: Sun Oct 2 20:41:23 2005 UTC (18 years, 5 months ago) by knight
Content type: text/x-csrc
File size: 80426 byte(s)
Log Message:
- svn:keywords

File Contents

# Content
1 /* $Id$ */
2
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
6
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
11 Copyright (c) 1997-2005 University of Cambridge
12
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
40 */
41
42
43 /* This module contains pcre_exec(), the externally visible function that does
44 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
45 possible. There are also some static supporting functions. */
46
47
48 #include "pcre_internal.h"
49
50
51 /* Structure for building a chain of data that actually lives on the
52 stack, for holding the values of the subject pointer at the start of each
53 subpattern, so as to detect when an empty string has been matched by a
54 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
55 are on the heap, not on the stack. */
56
57 typedef struct eptrblock {
58 struct eptrblock *epb_prev;
59 const uschar *epb_saved_eptr;
60 } eptrblock;
61
62 /* Flag bits for the match() function */
63
64 #define match_condassert 0x01 /* Called to check a condition assertion */
65 #define match_isgroup 0x02 /* Set if start of bracketed group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Maximum number of ints of offset to save on the stack for recursive calls.
74 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
75 because the offset vector is always a multiple of 3 long. */
76
77 #define REC_STACK_SAVE_MAX 30
78
79 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
80
81 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
82 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
83
84
85
86 #ifdef DEBUG
87 /*************************************************
88 * Debugging function to print chars *
89 *************************************************/
90
91 /* Print a sequence of chars in printable format, stopping at the end of the
92 subject if the requested.
93
94 Arguments:
95 p points to characters
96 length number to print
97 is_subject TRUE if printing from within md->start_subject
98 md pointer to matching data block, if is_subject is TRUE
99
100 Returns: nothing
101 */
102
103 static void
104 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
105 {
106 int c;
107 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
108 while (length-- > 0)
109 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
110 }
111 #endif
112
113
114
115 /*************************************************
116 * Match a back-reference *
117 *************************************************/
118
119 /* If a back reference hasn't been set, the length that is passed is greater
120 than the number of characters left in the string, so the match fails.
121
122 Arguments:
123 offset index into the offset vector
124 eptr points into the subject
125 length length to be matched
126 md points to match data block
127 ims the ims flags
128
129 Returns: TRUE if matched
130 */
131
132 static BOOL
133 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
134 unsigned long int ims)
135 {
136 const uschar *p = md->start_subject + md->offset_vector[offset];
137
138 #ifdef DEBUG
139 if (eptr >= md->end_subject)
140 printf("matching subject <null>");
141 else
142 {
143 printf("matching subject ");
144 pchars(eptr, length, TRUE, md);
145 }
146 printf(" against backref ");
147 pchars(p, length, FALSE, md);
148 printf("\n");
149 #endif
150
151 /* Always fail if not enough characters left */
152
153 if (length > md->end_subject - eptr) return FALSE;
154
155 /* Separate the caselesss case for speed */
156
157 if ((ims & PCRE_CASELESS) != 0)
158 {
159 while (length-- > 0)
160 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
161 }
162 else
163 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
164
165 return TRUE;
166 }
167
168
169
170 /***************************************************************************
171 ****************************************************************************
172 RECURSION IN THE match() FUNCTION
173
174 The match() function is highly recursive. Some regular expressions can cause
175 it to recurse thousands of times. I was writing for Unix, so I just let it
176 call itself recursively. This uses the stack for saving everything that has
177 to be saved for a recursive call. On Unix, the stack can be large, and this
178 works fine.
179
180 It turns out that on non-Unix systems there are problems with programs that
181 use a lot of stack. (This despite the fact that every last chip has oodles
182 of memory these days, and techniques for extending the stack have been known
183 for decades.) So....
184
185 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
186 calls by keeping local variables that need to be preserved in blocks of memory
187 obtained from malloc instead instead of on the stack. Macros are used to
188 achieve this so that the actual code doesn't look very different to what it
189 always used to.
190 ****************************************************************************
191 ***************************************************************************/
192
193
194 /* These versions of the macros use the stack, as normal */
195
196 #ifndef NO_RECURSE
197 #define REGISTER register
198 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
199 #define RRETURN(ra) return ra
200 #else
201
202
203 /* These versions of the macros manage a private stack on the heap. Note
204 that the rd argument of RMATCH isn't actually used. It's the md argument of
205 match(), which never changes. */
206
207 #define REGISTER
208
209 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
210 {\
211 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
212 if (setjmp(frame->Xwhere) == 0)\
213 {\
214 newframe->Xeptr = ra;\
215 newframe->Xecode = rb;\
216 newframe->Xoffset_top = rc;\
217 newframe->Xims = re;\
218 newframe->Xeptrb = rf;\
219 newframe->Xflags = rg;\
220 newframe->Xprevframe = frame;\
221 frame = newframe;\
222 DPRINTF(("restarting from line %d\n", __LINE__));\
223 goto HEAP_RECURSE;\
224 }\
225 else\
226 {\
227 DPRINTF(("longjumped back to line %d\n", __LINE__));\
228 frame = md->thisframe;\
229 rx = frame->Xresult;\
230 }\
231 }
232
233 #define RRETURN(ra)\
234 {\
235 heapframe *newframe = frame;\
236 frame = newframe->Xprevframe;\
237 (pcre_stack_free)(newframe);\
238 if (frame != NULL)\
239 {\
240 frame->Xresult = ra;\
241 md->thisframe = frame;\
242 longjmp(frame->Xwhere, 1);\
243 }\
244 return ra;\
245 }
246
247
248 /* Structure for remembering the local variables in a private frame */
249
250 typedef struct heapframe {
251 struct heapframe *Xprevframe;
252
253 /* Function arguments that may change */
254
255 const uschar *Xeptr;
256 const uschar *Xecode;
257 int Xoffset_top;
258 long int Xims;
259 eptrblock *Xeptrb;
260 int Xflags;
261
262 /* Function local variables */
263
264 const uschar *Xcallpat;
265 const uschar *Xcharptr;
266 const uschar *Xdata;
267 const uschar *Xnext;
268 const uschar *Xpp;
269 const uschar *Xprev;
270 const uschar *Xsaved_eptr;
271
272 recursion_info Xnew_recursive;
273
274 BOOL Xcur_is_word;
275 BOOL Xcondition;
276 BOOL Xminimize;
277 BOOL Xprev_is_word;
278
279 unsigned long int Xoriginal_ims;
280
281 int Xctype;
282 int Xfc;
283 int Xfi;
284 int Xlength;
285 int Xmax;
286 int Xmin;
287 int Xnumber;
288 int Xoffset;
289 int Xop;
290 int Xsave_capture_last;
291 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
292 int Xstacksave[REC_STACK_SAVE_MAX];
293
294 eptrblock Xnewptrb;
295
296 /* Place to pass back result, and where to jump back to */
297
298 int Xresult;
299 jmp_buf Xwhere;
300
301 } heapframe;
302
303 #endif
304
305
306 /***************************************************************************
307 ***************************************************************************/
308
309
310
311 /*************************************************
312 * Match from current position *
313 *************************************************/
314
315 /* On entry ecode points to the first opcode, and eptr to the first character
316 in the subject string, while eptrb holds the value of eptr at the start of the
317 last bracketed group - used for breaking infinite loops matching zero-length
318 strings. This function is called recursively in many circumstances. Whenever it
319 returns a negative (error) response, the outer incarnation must also return the
320 same response.
321
322 Performance note: It might be tempting to extract commonly used fields from the
323 md structure (e.g. utf8, end_subject) into individual variables to improve
324 performance. Tests using gcc on a SPARC disproved this; in the first case, it
325 made performance worse.
326
327 Arguments:
328 eptr pointer in subject
329 ecode position in code
330 offset_top current top pointer
331 md pointer to "static" info for the match
332 ims current /i, /m, and /s options
333 eptrb pointer to chain of blocks containing eptr at start of
334 brackets - for testing for empty matches
335 flags can contain
336 match_condassert - this is an assertion condition
337 match_isgroup - this is the start of a bracketed group
338
339 Returns: MATCH_MATCH if matched ) these values are >= 0
340 MATCH_NOMATCH if failed to match )
341 a negative PCRE_ERROR_xxx value if aborted by an error condition
342 (e.g. stopped by recursion limit)
343 */
344
345 static int
346 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
347 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
348 int flags)
349 {
350 /* These variables do not need to be preserved over recursion in this function,
351 so they can be ordinary variables in all cases. Mark them with "register"
352 because they are used a lot in loops. */
353
354 register int rrc; /* Returns from recursive calls */
355 register int i; /* Used for loops not involving calls to RMATCH() */
356 register int c; /* Character values not kept over RMATCH() calls */
357 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
358
359 /* When recursion is not being used, all "local" variables that have to be
360 preserved over calls to RMATCH() are part of a "frame" which is obtained from
361 heap storage. Set up the top-level frame here; others are obtained from the
362 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
363
364 #ifdef NO_RECURSE
365 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
366 frame->Xprevframe = NULL; /* Marks the top level */
367
368 /* Copy in the original argument variables */
369
370 frame->Xeptr = eptr;
371 frame->Xecode = ecode;
372 frame->Xoffset_top = offset_top;
373 frame->Xims = ims;
374 frame->Xeptrb = eptrb;
375 frame->Xflags = flags;
376
377 /* This is where control jumps back to to effect "recursion" */
378
379 HEAP_RECURSE:
380
381 /* Macros make the argument variables come from the current frame */
382
383 #define eptr frame->Xeptr
384 #define ecode frame->Xecode
385 #define offset_top frame->Xoffset_top
386 #define ims frame->Xims
387 #define eptrb frame->Xeptrb
388 #define flags frame->Xflags
389
390 /* Ditto for the local variables */
391
392 #define callpat frame->Xcallpat
393 #define data frame->Xdata
394 #define next frame->Xnext
395 #define pp frame->Xpp
396 #define prev frame->Xprev
397 #define saved_eptr frame->Xsaved_eptr
398
399 #define new_recursive frame->Xnew_recursive
400
401 #define cur_is_word frame->Xcur_is_word
402 #define condition frame->Xcondition
403 #define minimize frame->Xminimize
404 #define prev_is_word frame->Xprev_is_word
405
406 #define original_ims frame->Xoriginal_ims
407
408 #define ctype frame->Xctype
409 #define fc frame->Xfc
410 #define fi frame->Xfi
411 #define length frame->Xlength
412 #define max frame->Xmax
413 #define min frame->Xmin
414 #define number frame->Xnumber
415 #define offset frame->Xoffset
416 #define op frame->Xop
417 #define save_capture_last frame->Xsave_capture_last
418 #define save_offset1 frame->Xsave_offset1
419 #define save_offset2 frame->Xsave_offset2
420 #define save_offset3 frame->Xsave_offset3
421 #define stacksave frame->Xstacksave
422
423 #define newptrb frame->Xnewptrb
424
425 /* When recursion is being used, local variables are allocated on the stack and
426 get preserved during recursion in the normal way. In this environment, fi and
427 i, and fc and c, can be the same variables. */
428
429 #else
430 #define fi i
431 #define fc c
432
433
434 const uschar *callpat; /* them within each of those blocks. */
435 const uschar *data; /* However, in order to accommodate the */
436 const uschar *next; /* version of this code that uses an */
437 const uschar *pp; /* external "stack" implemented on the */
438 const uschar *prev; /* heap, it is easier to declare them */
439 const uschar *saved_eptr; /* all here, so the declarations can */
440 /* be cut out in a block. The only */
441 recursion_info new_recursive; /* declarations within blocks below are */
442 /* for variables that do not have to */
443 BOOL cur_is_word; /* be preserved over a recursive call */
444 BOOL condition; /* to RMATCH(). */
445 BOOL minimize;
446 BOOL prev_is_word;
447
448 unsigned long int original_ims;
449
450
451 int ctype;
452 int length;
453 int max;
454 int min;
455 int number;
456 int offset;
457 int op;
458 int save_capture_last;
459 int save_offset1, save_offset2, save_offset3;
460 int stacksave[REC_STACK_SAVE_MAX];
461
462 eptrblock newptrb;
463 #endif
464
465 /* OK, now we can get on with the real code of the function. Recursion is
466 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
467 these just turn into a recursive call to match() and a "return", respectively.
468 However, RMATCH isn't like a function call because it's quite a complicated
469 macro. It has to be used in one particular way. This shouldn't, however, impact
470 performance when true recursion is being used. */
471
472 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
473
474 original_ims = ims; /* Save for resetting on ')' */
475 utf8 = md->utf8; /* Local copy of the flag */
476
477 /* At the start of a bracketed group, add the current subject pointer to the
478 stack of such pointers, to be re-instated at the end of the group when we hit
479 the closing ket. When match() is called in other circumstances, we don't add to
480 this stack. */
481
482 if ((flags & match_isgroup) != 0)
483 {
484 newptrb.epb_prev = eptrb;
485 newptrb.epb_saved_eptr = eptr;
486 eptrb = &newptrb;
487 }
488
489 /* Now start processing the operations. */
490
491 for (;;)
492 {
493 op = *ecode;
494 minimize = FALSE;
495
496 /* For partial matching, remember if we ever hit the end of the subject after
497 matching at least one subject character. */
498
499 if (md->partial &&
500 eptr >= md->end_subject &&
501 eptr > md->start_match)
502 md->hitend = TRUE;
503
504 /* Opening capturing bracket. If there is space in the offset vector, save
505 the current subject position in the working slot at the top of the vector. We
506 mustn't change the current values of the data slot, because they may be set
507 from a previous iteration of this group, and be referred to by a reference
508 inside the group.
509
510 If the bracket fails to match, we need to restore this value and also the
511 values of the final offsets, in case they were set by a previous iteration of
512 the same bracket.
513
514 If there isn't enough space in the offset vector, treat this as if it were a
515 non-capturing bracket. Don't worry about setting the flag for the error case
516 here; that is handled in the code for KET. */
517
518 if (op > OP_BRA)
519 {
520 number = op - OP_BRA;
521
522 /* For extended extraction brackets (large number), we have to fish out the
523 number from a dummy opcode at the start. */
524
525 if (number > EXTRACT_BASIC_MAX)
526 number = GET2(ecode, 2+LINK_SIZE);
527 offset = number << 1;
528
529 #ifdef DEBUG
530 printf("start bracket %d subject=", number);
531 pchars(eptr, 16, TRUE, md);
532 printf("\n");
533 #endif
534
535 if (offset < md->offset_max)
536 {
537 save_offset1 = md->offset_vector[offset];
538 save_offset2 = md->offset_vector[offset+1];
539 save_offset3 = md->offset_vector[md->offset_end - number];
540 save_capture_last = md->capture_last;
541
542 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
543 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
544
545 do
546 {
547 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
548 match_isgroup);
549 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
550 md->capture_last = save_capture_last;
551 ecode += GET(ecode, 1);
552 }
553 while (*ecode == OP_ALT);
554
555 DPRINTF(("bracket %d failed\n", number));
556
557 md->offset_vector[offset] = save_offset1;
558 md->offset_vector[offset+1] = save_offset2;
559 md->offset_vector[md->offset_end - number] = save_offset3;
560
561 RRETURN(MATCH_NOMATCH);
562 }
563
564 /* Insufficient room for saving captured contents */
565
566 else op = OP_BRA;
567 }
568
569 /* Other types of node can be handled by a switch */
570
571 switch(op)
572 {
573 case OP_BRA: /* Non-capturing bracket: optimized */
574 DPRINTF(("start bracket 0\n"));
575 do
576 {
577 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
578 match_isgroup);
579 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
580 ecode += GET(ecode, 1);
581 }
582 while (*ecode == OP_ALT);
583 DPRINTF(("bracket 0 failed\n"));
584 RRETURN(MATCH_NOMATCH);
585
586 /* Conditional group: compilation checked that there are no more than
587 two branches. If the condition is false, skipping the first branch takes us
588 past the end if there is only one branch, but that's OK because that is
589 exactly what going to the ket would do. */
590
591 case OP_COND:
592 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
593 {
594 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
595 condition = (offset == CREF_RECURSE * 2)?
596 (md->recursive != NULL) :
597 (offset < offset_top && md->offset_vector[offset] >= 0);
598 RMATCH(rrc, eptr, ecode + (condition?
599 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
600 offset_top, md, ims, eptrb, match_isgroup);
601 RRETURN(rrc);
602 }
603
604 /* The condition is an assertion. Call match() to evaluate it - setting
605 the final argument TRUE causes it to stop at the end of an assertion. */
606
607 else
608 {
609 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
610 match_condassert | match_isgroup);
611 if (rrc == MATCH_MATCH)
612 {
613 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
614 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
615 }
616 else if (rrc != MATCH_NOMATCH)
617 {
618 RRETURN(rrc); /* Need braces because of following else */
619 }
620 else ecode += GET(ecode, 1);
621 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
622 match_isgroup);
623 RRETURN(rrc);
624 }
625 /* Control never reaches here */
626
627 /* Skip over conditional reference or large extraction number data if
628 encountered. */
629
630 case OP_CREF:
631 case OP_BRANUMBER:
632 ecode += 3;
633 break;
634
635 /* End of the pattern. If we are in a recursion, we should restore the
636 offsets appropriately and continue from after the call. */
637
638 case OP_END:
639 if (md->recursive != NULL && md->recursive->group_num == 0)
640 {
641 recursion_info *rec = md->recursive;
642 DPRINTF(("Hit the end in a (?0) recursion\n"));
643 md->recursive = rec->prevrec;
644 memmove(md->offset_vector, rec->offset_save,
645 rec->saved_max * sizeof(int));
646 md->start_match = rec->save_start;
647 ims = original_ims;
648 ecode = rec->after_call;
649 break;
650 }
651
652 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
653 string - backtracking will then try other alternatives, if any. */
654
655 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
656 md->end_match_ptr = eptr; /* Record where we ended */
657 md->end_offset_top = offset_top; /* and how many extracts were taken */
658 RRETURN(MATCH_MATCH);
659
660 /* Change option settings */
661
662 case OP_OPT:
663 ims = ecode[1];
664 ecode += 2;
665 DPRINTF(("ims set to %02lx\n", ims));
666 break;
667
668 /* Assertion brackets. Check the alternative branches in turn - the
669 matching won't pass the KET for an assertion. If any one branch matches,
670 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
671 start of each branch to move the current point backwards, so the code at
672 this level is identical to the lookahead case. */
673
674 case OP_ASSERT:
675 case OP_ASSERTBACK:
676 do
677 {
678 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
679 match_isgroup);
680 if (rrc == MATCH_MATCH) break;
681 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 ecode += GET(ecode, 1);
683 }
684 while (*ecode == OP_ALT);
685 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
686
687 /* If checking an assertion for a condition, return MATCH_MATCH. */
688
689 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
690
691 /* Continue from after the assertion, updating the offsets high water
692 mark, since extracts may have been taken during the assertion. */
693
694 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
695 ecode += 1 + LINK_SIZE;
696 offset_top = md->end_offset_top;
697 continue;
698
699 /* Negative assertion: all branches must fail to match */
700
701 case OP_ASSERT_NOT:
702 case OP_ASSERTBACK_NOT:
703 do
704 {
705 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
706 match_isgroup);
707 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
708 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
709 ecode += GET(ecode,1);
710 }
711 while (*ecode == OP_ALT);
712
713 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
714
715 ecode += 1 + LINK_SIZE;
716 continue;
717
718 /* Move the subject pointer back. This occurs only at the start of
719 each branch of a lookbehind assertion. If we are too close to the start to
720 move back, this match function fails. When working with UTF-8 we move
721 back a number of characters, not bytes. */
722
723 case OP_REVERSE:
724 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
725
726 {
727 eptr -= GET(ecode,1);
728 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
729 }
730
731 /* Skip to next op code */
732
733 ecode += 1 + LINK_SIZE;
734 break;
735
736 /* The callout item calls an external function, if one is provided, passing
737 details of the match so far. This is mainly for debugging, though the
738 function is able to force a failure. */
739
740 case OP_CALLOUT:
741 if (pcre_callout != NULL)
742 {
743 pcre_callout_block cb;
744 cb.version = 1; /* Version 1 of the callout block */
745 cb.callout_number = ecode[1];
746 cb.offset_vector = md->offset_vector;
747 cb.subject = (const char *)md->start_subject;
748 cb.subject_length = md->end_subject - md->start_subject;
749 cb.start_match = md->start_match - md->start_subject;
750 cb.current_position = eptr - md->start_subject;
751 cb.pattern_position = GET(ecode, 2);
752 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
753 cb.capture_top = offset_top/2;
754 cb.capture_last = md->capture_last;
755 cb.callout_data = md->callout_data;
756 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
757 if (rrc < 0) RRETURN(rrc);
758 }
759 ecode += 2 + 2*LINK_SIZE;
760 break;
761
762 /* Recursion either matches the current regex, or some subexpression. The
763 offset data is the offset to the starting bracket from the start of the
764 whole pattern. (This is so that it works from duplicated subpatterns.)
765
766 If there are any capturing brackets started but not finished, we have to
767 save their starting points and reinstate them after the recursion. However,
768 we don't know how many such there are (offset_top records the completed
769 total) so we just have to save all the potential data. There may be up to
770 65535 such values, which is too large to put on the stack, but using malloc
771 for small numbers seems expensive. As a compromise, the stack is used when
772 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
773 is used. A problem is what to do if the malloc fails ... there is no way of
774 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
775 values on the stack, and accept that the rest may be wrong.
776
777 There are also other values that have to be saved. We use a chained
778 sequence of blocks that actually live on the stack. Thanks to Robin Houston
779 for the original version of this logic. */
780
781 case OP_RECURSE:
782 {
783 callpat = md->start_code + GET(ecode, 1);
784 new_recursive.group_num = *callpat - OP_BRA;
785
786 /* For extended extraction brackets (large number), we have to fish out
787 the number from a dummy opcode at the start. */
788
789 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
790 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
791
792 /* Add to "recursing stack" */
793
794 new_recursive.prevrec = md->recursive;
795 md->recursive = &new_recursive;
796
797 /* Find where to continue from afterwards */
798
799 ecode += 1 + LINK_SIZE;
800 new_recursive.after_call = ecode;
801
802 /* Now save the offset data. */
803
804 new_recursive.saved_max = md->offset_end;
805 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
806 new_recursive.offset_save = stacksave;
807 else
808 {
809 new_recursive.offset_save =
810 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
811 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
812 }
813
814 memcpy(new_recursive.offset_save, md->offset_vector,
815 new_recursive.saved_max * sizeof(int));
816 new_recursive.save_start = md->start_match;
817 md->start_match = eptr;
818
819 /* OK, now we can do the recursion. For each top-level alternative we
820 restore the offset and recursion data. */
821
822 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
823 do
824 {
825 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
826 eptrb, match_isgroup);
827 if (rrc == MATCH_MATCH)
828 {
829 md->recursive = new_recursive.prevrec;
830 if (new_recursive.offset_save != stacksave)
831 (pcre_free)(new_recursive.offset_save);
832 RRETURN(MATCH_MATCH);
833 }
834 else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
835
836 md->recursive = &new_recursive;
837 memcpy(md->offset_vector, new_recursive.offset_save,
838 new_recursive.saved_max * sizeof(int));
839 callpat += GET(callpat, 1);
840 }
841 while (*callpat == OP_ALT);
842
843 DPRINTF(("Recursion didn't match\n"));
844 md->recursive = new_recursive.prevrec;
845 if (new_recursive.offset_save != stacksave)
846 (pcre_free)(new_recursive.offset_save);
847 RRETURN(MATCH_NOMATCH);
848 }
849 /* Control never reaches here */
850
851 /* "Once" brackets are like assertion brackets except that after a match,
852 the point in the subject string is not moved back. Thus there can never be
853 a move back into the brackets. Friedl calls these "atomic" subpatterns.
854 Check the alternative branches in turn - the matching won't pass the KET
855 for this kind of subpattern. If any one branch matches, we carry on as at
856 the end of a normal bracket, leaving the subject pointer. */
857
858 case OP_ONCE:
859 {
860 prev = ecode;
861 saved_eptr = eptr;
862
863 do
864 {
865 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
866 eptrb, match_isgroup);
867 if (rrc == MATCH_MATCH) break;
868 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
869 ecode += GET(ecode,1);
870 }
871 while (*ecode == OP_ALT);
872
873 /* If hit the end of the group (which could be repeated), fail */
874
875 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
876
877 /* Continue as from after the assertion, updating the offsets high water
878 mark, since extracts may have been taken. */
879
880 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
881
882 offset_top = md->end_offset_top;
883 eptr = md->end_match_ptr;
884
885 /* For a non-repeating ket, just continue at this level. This also
886 happens for a repeating ket if no characters were matched in the group.
887 This is the forcible breaking of infinite loops as implemented in Perl
888 5.005. If there is an options reset, it will get obeyed in the normal
889 course of events. */
890
891 if (*ecode == OP_KET || eptr == saved_eptr)
892 {
893 ecode += 1+LINK_SIZE;
894 break;
895 }
896
897 /* The repeating kets try the rest of the pattern or restart from the
898 preceding bracket, in the appropriate order. We need to reset any options
899 that changed within the bracket before re-running it, so check the next
900 opcode. */
901
902 if (ecode[1+LINK_SIZE] == OP_OPT)
903 {
904 ims = (ims & ~PCRE_IMS) | ecode[4];
905 DPRINTF(("ims set to %02lx at group repeat\n", ims));
906 }
907
908 if (*ecode == OP_KETRMIN)
909 {
910 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
911 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
912 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
913 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
914 }
915 else /* OP_KETRMAX */
916 {
917 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
919 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
920 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
921 }
922 }
923 RRETURN(MATCH_NOMATCH);
924
925 /* An alternation is the end of a branch; scan along to find the end of the
926 bracketed group and go to there. */
927
928 case OP_ALT:
929 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
930 break;
931
932 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
933 that it may occur zero times. It may repeat infinitely, or not at all -
934 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
935 repeat limits are compiled as a number of copies, with the optional ones
936 preceded by BRAZERO or BRAMINZERO. */
937
938 case OP_BRAZERO:
939 {
940 next = ecode+1;
941 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
942 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
943 do next += GET(next,1); while (*next == OP_ALT);
944 ecode = next + 1+LINK_SIZE;
945 }
946 break;
947
948 case OP_BRAMINZERO:
949 {
950 next = ecode+1;
951 do next += GET(next,1); while (*next == OP_ALT);
952 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
953 match_isgroup);
954 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
955 ecode++;
956 }
957 break;
958
959 /* End of a group, repeated or non-repeating. If we are at the end of
960 an assertion "group", stop matching and return MATCH_MATCH, but record the
961 current high water mark for use by positive assertions. Do this also
962 for the "once" (not-backup up) groups. */
963
964 case OP_KET:
965 case OP_KETRMIN:
966 case OP_KETRMAX:
967 {
968 prev = ecode - GET(ecode, 1);
969 saved_eptr = eptrb->epb_saved_eptr;
970
971 /* Back up the stack of bracket start pointers. */
972
973 eptrb = eptrb->epb_prev;
974
975 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
976 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
977 *prev == OP_ONCE)
978 {
979 md->end_match_ptr = eptr; /* For ONCE */
980 md->end_offset_top = offset_top;
981 RRETURN(MATCH_MATCH);
982 }
983
984 /* In all other cases except a conditional group we have to check the
985 group number back at the start and if necessary complete handling an
986 extraction by setting the offsets and bumping the high water mark. */
987
988 if (*prev != OP_COND)
989 {
990 number = *prev - OP_BRA;
991
992 /* For extended extraction brackets (large number), we have to fish out
993 the number from a dummy opcode at the start. */
994
995 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
996 offset = number << 1;
997
998 #ifdef DEBUG
999 printf("end bracket %d", number);
1000 printf("\n");
1001 #endif
1002
1003 /* Test for a numbered group. This includes groups called as a result
1004 of recursion. Note that whole-pattern recursion is coded as a recurse
1005 into group 0, so it won't be picked up here. Instead, we catch it when
1006 the OP_END is reached. */
1007
1008 if (number > 0)
1009 {
1010 md->capture_last = number;
1011 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1012 {
1013 md->offset_vector[offset] =
1014 md->offset_vector[md->offset_end - number];
1015 md->offset_vector[offset+1] = eptr - md->start_subject;
1016 if (offset_top <= offset) offset_top = offset + 2;
1017 }
1018
1019 /* Handle a recursively called group. Restore the offsets
1020 appropriately and continue from after the call. */
1021
1022 if (md->recursive != NULL && md->recursive->group_num == number)
1023 {
1024 recursion_info *rec = md->recursive;
1025 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1026 md->recursive = rec->prevrec;
1027 md->start_match = rec->save_start;
1028 memcpy(md->offset_vector, rec->offset_save,
1029 rec->saved_max * sizeof(int));
1030 ecode = rec->after_call;
1031 ims = original_ims;
1032 break;
1033 }
1034 }
1035 }
1036
1037 /* Reset the value of the ims flags, in case they got changed during
1038 the group. */
1039
1040 ims = original_ims;
1041 DPRINTF(("ims reset to %02lx\n", ims));
1042
1043 /* For a non-repeating ket, just continue at this level. This also
1044 happens for a repeating ket if no characters were matched in the group.
1045 This is the forcible breaking of infinite loops as implemented in Perl
1046 5.005. If there is an options reset, it will get obeyed in the normal
1047 course of events. */
1048
1049 if (*ecode == OP_KET || eptr == saved_eptr)
1050 {
1051 ecode += 1 + LINK_SIZE;
1052 break;
1053 }
1054
1055 /* The repeating kets try the rest of the pattern or restart from the
1056 preceding bracket, in the appropriate order. */
1057
1058 if (*ecode == OP_KETRMIN)
1059 {
1060 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1061 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1062 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1063 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1064 }
1065 else /* OP_KETRMAX */
1066 {
1067 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1068 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1069 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1070 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1071 }
1072 }
1073
1074 RRETURN(MATCH_NOMATCH);
1075
1076 /* Start of subject unless notbol, or after internal newline if multiline */
1077
1078 case OP_CIRC:
1079 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1080 if ((ims & PCRE_MULTILINE) != 0)
1081 {
1082 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
1083 RRETURN(MATCH_NOMATCH);
1084 ecode++;
1085 break;
1086 }
1087 /* ... else fall through */
1088
1089 /* Start of subject assertion */
1090
1091 case OP_SOD:
1092 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1093 ecode++;
1094 break;
1095
1096 /* Start of match assertion */
1097
1098 case OP_SOM:
1099 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1100 ecode++;
1101 break;
1102
1103 /* Assert before internal newline if multiline, or before a terminating
1104 newline unless endonly is set, else end of subject unless noteol is set. */
1105
1106 case OP_DOLL:
1107 if ((ims & PCRE_MULTILINE) != 0)
1108 {
1109 if (eptr < md->end_subject)
1110 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
1111 else
1112 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1113 ecode++;
1114 break;
1115 }
1116 else
1117 {
1118 if (md->noteol) RRETURN(MATCH_NOMATCH);
1119 if (!md->endonly)
1120 {
1121 if (eptr < md->end_subject - 1 ||
1122 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
1123 RRETURN(MATCH_NOMATCH);
1124 ecode++;
1125 break;
1126 }
1127 }
1128 /* ... else fall through */
1129
1130 /* End of subject assertion (\z) */
1131
1132 case OP_EOD:
1133 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1134 ecode++;
1135 break;
1136
1137 /* End of subject or ending \n assertion (\Z) */
1138
1139 case OP_EODN:
1140 if (eptr < md->end_subject - 1 ||
1141 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
1142 ecode++;
1143 break;
1144
1145 /* Word boundary assertions */
1146
1147 case OP_NOT_WORD_BOUNDARY:
1148 case OP_WORD_BOUNDARY:
1149 {
1150
1151 /* More streamlined when not in UTF-8 mode */
1152
1153 {
1154 prev_is_word = (eptr != md->start_subject) &&
1155 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1156 cur_is_word = (eptr < md->end_subject) &&
1157 ((md->ctypes[*eptr] & ctype_word) != 0);
1158 }
1159
1160 /* Now see if the situation is what we want */
1161
1162 if ((*ecode++ == OP_WORD_BOUNDARY)?
1163 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1164 RRETURN(MATCH_NOMATCH);
1165 }
1166 break;
1167
1168 /* Match a single character type; inline for speed */
1169
1170 case OP_ANY:
1171 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
1172 RRETURN(MATCH_NOMATCH);
1173 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1174 ecode++;
1175 break;
1176
1177 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1178 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1179
1180 case OP_ANYBYTE:
1181 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1182 ecode++;
1183 break;
1184
1185 case OP_NOT_DIGIT:
1186 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1187 GETCHARINCTEST(c, eptr);
1188 if ((md->ctypes[c] & ctype_digit) != 0)
1189 RRETURN(MATCH_NOMATCH);
1190 ecode++;
1191 break;
1192
1193 case OP_DIGIT:
1194 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1195 GETCHARINCTEST(c, eptr);
1196 if ((md->ctypes[c] & ctype_digit) == 0)
1197 RRETURN(MATCH_NOMATCH);
1198 ecode++;
1199 break;
1200
1201 case OP_NOT_WHITESPACE:
1202 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1203 GETCHARINCTEST(c, eptr);
1204 if ((md->ctypes[c] & ctype_space) != 0)
1205 RRETURN(MATCH_NOMATCH);
1206 ecode++;
1207 break;
1208
1209 case OP_WHITESPACE:
1210 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1211 GETCHARINCTEST(c, eptr);
1212 if ((md->ctypes[c] & ctype_space) == 0)
1213 RRETURN(MATCH_NOMATCH);
1214 ecode++;
1215 break;
1216
1217 case OP_NOT_WORDCHAR:
1218 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1219 GETCHARINCTEST(c, eptr);
1220 if ((md->ctypes[c] & ctype_word) != 0)
1221 RRETURN(MATCH_NOMATCH);
1222 ecode++;
1223 break;
1224
1225 case OP_WORDCHAR:
1226 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1227 GETCHARINCTEST(c, eptr);
1228 if ((md->ctypes[c] & ctype_word) == 0)
1229 RRETURN(MATCH_NOMATCH);
1230 ecode++;
1231 break;
1232
1233
1234 /* Match a back reference, possibly repeatedly. Look past the end of the
1235 item to see if there is repeat information following. The code is similar
1236 to that for character classes, but repeated for efficiency. Then obey
1237 similar code to character type repeats - written out again for speed.
1238 However, if the referenced string is the empty string, always treat
1239 it as matched, any number of times (otherwise there could be infinite
1240 loops). */
1241
1242 case OP_REF:
1243 {
1244 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1245 ecode += 3; /* Advance past item */
1246
1247 /* If the reference is unset, set the length to be longer than the amount
1248 of subject left; this ensures that every attempt at a match fails. We
1249 can't just fail here, because of the possibility of quantifiers with zero
1250 minima. */
1251
1252 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1253 md->end_subject - eptr + 1 :
1254 md->offset_vector[offset+1] - md->offset_vector[offset];
1255
1256 /* Set up for repetition, or handle the non-repeated case */
1257
1258 switch (*ecode)
1259 {
1260 case OP_CRSTAR:
1261 case OP_CRMINSTAR:
1262 case OP_CRPLUS:
1263 case OP_CRMINPLUS:
1264 case OP_CRQUERY:
1265 case OP_CRMINQUERY:
1266 c = *ecode++ - OP_CRSTAR;
1267 minimize = (c & 1) != 0;
1268 min = rep_min[c]; /* Pick up values from tables; */
1269 max = rep_max[c]; /* zero for max => infinity */
1270 if (max == 0) max = INT_MAX;
1271 break;
1272
1273 case OP_CRRANGE:
1274 case OP_CRMINRANGE:
1275 minimize = (*ecode == OP_CRMINRANGE);
1276 min = GET2(ecode, 1);
1277 max = GET2(ecode, 3);
1278 if (max == 0) max = INT_MAX;
1279 ecode += 5;
1280 break;
1281
1282 default: /* No repeat follows */
1283 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1284 eptr += length;
1285 continue; /* With the main loop */
1286 }
1287
1288 /* If the length of the reference is zero, just continue with the
1289 main loop. */
1290
1291 if (length == 0) continue;
1292
1293 /* First, ensure the minimum number of matches are present. We get back
1294 the length of the reference string explicitly rather than passing the
1295 address of eptr, so that eptr can be a register variable. */
1296
1297 for (i = 1; i <= min; i++)
1298 {
1299 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1300 eptr += length;
1301 }
1302
1303 /* If min = max, continue at the same level without recursion.
1304 They are not both allowed to be zero. */
1305
1306 if (min == max) continue;
1307
1308 /* If minimizing, keep trying and advancing the pointer */
1309
1310 if (minimize)
1311 {
1312 for (fi = min;; fi++)
1313 {
1314 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1315 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1316 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1317 RRETURN(MATCH_NOMATCH);
1318 eptr += length;
1319 }
1320 /* Control never gets here */
1321 }
1322
1323 /* If maximizing, find the longest string and work backwards */
1324
1325 else
1326 {
1327 pp = eptr;
1328 for (i = min; i < max; i++)
1329 {
1330 if (!match_ref(offset, eptr, length, md, ims)) break;
1331 eptr += length;
1332 }
1333 while (eptr >= pp)
1334 {
1335 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1336 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1337 eptr -= length;
1338 }
1339 RRETURN(MATCH_NOMATCH);
1340 }
1341 }
1342 /* Control never gets here */
1343
1344
1345
1346 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1347 used when all the characters in the class have values in the range 0-255,
1348 and either the matching is caseful, or the characters are in the range
1349 0-127 when UTF-8 processing is enabled. The only difference between
1350 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1351 encountered.
1352
1353 First, look past the end of the item to see if there is repeat information
1354 following. Then obey similar code to character type repeats - written out
1355 again for speed. */
1356
1357 case OP_NCLASS:
1358 case OP_CLASS:
1359 {
1360 data = ecode + 1; /* Save for matching */
1361 ecode += 33; /* Advance past the item */
1362
1363 switch (*ecode)
1364 {
1365 case OP_CRSTAR:
1366 case OP_CRMINSTAR:
1367 case OP_CRPLUS:
1368 case OP_CRMINPLUS:
1369 case OP_CRQUERY:
1370 case OP_CRMINQUERY:
1371 c = *ecode++ - OP_CRSTAR;
1372 minimize = (c & 1) != 0;
1373 min = rep_min[c]; /* Pick up values from tables; */
1374 max = rep_max[c]; /* zero for max => infinity */
1375 if (max == 0) max = INT_MAX;
1376 break;
1377
1378 case OP_CRRANGE:
1379 case OP_CRMINRANGE:
1380 minimize = (*ecode == OP_CRMINRANGE);
1381 min = GET2(ecode, 1);
1382 max = GET2(ecode, 3);
1383 if (max == 0) max = INT_MAX;
1384 ecode += 5;
1385 break;
1386
1387 default: /* No repeat follows */
1388 min = max = 1;
1389 break;
1390 }
1391
1392 /* First, ensure the minimum number of matches are present. */
1393
1394 /* Not UTF-8 mode */
1395 {
1396 for (i = 1; i <= min; i++)
1397 {
1398 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1399 c = *eptr++;
1400 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1401 }
1402 }
1403
1404 /* If max == min we can continue with the main loop without the
1405 need to recurse. */
1406
1407 if (min == max) continue;
1408
1409 /* If minimizing, keep testing the rest of the expression and advancing
1410 the pointer while it matches the class. */
1411
1412 if (minimize)
1413 {
1414 /* Not UTF-8 mode */
1415 {
1416 for (fi = min;; fi++)
1417 {
1418 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1419 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1420 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1421 c = *eptr++;
1422 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1423 }
1424 }
1425 /* Control never gets here */
1426 }
1427
1428 /* If maximizing, find the longest possible run, then work backwards. */
1429
1430 else
1431 {
1432 pp = eptr;
1433
1434 /* Not UTF-8 mode */
1435 {
1436 for (i = min; i < max; i++)
1437 {
1438 if (eptr >= md->end_subject) break;
1439 c = *eptr;
1440 if ((data[c/8] & (1 << (c&7))) == 0) break;
1441 eptr++;
1442 }
1443 while (eptr >= pp)
1444 {
1445 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1446 eptr--;
1447 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1448 }
1449 }
1450
1451 RRETURN(MATCH_NOMATCH);
1452 }
1453 }
1454 /* Control never gets here */
1455
1456
1457 /* Match a single character, casefully */
1458
1459 case OP_CHAR:
1460 /* Non-UTF-8 mode */
1461 {
1462 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1463 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1464 ecode += 2;
1465 }
1466 break;
1467
1468 /* Match a single character, caselessly */
1469
1470 case OP_CHARNC:
1471 /* Non-UTF-8 mode */
1472 {
1473 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1474 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1475 ecode += 2;
1476 }
1477 break;
1478
1479 /* Match a single character repeatedly; different opcodes share code. */
1480
1481 case OP_EXACT:
1482 min = max = GET2(ecode, 1);
1483 ecode += 3;
1484 goto REPEATCHAR;
1485
1486 case OP_UPTO:
1487 case OP_MINUPTO:
1488 min = 0;
1489 max = GET2(ecode, 1);
1490 minimize = *ecode == OP_MINUPTO;
1491 ecode += 3;
1492 goto REPEATCHAR;
1493
1494 case OP_STAR:
1495 case OP_MINSTAR:
1496 case OP_PLUS:
1497 case OP_MINPLUS:
1498 case OP_QUERY:
1499 case OP_MINQUERY:
1500 c = *ecode++ - OP_STAR;
1501 minimize = (c & 1) != 0;
1502 min = rep_min[c]; /* Pick up values from tables; */
1503 max = rep_max[c]; /* zero for max => infinity */
1504 if (max == 0) max = INT_MAX;
1505
1506 /* Common code for all repeated single-character matches. We can give
1507 up quickly if there are fewer than the minimum number of characters left in
1508 the subject. */
1509
1510 REPEATCHAR:
1511 /* When not in UTF-8 mode, load a single-byte character. */
1512 {
1513 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1514 fc = *ecode++;
1515 }
1516
1517 /* The value of fc at this point is always less than 256, though we may or
1518 may not be in UTF-8 mode. The code is duplicated for the caseless and
1519 caseful cases, for speed, since matching characters is likely to be quite
1520 common. First, ensure the minimum number of matches are present. If min =
1521 max, continue at the same level without recursing. Otherwise, if
1522 minimizing, keep trying the rest of the expression and advancing one
1523 matching character if failing, up to the maximum. Alternatively, if
1524 maximizing, find the maximum number of characters and work backwards. */
1525
1526 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
1527 max, eptr));
1528
1529 if ((ims & PCRE_CASELESS) != 0)
1530 {
1531 fc = md->lcc[fc];
1532 for (i = 1; i <= min; i++)
1533 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1534 if (min == max) continue;
1535 if (minimize)
1536 {
1537 for (fi = min;; fi++)
1538 {
1539 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1540 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1541 if (fi >= max || eptr >= md->end_subject ||
1542 fc != md->lcc[*eptr++])
1543 RRETURN(MATCH_NOMATCH);
1544 }
1545 /* Control never gets here */
1546 }
1547 else
1548 {
1549 pp = eptr;
1550 for (i = min; i < max; i++)
1551 {
1552 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
1553 eptr++;
1554 }
1555 while (eptr >= pp)
1556 {
1557 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1558 eptr--;
1559 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1560 }
1561 RRETURN(MATCH_NOMATCH);
1562 }
1563 /* Control never gets here */
1564 }
1565
1566 /* Caseful comparisons (includes all multi-byte characters) */
1567
1568 else
1569 {
1570 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
1571 if (min == max) continue;
1572 if (minimize)
1573 {
1574 for (fi = min;; fi++)
1575 {
1576 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1577 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1578 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
1579 RRETURN(MATCH_NOMATCH);
1580 }
1581 /* Control never gets here */
1582 }
1583 else
1584 {
1585 pp = eptr;
1586 for (i = min; i < max; i++)
1587 {
1588 if (eptr >= md->end_subject || fc != *eptr) break;
1589 eptr++;
1590 }
1591 while (eptr >= pp)
1592 {
1593 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1594 eptr--;
1595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1596 }
1597 RRETURN(MATCH_NOMATCH);
1598 }
1599 }
1600 /* Control never gets here */
1601
1602 /* Match a negated single one-byte character. The character we are
1603 checking can be multibyte. */
1604
1605 case OP_NOT:
1606 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1607 ecode++;
1608 GETCHARINCTEST(c, eptr);
1609 if ((ims & PCRE_CASELESS) != 0)
1610 {
1611 c = md->lcc[c];
1612 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
1613 }
1614 else
1615 {
1616 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
1617 }
1618 break;
1619
1620 /* Match a negated single one-byte character repeatedly. This is almost a
1621 repeat of the code for a repeated single character, but I haven't found a
1622 nice way of commoning these up that doesn't require a test of the
1623 positive/negative option for each character match. Maybe that wouldn't add
1624 very much to the time taken, but character matching *is* what this is all
1625 about... */
1626
1627 case OP_NOTEXACT:
1628 min = max = GET2(ecode, 1);
1629 ecode += 3;
1630 goto REPEATNOTCHAR;
1631
1632 case OP_NOTUPTO:
1633 case OP_NOTMINUPTO:
1634 min = 0;
1635 max = GET2(ecode, 1);
1636 minimize = *ecode == OP_NOTMINUPTO;
1637 ecode += 3;
1638 goto REPEATNOTCHAR;
1639
1640 case OP_NOTSTAR:
1641 case OP_NOTMINSTAR:
1642 case OP_NOTPLUS:
1643 case OP_NOTMINPLUS:
1644 case OP_NOTQUERY:
1645 case OP_NOTMINQUERY:
1646 c = *ecode++ - OP_NOTSTAR;
1647 minimize = (c & 1) != 0;
1648 min = rep_min[c]; /* Pick up values from tables; */
1649 max = rep_max[c]; /* zero for max => infinity */
1650 if (max == 0) max = INT_MAX;
1651
1652 /* Common code for all repeated single-byte matches. We can give up quickly
1653 if there are fewer than the minimum number of bytes left in the
1654 subject. */
1655
1656 REPEATNOTCHAR:
1657 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1658 fc = *ecode++;
1659
1660 /* The code is duplicated for the caseless and caseful cases, for speed,
1661 since matching characters is likely to be quite common. First, ensure the
1662 minimum number of matches are present. If min = max, continue at the same
1663 level without recursing. Otherwise, if minimizing, keep trying the rest of
1664 the expression and advancing one matching character if failing, up to the
1665 maximum. Alternatively, if maximizing, find the maximum number of
1666 characters and work backwards. */
1667
1668 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
1669 max, eptr));
1670
1671 if ((ims & PCRE_CASELESS) != 0)
1672 {
1673 fc = md->lcc[fc];
1674
1675 /* Not UTF-8 mode */
1676 {
1677 for (i = 1; i <= min; i++)
1678 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1679 }
1680
1681 if (min == max) continue;
1682
1683 if (minimize)
1684 {
1685 /* Not UTF-8 mode */
1686 {
1687 for (fi = min;; fi++)
1688 {
1689 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1690 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1691 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
1692 RRETURN(MATCH_NOMATCH);
1693 }
1694 }
1695 /* Control never gets here */
1696 }
1697
1698 /* Maximize case */
1699
1700 else
1701 {
1702 pp = eptr;
1703
1704 /* Not UTF-8 mode */
1705 {
1706 for (i = min; i < max; i++)
1707 {
1708 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
1709 eptr++;
1710 }
1711 while (eptr >= pp)
1712 {
1713 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1714 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1715 eptr--;
1716 }
1717 }
1718
1719 RRETURN(MATCH_NOMATCH);
1720 }
1721 /* Control never gets here */
1722 }
1723
1724 /* Caseful comparisons */
1725
1726 else
1727 {
1728 /* Not UTF-8 mode */
1729 {
1730 for (i = 1; i <= min; i++)
1731 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
1732 }
1733
1734 if (min == max) continue;
1735
1736 if (minimize)
1737 {
1738 /* Not UTF-8 mode */
1739 {
1740 for (fi = min;; fi++)
1741 {
1742 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1743 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1744 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
1745 RRETURN(MATCH_NOMATCH);
1746 }
1747 }
1748 /* Control never gets here */
1749 }
1750
1751 /* Maximize case */
1752
1753 else
1754 {
1755 pp = eptr;
1756
1757 /* Not UTF-8 mode */
1758 {
1759 for (i = min; i < max; i++)
1760 {
1761 if (eptr >= md->end_subject || fc == *eptr) break;
1762 eptr++;
1763 }
1764 while (eptr >= pp)
1765 {
1766 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1767 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1768 eptr--;
1769 }
1770 }
1771
1772 RRETURN(MATCH_NOMATCH);
1773 }
1774 }
1775 /* Control never gets here */
1776
1777 /* Match a single character type repeatedly; several different opcodes
1778 share code. This is very similar to the code for single characters, but we
1779 repeat it in the interests of efficiency. */
1780
1781 case OP_TYPEEXACT:
1782 min = max = GET2(ecode, 1);
1783 minimize = TRUE;
1784 ecode += 3;
1785 goto REPEATTYPE;
1786
1787 case OP_TYPEUPTO:
1788 case OP_TYPEMINUPTO:
1789 min = 0;
1790 max = GET2(ecode, 1);
1791 minimize = *ecode == OP_TYPEMINUPTO;
1792 ecode += 3;
1793 goto REPEATTYPE;
1794
1795 case OP_TYPESTAR:
1796 case OP_TYPEMINSTAR:
1797 case OP_TYPEPLUS:
1798 case OP_TYPEMINPLUS:
1799 case OP_TYPEQUERY:
1800 case OP_TYPEMINQUERY:
1801 c = *ecode++ - OP_TYPESTAR;
1802 minimize = (c & 1) != 0;
1803 min = rep_min[c]; /* Pick up values from tables; */
1804 max = rep_max[c]; /* zero for max => infinity */
1805 if (max == 0) max = INT_MAX;
1806
1807 /* Common code for all repeated single character type matches. Note that
1808 in UTF-8 mode, '.' matches a character of any length, but for the other
1809 character types, the valid characters are all one-byte long. */
1810
1811 REPEATTYPE:
1812 ctype = *ecode++; /* Code for the character type */
1813
1814 /* First, ensure the minimum number of matches are present. Use inline
1815 code for maximizing the speed, and do the type test once at the start
1816 (i.e. keep it out of the loop). Also we can test that there are at least
1817 the minimum number of bytes before we start. This isn't as effective in
1818 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
1819 is tidier. Also separate the UCP code, which can be the same for both UTF-8
1820 and single-bytes. */
1821
1822 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1823 if (min > 0)
1824 {
1825 /* Code for the non-UTF-8 case for minimum matching of operators other
1826 than OP_PROP and OP_NOTPROP. */
1827
1828 switch(ctype)
1829 {
1830 case OP_ANY:
1831 if ((ims & PCRE_DOTALL) == 0)
1832 {
1833 for (i = 1; i <= min; i++)
1834 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
1835 }
1836 else eptr += min;
1837 break;
1838
1839 case OP_ANYBYTE:
1840 eptr += min;
1841 break;
1842
1843 case OP_NOT_DIGIT:
1844 for (i = 1; i <= min; i++)
1845 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
1846 break;
1847
1848 case OP_DIGIT:
1849 for (i = 1; i <= min; i++)
1850 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
1851 break;
1852
1853 case OP_NOT_WHITESPACE:
1854 for (i = 1; i <= min; i++)
1855 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
1856 break;
1857
1858 case OP_WHITESPACE:
1859 for (i = 1; i <= min; i++)
1860 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
1861 break;
1862
1863 case OP_NOT_WORDCHAR:
1864 for (i = 1; i <= min; i++)
1865 if ((md->ctypes[*eptr++] & ctype_word) != 0)
1866 RRETURN(MATCH_NOMATCH);
1867 break;
1868
1869 case OP_WORDCHAR:
1870 for (i = 1; i <= min; i++)
1871 if ((md->ctypes[*eptr++] & ctype_word) == 0)
1872 RRETURN(MATCH_NOMATCH);
1873 break;
1874
1875 default:
1876 RRETURN(PCRE_ERROR_INTERNAL);
1877 }
1878 }
1879
1880 /* If min = max, continue at the same level without recursing */
1881
1882 if (min == max) continue;
1883
1884 /* If minimizing, we have to test the rest of the pattern before each
1885 subsequent match. Again, separate the UTF-8 case for speed, and also
1886 separate the UCP cases. */
1887
1888 if (minimize)
1889 {
1890 /* Not UTF-8 mode */
1891 {
1892 for (fi = min;; fi++)
1893 {
1894 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1896 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1897 c = *eptr++;
1898 switch(ctype)
1899 {
1900 case OP_ANY:
1901 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
1902 break;
1903
1904 case OP_ANYBYTE:
1905 break;
1906
1907 case OP_NOT_DIGIT:
1908 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
1909 break;
1910
1911 case OP_DIGIT:
1912 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
1913 break;
1914
1915 case OP_NOT_WHITESPACE:
1916 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
1917 break;
1918
1919 case OP_WHITESPACE:
1920 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
1921 break;
1922
1923 case OP_NOT_WORDCHAR:
1924 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
1925 break;
1926
1927 case OP_WORDCHAR:
1928 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
1929 break;
1930
1931 default:
1932 RRETURN(PCRE_ERROR_INTERNAL);
1933 }
1934 }
1935 }
1936 /* Control never gets here */
1937 }
1938
1939 /* If maximizing it is worth using inline code for speed, doing the type
1940 test once at the start (i.e. keep it out of the loop). Again, keep the
1941 UTF-8 and UCP stuff separate. */
1942
1943 else
1944 {
1945 pp = eptr; /* Remember where we started */
1946
1947 /* Not UTF-8 mode */
1948 {
1949 switch(ctype)
1950 {
1951 case OP_ANY:
1952 if ((ims & PCRE_DOTALL) == 0)
1953 {
1954 for (i = min; i < max; i++)
1955 {
1956 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
1957 eptr++;
1958 }
1959 break;
1960 }
1961 /* For DOTALL case, fall through and treat as \C */
1962
1963 case OP_ANYBYTE:
1964 c = max - min;
1965 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
1966 eptr += c;
1967 break;
1968
1969 case OP_NOT_DIGIT:
1970 for (i = min; i < max; i++)
1971 {
1972 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
1973 break;
1974 eptr++;
1975 }
1976 break;
1977
1978 case OP_DIGIT:
1979 for (i = min; i < max; i++)
1980 {
1981 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
1982 break;
1983 eptr++;
1984 }
1985 break;
1986
1987 case OP_NOT_WHITESPACE:
1988 for (i = min; i < max; i++)
1989 {
1990 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
1991 break;
1992 eptr++;
1993 }
1994 break;
1995
1996 case OP_WHITESPACE:
1997 for (i = min; i < max; i++)
1998 {
1999 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
2000 break;
2001 eptr++;
2002 }
2003 break;
2004
2005 case OP_NOT_WORDCHAR:
2006 for (i = min; i < max; i++)
2007 {
2008 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
2009 break;
2010 eptr++;
2011 }
2012 break;
2013
2014 case OP_WORDCHAR:
2015 for (i = min; i < max; i++)
2016 {
2017 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
2018 break;
2019 eptr++;
2020 }
2021 break;
2022
2023 default:
2024 RRETURN(PCRE_ERROR_INTERNAL);
2025 }
2026
2027 /* eptr is now past the end of the maximum run */
2028
2029 while (eptr >= pp)
2030 {
2031 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2032 eptr--;
2033 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2034 }
2035 }
2036
2037 /* Get here if we can't make it match with any permitted repetitions */
2038
2039 RRETURN(MATCH_NOMATCH);
2040 }
2041 /* Control never gets here */
2042
2043 /* There's been some horrible disaster. Since all codes > OP_BRA are
2044 for capturing brackets, and there shouldn't be any gaps between 0 and
2045 OP_BRA, arrival here can only mean there is something seriously wrong
2046 in the code above or the OP_xxx definitions. */
2047
2048 default:
2049 DPRINTF(("Unknown opcode %d\n", *ecode));
2050 RRETURN(PCRE_ERROR_UNKNOWN_NODE);
2051 }
2052
2053 /* Do not stick any code in here without much thought; it is assumed
2054 that "continue" in the code above comes out to here to repeat the main
2055 loop. */
2056
2057 } /* End of main loop */
2058 /* Control never reaches here */
2059 }
2060
2061
2062 /***************************************************************************
2063 ****************************************************************************
2064 RECURSION IN THE match() FUNCTION
2065
2066 Undefine all the macros that were defined above to handle this. */
2067
2068 #ifdef NO_RECURSE
2069 #undef eptr
2070 #undef ecode
2071 #undef offset_top
2072 #undef ims
2073 #undef eptrb
2074 #undef flags
2075
2076 #undef callpat
2077 #undef charptr
2078 #undef data
2079 #undef next
2080 #undef pp
2081 #undef prev
2082 #undef saved_eptr
2083
2084 #undef new_recursive
2085
2086 #undef cur_is_word
2087 #undef condition
2088 #undef minimize
2089 #undef prev_is_word
2090
2091 #undef original_ims
2092
2093 #undef ctype
2094 #undef length
2095 #undef max
2096 #undef min
2097 #undef number
2098 #undef offset
2099 #undef op
2100 #undef save_capture_last
2101 #undef save_offset1
2102 #undef save_offset2
2103 #undef save_offset3
2104 #undef stacksave
2105
2106 #undef newptrb
2107
2108 #endif
2109
2110 /* These two are defined as macros in both cases */
2111
2112 #undef fc
2113 #undef fi
2114
2115 /***************************************************************************
2116 ***************************************************************************/
2117
2118
2119
2120 /*************************************************
2121 * Execute a Regular Expression *
2122 *************************************************/
2123
2124 /* This function applies a compiled re to a subject string and picks out
2125 portions of the string if it matches. Two elements in the vector are set for
2126 each substring: the offsets to the start and end of the substring.
2127
2128 Arguments:
2129 argument_re points to the compiled expression
2130 extra_data points to extra data or is NULL
2131 subject points to the subject string
2132 length length of subject string (may contain binary zeros)
2133 start_offset where to start in the subject string
2134 options option bits
2135 offsets points to a vector of ints to be filled in with offsets
2136 offsetcount the number of elements in the vector
2137
2138 Returns: > 0 => success; value is the number of elements filled in
2139 = 0 => success, but offsets is not big enough
2140 -1 => failed to match
2141 < -1 => some kind of unexpected problem
2142 */
2143
2144 EXPORT int
2145 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
2146 const char *subject, int length, int start_offset, int options, int *offsets,
2147 int offsetcount)
2148 {
2149 int rc, resetcount, ocount;
2150 int first_byte = -1;
2151 int req_byte = -1;
2152 int req_byte2 = -1;
2153 unsigned long int ims = 0;
2154 BOOL using_temporary_offsets = FALSE;
2155 BOOL anchored;
2156 BOOL startline;
2157 BOOL firstline;
2158 BOOL first_byte_caseless = FALSE;
2159 BOOL req_byte_caseless = FALSE;
2160 match_data match_block;
2161 const uschar *tables;
2162 const uschar *start_bits = NULL;
2163 const uschar *start_match = (const uschar *)subject + start_offset;
2164 const uschar *end_subject;
2165 const uschar *req_byte_ptr = start_match - 1;
2166
2167 pcre_study_data internal_study;
2168 const pcre_study_data *study;
2169
2170 real_pcre internal_re;
2171 const real_pcre *external_re = (const real_pcre *)argument_re;
2172 const real_pcre *re = external_re;
2173
2174 /* Plausibility checks */
2175
2176 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2177 if (re == NULL || subject == NULL ||
2178 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2179 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2180
2181 /* Fish out the optional data from the extra_data structure, first setting
2182 the default values. */
2183
2184 study = NULL;
2185 match_block.match_limit = MATCH_LIMIT;
2186 match_block.callout_data = NULL;
2187
2188 /* The table pointer is always in native byte order. */
2189
2190 tables = external_re->tables;
2191
2192 if (extra_data != NULL)
2193 {
2194 register unsigned int flags = extra_data->flags;
2195 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2196 study = (const pcre_study_data *)extra_data->study_data;
2197 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
2198 match_block.match_limit = extra_data->match_limit;
2199 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2200 match_block.callout_data = extra_data->callout_data;
2201 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
2202 }
2203
2204 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2205 is a feature that makes it possible to save compiled regex and re-use them
2206 in other programs later. */
2207
2208 if (tables == NULL) tables = _pcre_default_tables;
2209
2210 /* Check that the first field in the block is the magic number. If it is not,
2211 test for a regex that was compiled on a host of opposite endianness. If this is
2212 the case, flipped values are put in internal_re and internal_study if there was
2213 study data too. */
2214
2215 if (re->magic_number != MAGIC_NUMBER)
2216 {
2217 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2218 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2219 if (study != NULL) study = &internal_study;
2220 }
2221
2222 /* Set up other data */
2223
2224 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
2225 startline = (re->options & PCRE_STARTLINE) != 0;
2226 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2227
2228 /* The code starts after the real_pcre block and the capture name table. */
2229
2230 match_block.start_code = (const uschar *)external_re + re->name_table_offset +
2231 re->name_count * re->name_entry_size;
2232
2233 match_block.start_subject = (const uschar *)subject;
2234 match_block.start_offset = start_offset;
2235 match_block.end_subject = match_block.start_subject + length;
2236 end_subject = match_block.end_subject;
2237
2238 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
2239 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
2240
2241 match_block.notbol = (options & PCRE_NOTBOL) != 0;
2242 match_block.noteol = (options & PCRE_NOTEOL) != 0;
2243 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
2244 match_block.partial = (options & PCRE_PARTIAL) != 0;
2245 match_block.hitend = FALSE;
2246
2247 match_block.recursive = NULL; /* No recursion at top level */
2248
2249 match_block.lcc = tables + lcc_offset;
2250 match_block.ctypes = tables + ctypes_offset;
2251
2252 /* Partial matching is supported only for a restricted set of regexes at the
2253 moment. */
2254
2255 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
2256 return PCRE_ERROR_BADPARTIAL;
2257
2258 /* The ims options can vary during the matching as a result of the presence
2259 of (?ims) items in the pattern. They are kept in a local variable so that
2260 restoring at the exit of a group is easy. */
2261
2262 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
2263
2264 /* If the expression has got more back references than the offsets supplied can
2265 hold, we get a temporary chunk of working store to use during the matching.
2266 Otherwise, we can use the vector supplied, rounding down its size to a multiple
2267 of 3. */
2268
2269 ocount = offsetcount - (offsetcount % 3);
2270
2271 if (re->top_backref > 0 && re->top_backref >= ocount/3)
2272 {
2273 ocount = re->top_backref * 3 + 3;
2274 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
2275 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
2276 using_temporary_offsets = TRUE;
2277 DPRINTF(("Got memory to hold back references\n"));
2278 }
2279 else match_block.offset_vector = offsets;
2280
2281 match_block.offset_end = ocount;
2282 match_block.offset_max = (2*ocount)/3;
2283 match_block.offset_overflow = FALSE;
2284 match_block.capture_last = -1;
2285
2286 /* Compute the minimum number of offsets that we need to reset each time. Doing
2287 this makes a huge difference to execution time when there aren't many brackets
2288 in the pattern. */
2289
2290 resetcount = 2 + re->top_bracket * 2;
2291 if (resetcount > offsetcount) resetcount = ocount;
2292
2293 /* Reset the working variable associated with each extraction. These should
2294 never be used unless previously set, but they get saved and restored, and so we
2295 initialize them to avoid reading uninitialized locations. */
2296
2297 if (match_block.offset_vector != NULL)
2298 {
2299 register int *iptr = match_block.offset_vector + ocount;
2300 register int *iend = iptr - resetcount/2 + 1;
2301 while (--iptr >= iend) *iptr = -1;
2302 }
2303
2304 /* Set up the first character to match, if available. The first_byte value is
2305 never set for an anchored regular expression, but the anchoring may be forced
2306 at run time, so we have to test for anchoring. The first char may be unset for
2307 an unanchored pattern, of course. If there's no first char and the pattern was
2308 studied, there may be a bitmap of possible first characters. */
2309
2310 if (!anchored)
2311 {
2312 if ((re->options & PCRE_FIRSTSET) != 0)
2313 {
2314 first_byte = re->first_byte & 255;
2315 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2316 first_byte = match_block.lcc[first_byte];
2317 }
2318 else
2319 if (!startline && study != NULL &&
2320 (study->options & PCRE_STUDY_MAPPED) != 0)
2321 start_bits = study->start_bits;
2322 }
2323
2324 /* For anchored or unanchored matches, there may be a "last known required
2325 character" set. */
2326
2327 if ((re->options & PCRE_REQCHSET) != 0)
2328 {
2329 req_byte = re->req_byte & 255;
2330 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2331 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
2332 }
2333
2334 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
2335 the loop runs just once. */
2336
2337 do
2338 {
2339 const uschar *save_end_subject = end_subject;
2340
2341 /* Reset the maximum number of extractions we might see. */
2342
2343 if (match_block.offset_vector != NULL)
2344 {
2345 register int *iptr = match_block.offset_vector;
2346 register int *iend = iptr + resetcount;
2347 while (iptr < iend) *iptr++ = -1;
2348 }
2349
2350 /* Advance to a unique first char if possible. If firstline is TRUE, the
2351 start of the match is constrained to the first line of a multiline string.
2352 Implement this by temporarily adjusting end_subject so that we stop scanning
2353 at a newline. If the match fails at the newline, later code breaks this loop.
2354 */
2355
2356 if (firstline)
2357 {
2358 const uschar *t = start_match;
2359 while (t < save_end_subject && *t != '\n') t++;
2360 end_subject = t;
2361 }
2362
2363 /* Now test for a unique first byte */
2364
2365 if (first_byte >= 0)
2366 {
2367 if (first_byte_caseless)
2368 while (start_match < end_subject &&
2369 match_block.lcc[*start_match] != first_byte)
2370 start_match++;
2371 else
2372 while (start_match < end_subject && *start_match != first_byte)
2373 start_match++;
2374 }
2375
2376 /* Or to just after \n for a multiline match if possible */
2377
2378 else if (startline)
2379 {
2380 if (start_match > match_block.start_subject + start_offset)
2381 {
2382 while (start_match < end_subject && start_match[-1] != NEWLINE)
2383 start_match++;
2384 }
2385 }
2386
2387 /* Or to a non-unique first char after study */
2388
2389 else if (start_bits != NULL)
2390 {
2391 while (start_match < end_subject)
2392 {
2393 register unsigned int c = *start_match;
2394 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
2395 }
2396 }
2397
2398 /* Restore fudged end_subject */
2399
2400 end_subject = save_end_subject;
2401
2402 #ifdef DEBUG /* Sigh. Some compilers never learn. */
2403 printf(">>>> Match against: ");
2404 pchars(start_match, end_subject - start_match, TRUE, &match_block);
2405 printf("\n");
2406 #endif
2407
2408 /* If req_byte is set, we know that that character must appear in the subject
2409 for the match to succeed. If the first character is set, req_byte must be
2410 later in the subject; otherwise the test starts at the match point. This
2411 optimization can save a huge amount of backtracking in patterns with nested
2412 unlimited repeats that aren't going to match. Writing separate code for
2413 cased/caseless versions makes it go faster, as does using an autoincrement
2414 and backing off on a match.
2415
2416 HOWEVER: when the subject string is very, very long, searching to its end can
2417 take a long time, and give bad performance on quite ordinary patterns. This
2418 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2419 don't do this when the string is sufficiently long.
2420
2421 ALSO: this processing is disabled when partial matching is requested.
2422 */
2423
2424 if (req_byte >= 0 &&
2425 end_subject - start_match < REQ_BYTE_MAX &&
2426 !match_block.partial)
2427 {
2428 register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
2429
2430 /* We don't need to repeat the search if we haven't yet reached the
2431 place we found it at last time. */
2432
2433 if (p > req_byte_ptr)
2434 {
2435 if (req_byte_caseless)
2436 {
2437 while (p < end_subject)
2438 {
2439 register int pp = *p++;
2440 if (pp == req_byte || pp == req_byte2) { p--; break; }
2441 }
2442 }
2443 else
2444 {
2445 while (p < end_subject)
2446 {
2447 if (*p++ == req_byte) { p--; break; }
2448 }
2449 }
2450
2451 /* If we can't find the required character, break the matching loop */
2452
2453 if (p >= end_subject) break;
2454
2455 /* If we have found the required character, save the point where we
2456 found it, so that we don't search again next time round the loop if
2457 the start hasn't passed this character yet. */
2458
2459 req_byte_ptr = p;
2460 }
2461 }
2462
2463 /* When a match occurs, substrings will be set for all internal extractions;
2464 we just need to set up the whole thing as substring 0 before returning. If
2465 there were too many extractions, set the return code to zero. In the case
2466 where we had to get some local store to hold offsets for backreferences, copy
2467 those back references that we can. In this case there need not be overflow
2468 if certain parts of the pattern were not used. */
2469
2470 match_block.start_match = start_match;
2471 match_block.match_call_count = 0;
2472
2473 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
2474 match_isgroup);
2475
2476 /* When the result is no match, if the subject's first character was a
2477 newline and the PCRE_FIRSTLINE option is set, break (which will return
2478 PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
2479 newline in the subject. Otherwise, advance the pointer to the next character
2480 and continue - but the continuation will actually happen only when the
2481 pattern is not anchored. */
2482
2483 if (rc == MATCH_NOMATCH)
2484 {
2485 if (firstline && *start_match == NEWLINE) break;
2486 start_match++;
2487 continue;
2488 }
2489
2490 if (rc != MATCH_MATCH)
2491 {
2492 DPRINTF((">>>> error: returning %d\n", rc));
2493 return rc;
2494 }
2495
2496 /* We have a match! Copy the offset information from temporary store if
2497 necessary */
2498
2499 if (using_temporary_offsets)
2500 {
2501 if (offsetcount >= 4)
2502 {
2503 memcpy(offsets + 2, match_block.offset_vector + 2,
2504 (offsetcount - 2) * sizeof(int));
2505 DPRINTF(("Copied offsets from temporary memory\n"));
2506 }
2507 if (match_block.end_offset_top > offsetcount)
2508 match_block.offset_overflow = TRUE;
2509
2510 DPRINTF(("Freeing temporary memory\n"));
2511 (pcre_free)(match_block.offset_vector);
2512 }
2513
2514 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
2515
2516 if (offsetcount < 2) rc = 0; else
2517 {
2518 offsets[0] = start_match - match_block.start_subject;
2519 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
2520 }
2521
2522 DPRINTF((">>>> returning %d\n", rc));
2523 return rc;
2524 }
2525
2526 /* This "while" is the end of the "do" above */
2527
2528 while (!anchored && start_match <= end_subject);
2529
2530 if (using_temporary_offsets)
2531 {
2532 DPRINTF(("Freeing temporary memory\n"));
2533 (pcre_free)(match_block.offset_vector);
2534 }
2535
2536 if (match_block.partial && match_block.hitend)
2537 {
2538 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
2539 return PCRE_ERROR_PARTIAL;
2540 }
2541 else
2542 {
2543 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
2544 return PCRE_ERROR_NOMATCH;
2545 }
2546 }
2547
2548 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords Revision