blob: 122f447640ffb528767a89d8254caf950ddf553d [file] [log] [blame]
paul718e3742002-12-13 20:15:29 +00001/* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P1003.2/D11.2, except for some of the
4 internationalization features.)
5 Copyright (C) 1993, 94, 95, 96, 97, 98, 99 Free Software Foundation, Inc.
6
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Library General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
11
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
16
17 You should have received a copy of the GNU Library General Public
18 License along with the GNU C Library; see the file COPYING.LIB. If not,
19 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22/* AIX requires this to be the first thing in the file. */
23#if defined _AIX && !defined REGEX_MALLOC
24 #pragma alloca
25#endif
26
27#undef _GNU_SOURCE
28#define _GNU_SOURCE
29
30#ifdef HAVE_CONFIG_H
31# include <config.h>
32#endif
vincenteac314c2006-01-17 23:39:04 +000033#ifdef _WIN32
34/* Windows does not provide unistd.h, which is required for abort() */
35#include <process.h>
36#endif /* _WIN32 */
paul718e3742002-12-13 20:15:29 +000037
38#ifndef PARAMS
39# if defined __GNUC__ || (defined __STDC__ && __STDC__)
40# define PARAMS(args) args
41# else
42# define PARAMS(args) ()
43# endif /* GCC. */
44#endif /* Not PARAMS. */
45
46#if defined STDC_HEADERS && !defined emacs
47# include <stddef.h>
48#else
49/* We need this for `regex.h', and perhaps for the Emacs include files. */
50# include <sys/types.h>
51#endif
52
53#define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
54
55/* For platform which support the ISO C amendement 1 functionality we
56 support user defined character classes. */
57#if defined _LIBC || WIDE_CHAR_SUPPORT
58/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
59# include <wchar.h>
60# include <wctype.h>
61#endif
62
63#ifdef _LIBC
64/* We have to keep the namespace clean. */
65# define regfree(preg) __regfree (preg)
66# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
67# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
68# define regerror(errcode, preg, errbuf, errbuf_size) \
69 __regerror(errcode, preg, errbuf, errbuf_size)
70# define re_set_registers(bu, re, nu, st, en) \
71 __re_set_registers (bu, re, nu, st, en)
72# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
73 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
74# define re_match(bufp, string, size, pos, regs) \
75 __re_match (bufp, string, size, pos, regs)
76# define re_search(bufp, string, size, startpos, range, regs) \
77 __re_search (bufp, string, size, startpos, range, regs)
78# define re_compile_pattern(pattern, length, bufp) \
79 __re_compile_pattern (pattern, length, bufp)
80# define re_set_syntax(syntax) __re_set_syntax (syntax)
81# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
82 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
83# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
84
85#define btowc __btowc
86#endif
87
88/* This is for other GNU distributions with internationalized messages. */
89#if HAVE_LIBINTL_H || defined _LIBC
90# include <libintl.h>
91#else
92# define gettext(msgid) (msgid)
93#endif
94
95#ifndef gettext_noop
96/* This define is so xgettext can find the internationalizable
97 strings. */
98# define gettext_noop(String) String
99#endif
100
101/* The `emacs' switch turns on certain matching commands
102 that make sense only in Emacs. */
103#ifdef emacs
104
105# include "lisp.h"
106# include "buffer.h"
107# include "syntax.h"
108
109#else /* not emacs */
110
111/* If we are not linking with Emacs proper,
112 we can't use the relocating allocator
113 even if config.h says that we can. */
114# undef REL_ALLOC
115
116# if defined STDC_HEADERS || defined _LIBC
117# include <stdlib.h>
118# else
119char *malloc ();
120char *realloc ();
121# endif
122
123/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
124 If nothing else has been done, use the method below. */
125# ifdef INHIBIT_STRING_HEADER
126# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
127# if !defined bzero && !defined bcopy
128# undef INHIBIT_STRING_HEADER
129# endif
130# endif
131# endif
132
133/* This is the normal way of making sure we have a bcopy and a bzero.
134 This is used in most programs--a few other programs avoid this
135 by defining INHIBIT_STRING_HEADER. */
136# ifndef INHIBIT_STRING_HEADER
137# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
138# include <string.h>
139# ifndef bzero
140# ifndef _LIBC
141# define bzero(s, n) (memset (s, '\0', n), (s))
142# else
143# define bzero(s, n) __bzero (s, n)
144# endif
145# endif
146# else
147# include <strings.h>
148# ifndef memcmp
149# define memcmp(s1, s2, n) bcmp (s1, s2, n)
150# endif
151# ifndef memcpy
152# define memcpy(d, s, n) (bcopy (s, d, n), (d))
153# endif
154# endif
155# endif
156
157/* Define the syntax stuff for \<, \>, etc. */
158
159/* This must be nonzero for the wordchar and notwordchar pattern
160 commands in re_match_2. */
161# ifndef Sword
162# define Sword 1
163# endif
164
165# ifdef SWITCH_ENUM_BUG
166# define SWITCH_ENUM_CAST(x) ((int)(x))
167# else
168# define SWITCH_ENUM_CAST(x) (x)
169# endif
170
171/* How many characters in the character set. */
172# define CHAR_SET_SIZE 256
173
174# ifdef SYNTAX_TABLE
175
176extern char *re_syntax_table;
177
178# else /* not SYNTAX_TABLE */
179
180static char re_syntax_table[CHAR_SET_SIZE];
181
182static void
183init_syntax_once ()
184{
185 register int c;
186 static int done;
187
188 if (done)
189 return;
190
pauld1724b62003-10-22 02:41:52 +0000191 memset (re_syntax_table, 0, sizeof re_syntax_table);
paul718e3742002-12-13 20:15:29 +0000192
193 for (c = 'a'; c <= 'z'; c++)
194 re_syntax_table[c] = Sword;
195
196 for (c = 'A'; c <= 'Z'; c++)
197 re_syntax_table[c] = Sword;
198
199 for (c = '0'; c <= '9'; c++)
200 re_syntax_table[c] = Sword;
201
202 re_syntax_table['_'] = Sword;
203
204 done = 1;
205}
206
207# endif /* not SYNTAX_TABLE */
208
209# define SYNTAX(c) re_syntax_table[c]
210
211#endif /* not emacs */
David Lamparter6b0655a2014-06-04 06:53:35 +0200212
paul718e3742002-12-13 20:15:29 +0000213/* Get the interface, including the syntax bits. */
214#include <regex-gnu.h>
215
216/* isalpha etc. are used for the character classes. */
217#include <ctype.h>
218
219/* Jim Meyering writes:
220
221 "... Some ctype macros are valid only for character codes that
222 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
223 using /bin/cc or gcc but without giving an ansi option). So, all
224 ctype uses should be through macros like ISPRINT... If
225 STDC_HEADERS is defined, then autoconf has verified that the ctype
226 macros don't need to be guarded with references to isascii. ...
227 Defining isascii to 1 should let any compiler worth its salt
228 eliminate the && through constant folding."
229 Solaris defines some of these symbols so we must undefine them first. */
230
231#undef ISASCII
232#if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
233# define ISASCII(c) 1
234#else
235# define ISASCII(c) isascii(c)
236#endif
237
238#ifdef isblank
239# define ISBLANK(c) (ISASCII (c) && isblank (c))
240#else
241# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
242#endif
243#ifdef isgraph
244# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
245#else
246# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
247#endif
248
249#undef ISPRINT
250#define ISPRINT(c) (ISASCII (c) && isprint (c))
251#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
252#define ISALNUM(c) (ISASCII (c) && isalnum (c))
253#define ISALPHA(c) (ISASCII (c) && isalpha (c))
254#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
255#define ISLOWER(c) (ISASCII (c) && islower (c))
256#define ISPUNCT(c) (ISASCII (c) && ispunct (c))
257#define ISSPACE(c) (ISASCII (c) && isspace (c))
258#define ISUPPER(c) (ISASCII (c) && isupper (c))
259#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
260
261#ifdef _tolower
262# define TOLOWER(c) _tolower(c)
263#else
264# define TOLOWER(c) tolower(c)
265#endif
266
267#ifndef NULL
268# define NULL (void *)0
269#endif
270
271/* We remove any previous definition of `SIGN_EXTEND_CHAR',
272 since ours (we hope) works properly with all combinations of
273 machines, compilers, `char' and `unsigned char' argument types.
274 (Per Bothner suggested the basic approach.) */
275#undef SIGN_EXTEND_CHAR
276#if __STDC__
277# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
278#else /* not __STDC__ */
279/* As in Harbison and Steele. */
280# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
281#endif
David Lamparter6b0655a2014-06-04 06:53:35 +0200282
paul718e3742002-12-13 20:15:29 +0000283/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
284 use `alloca' instead of `malloc'. This is because using malloc in
285 re_search* or re_match* could cause memory leaks when C-g is used in
286 Emacs; also, malloc is slower and causes storage fragmentation. On
287 the other hand, malloc is more portable, and easier to debug.
288
289 Because we sometimes use alloca, some routines have to be macros,
290 not functions -- `alloca'-allocated space disappears at the end of the
291 function it is called in. */
292
293#ifdef REGEX_MALLOC
294
295# define REGEX_ALLOCATE malloc
296# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
297# define REGEX_FREE free
298
299#else /* not REGEX_MALLOC */
300
301/* Emacs already defines alloca, sometimes. */
302# ifndef alloca
303
304/* Make alloca work the best possible way. */
305# ifdef __GNUC__
306# define alloca __builtin_alloca
307# else /* not __GNUC__ */
308# if HAVE_ALLOCA_H
309# include <alloca.h>
310# endif /* HAVE_ALLOCA_H */
311# endif /* not __GNUC__ */
312
313# endif /* not alloca */
314
315# define REGEX_ALLOCATE alloca
316
317/* Assumes a `char *destination' variable. */
318# define REGEX_REALLOCATE(source, osize, nsize) \
319 (destination = (char *) alloca (nsize), \
320 memcpy (destination, source, osize))
321
322/* No need to do anything to free, after alloca. */
323# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
324
325#endif /* not REGEX_MALLOC */
326
327/* Define how to allocate the failure stack. */
328
329#if defined REL_ALLOC && defined REGEX_MALLOC
330
331# define REGEX_ALLOCATE_STACK(size) \
332 r_alloc (&failure_stack_ptr, (size))
333# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
334 r_re_alloc (&failure_stack_ptr, (nsize))
335# define REGEX_FREE_STACK(ptr) \
336 r_alloc_free (&failure_stack_ptr)
337
338#else /* not using relocating allocator */
339
340# ifdef REGEX_MALLOC
341
342# define REGEX_ALLOCATE_STACK malloc
343# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
344# define REGEX_FREE_STACK free
345
346# else /* not REGEX_MALLOC */
347
348# define REGEX_ALLOCATE_STACK alloca
349
350# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
351 REGEX_REALLOCATE (source, osize, nsize)
352/* No need to explicitly free anything. */
353# define REGEX_FREE_STACK(arg)
354
355# endif /* not REGEX_MALLOC */
356#endif /* not using relocating allocator */
357
358
359/* True if `size1' is non-NULL and PTR is pointing anywhere inside
360 `string1' or just past its end. This works if PTR is NULL, which is
361 a good thing. */
362#define FIRST_STRING_P(ptr) \
363 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
364
365/* (Re)Allocate N items of type T using malloc, or fail. */
366#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
367#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
368#define RETALLOC_IF(addr, n, t) \
369 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
370#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
371
372#define BYTEWIDTH 8 /* In bits. */
373
374#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
375
376#undef MAX
377#undef MIN
378#define MAX(a, b) ((a) > (b) ? (a) : (b))
379#define MIN(a, b) ((a) < (b) ? (a) : (b))
380
381typedef char boolean;
382#define false 0
383#define true 1
384
385static int re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
386 const char *string1, int size1,
387 const char *string2, int size2,
388 int pos,
389 struct re_registers *regs,
390 int stop));
David Lamparter6b0655a2014-06-04 06:53:35 +0200391
paul718e3742002-12-13 20:15:29 +0000392/* These are the command codes that appear in compiled regular
393 expressions. Some opcodes are followed by argument bytes. A
394 command code can specify any interpretation whatsoever for its
395 arguments. Zero bytes may appear in the compiled regular expression. */
396
397typedef enum
398{
399 no_op = 0,
400
401 /* Succeed right away--no more backtracking. */
402 succeed,
403
404 /* Followed by one byte giving n, then by n literal bytes. */
405 exactn,
406
407 /* Matches any (more or less) character. */
408 anychar,
409
410 /* Matches any one char belonging to specified set. First
411 following byte is number of bitmap bytes. Then come bytes
412 for a bitmap saying which chars are in. Bits in each byte
413 are ordered low-bit-first. A character is in the set if its
414 bit is 1. A character too large to have a bit in the map is
415 automatically not in the set. */
416 charset,
417
418 /* Same parameters as charset, but match any character that is
419 not one of those specified. */
420 charset_not,
421
422 /* Start remembering the text that is matched, for storing in a
423 register. Followed by one byte with the register number, in
424 the range 0 to one less than the pattern buffer's re_nsub
425 field. Then followed by one byte with the number of groups
426 inner to this one. (This last has to be part of the
427 start_memory only because we need it in the on_failure_jump
428 of re_match_2.) */
429 start_memory,
430
431 /* Stop remembering the text that is matched and store it in a
432 memory register. Followed by one byte with the register
433 number, in the range 0 to one less than `re_nsub' in the
434 pattern buffer, and one byte with the number of inner groups,
435 just like `start_memory'. (We need the number of inner
436 groups here because we don't have any easy way of finding the
437 corresponding start_memory when we're at a stop_memory.) */
438 stop_memory,
439
440 /* Match a duplicate of something remembered. Followed by one
441 byte containing the register number. */
442 duplicate,
443
444 /* Fail unless at beginning of line. */
445 begline,
446
447 /* Fail unless at end of line. */
448 endline,
449
450 /* Succeeds if at beginning of buffer (if emacs) or at beginning
451 of string to be matched (if not). */
452 begbuf,
453
454 /* Analogously, for end of buffer/string. */
455 endbuf,
456
457 /* Followed by two byte relative address to which to jump. */
458 jump,
459
460 /* Same as jump, but marks the end of an alternative. */
461 jump_past_alt,
462
463 /* Followed by two-byte relative address of place to resume at
464 in case of failure. */
465 on_failure_jump,
466
467 /* Like on_failure_jump, but pushes a placeholder instead of the
468 current string position when executed. */
469 on_failure_keep_string_jump,
470
471 /* Throw away latest failure point and then jump to following
472 two-byte relative address. */
473 pop_failure_jump,
474
475 /* Change to pop_failure_jump if know won't have to backtrack to
476 match; otherwise change to jump. This is used to jump
477 back to the beginning of a repeat. If what follows this jump
478 clearly won't match what the repeat does, such that we can be
479 sure that there is no use backtracking out of repetitions
480 already matched, then we change it to a pop_failure_jump.
481 Followed by two-byte address. */
482 maybe_pop_jump,
483
484 /* Jump to following two-byte address, and push a dummy failure
485 point. This failure point will be thrown away if an attempt
486 is made to use it for a failure. A `+' construct makes this
487 before the first repeat. Also used as an intermediary kind
488 of jump when compiling an alternative. */
489 dummy_failure_jump,
490
491 /* Push a dummy failure point and continue. Used at the end of
492 alternatives. */
493 push_dummy_failure,
494
495 /* Followed by two-byte relative address and two-byte number n.
496 After matching N times, jump to the address upon failure. */
497 succeed_n,
498
499 /* Followed by two-byte relative address, and two-byte number n.
500 Jump to the address N times, then fail. */
501 jump_n,
502
503 /* Set the following two-byte relative address to the
504 subsequent two-byte number. The address *includes* the two
505 bytes of number. */
506 set_number_at,
507
508 wordchar, /* Matches any word-constituent character. */
509 notwordchar, /* Matches any char that is not a word-constituent. */
510
511 wordbeg, /* Succeeds if at word beginning. */
512 wordend, /* Succeeds if at word end. */
513
514 wordbound, /* Succeeds if at a word boundary. */
515 notwordbound /* Succeeds if not at a word boundary. */
516
517#ifdef emacs
518 ,before_dot, /* Succeeds if before point. */
519 at_dot, /* Succeeds if at point. */
520 after_dot, /* Succeeds if after point. */
521
522 /* Matches any character whose syntax is specified. Followed by
523 a byte which contains a syntax code, e.g., Sword. */
524 syntaxspec,
525
526 /* Matches any character whose syntax is not that specified. */
527 notsyntaxspec
528#endif /* emacs */
529} re_opcode_t;
David Lamparter6b0655a2014-06-04 06:53:35 +0200530
paul718e3742002-12-13 20:15:29 +0000531/* Common operations on the compiled pattern. */
532
533/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
534
535#define STORE_NUMBER(destination, number) \
536 do { \
537 (destination)[0] = (number) & 0377; \
538 (destination)[1] = (number) >> 8; \
539 } while (0)
540
541/* Same as STORE_NUMBER, except increment DESTINATION to
542 the byte after where the number is stored. Therefore, DESTINATION
543 must be an lvalue. */
544
545#define STORE_NUMBER_AND_INCR(destination, number) \
546 do { \
547 STORE_NUMBER (destination, number); \
548 (destination) += 2; \
549 } while (0)
550
551/* Put into DESTINATION a number stored in two contiguous bytes starting
552 at SOURCE. */
553
554#define EXTRACT_NUMBER(destination, source) \
555 do { \
556 (destination) = *(source) & 0377; \
557 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
558 } while (0)
559
560#ifdef DEBUG
561static void extract_number _RE_ARGS ((int *dest, unsigned char *source));
562static void
563extract_number (dest, source)
564 int *dest;
565 unsigned char *source;
566{
567 int temp = SIGN_EXTEND_CHAR (*(source + 1));
568 *dest = *source & 0377;
569 *dest += temp << 8;
570}
571
572# ifndef EXTRACT_MACROS /* To debug the macros. */
573# undef EXTRACT_NUMBER
574# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
575# endif /* not EXTRACT_MACROS */
576
577#endif /* DEBUG */
578
579/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
580 SOURCE must be an lvalue. */
581
582#define EXTRACT_NUMBER_AND_INCR(destination, source) \
583 do { \
584 EXTRACT_NUMBER (destination, source); \
585 (source) += 2; \
586 } while (0)
587
588#ifdef DEBUG
589static void extract_number_and_incr _RE_ARGS ((int *destination,
590 unsigned char **source));
591static void
592extract_number_and_incr (destination, source)
593 int *destination;
594 unsigned char **source;
595{
596 extract_number (destination, *source);
597 *source += 2;
598}
599
600# ifndef EXTRACT_MACROS
601# undef EXTRACT_NUMBER_AND_INCR
602# define EXTRACT_NUMBER_AND_INCR(dest, src) \
603 extract_number_and_incr (&dest, &src)
604# endif /* not EXTRACT_MACROS */
605
606#endif /* DEBUG */
David Lamparter6b0655a2014-06-04 06:53:35 +0200607
paul718e3742002-12-13 20:15:29 +0000608/* If DEBUG is defined, Regex prints many voluminous messages about what
609 it is doing (if the variable `debug' is nonzero). If linked with the
610 main program in `iregex.c', you can enter patterns and strings
611 interactively. And if linked with the main program in `main.c' and
612 the other test files, you can run the already-written tests. */
613
614#ifdef DEBUG
615
616/* We use standard I/O for debugging. */
617# include <stdio.h>
618
619/* It is useful to test things that ``must'' be true when debugging. */
ajscee3df12004-11-24 17:14:49 +0000620# include "zassert.h"
paul718e3742002-12-13 20:15:29 +0000621
622static int debug;
623
624# define DEBUG_STATEMENT(e) e
625# define DEBUG_PRINT1(x) if (debug) printf (x)
626# define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
627# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
628# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
629# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
630 if (debug) print_partial_compiled_pattern (s, e)
631# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
632 if (debug) print_double_string (w, s1, sz1, s2, sz2)
633
634
635/* Print the fastmap in human-readable form. */
636
637void
638print_fastmap (fastmap)
639 char *fastmap;
640{
641 unsigned was_a_range = 0;
642 unsigned i = 0;
643
644 while (i < (1 << BYTEWIDTH))
645 {
646 if (fastmap[i++])
647 {
648 was_a_range = 0;
649 putchar (i - 1);
650 while (i < (1 << BYTEWIDTH) && fastmap[i])
651 {
652 was_a_range = 1;
653 i++;
654 }
655 if (was_a_range)
656 {
657 printf ("-");
658 putchar (i - 1);
659 }
660 }
661 }
662 putchar ('\n');
663}
664
665
666/* Print a compiled pattern string in human-readable form, starting at
667 the START pointer into it and ending just before the pointer END. */
668
669void
670print_partial_compiled_pattern (start, end)
671 unsigned char *start;
672 unsigned char *end;
673{
674 int mcnt, mcnt2;
675 unsigned char *p1;
676 unsigned char *p = start;
677 unsigned char *pend = end;
678
679 if (start == NULL)
680 {
681 printf ("(null)\n");
682 return;
683 }
684
685 /* Loop over pattern commands. */
686 while (p < pend)
687 {
688 printf ("%d:\t", p - start);
689
690 switch ((re_opcode_t) *p++)
691 {
692 case no_op:
693 printf ("/no_op");
694 break;
695
696 case exactn:
697 mcnt = *p++;
698 printf ("/exactn/%d", mcnt);
699 do
700 {
701 putchar ('/');
702 putchar (*p++);
703 }
704 while (--mcnt);
705 break;
706
707 case start_memory:
708 mcnt = *p++;
709 printf ("/start_memory/%d/%d", mcnt, *p++);
710 break;
711
712 case stop_memory:
713 mcnt = *p++;
714 printf ("/stop_memory/%d/%d", mcnt, *p++);
715 break;
716
717 case duplicate:
718 printf ("/duplicate/%d", *p++);
719 break;
720
721 case anychar:
722 printf ("/anychar");
723 break;
724
725 case charset:
726 case charset_not:
727 {
728 register int c, last = -100;
729 register int in_range = 0;
730
731 printf ("/charset [%s",
732 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
733
734 assert (p + *p < pend);
735
736 for (c = 0; c < 256; c++)
737 if (c / 8 < *p
738 && (p[1 + (c/8)] & (1 << (c % 8))))
739 {
740 /* Are we starting a range? */
741 if (last + 1 == c && ! in_range)
742 {
743 putchar ('-');
744 in_range = 1;
745 }
746 /* Have we broken a range? */
747 else if (last + 1 != c && in_range)
748 {
749 putchar (last);
750 in_range = 0;
751 }
752
753 if (! in_range)
754 putchar (c);
755
756 last = c;
757 }
758
759 if (in_range)
760 putchar (last);
761
762 putchar (']');
763
764 p += 1 + *p;
765 }
766 break;
767
768 case begline:
769 printf ("/begline");
770 break;
771
772 case endline:
773 printf ("/endline");
774 break;
775
776 case on_failure_jump:
777 extract_number_and_incr (&mcnt, &p);
778 printf ("/on_failure_jump to %d", p + mcnt - start);
779 break;
780
781 case on_failure_keep_string_jump:
782 extract_number_and_incr (&mcnt, &p);
783 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
784 break;
785
786 case dummy_failure_jump:
787 extract_number_and_incr (&mcnt, &p);
788 printf ("/dummy_failure_jump to %d", p + mcnt - start);
789 break;
790
791 case push_dummy_failure:
792 printf ("/push_dummy_failure");
793 break;
794
795 case maybe_pop_jump:
796 extract_number_and_incr (&mcnt, &p);
797 printf ("/maybe_pop_jump to %d", p + mcnt - start);
798 break;
799
800 case pop_failure_jump:
801 extract_number_and_incr (&mcnt, &p);
802 printf ("/pop_failure_jump to %d", p + mcnt - start);
803 break;
804
805 case jump_past_alt:
806 extract_number_and_incr (&mcnt, &p);
807 printf ("/jump_past_alt to %d", p + mcnt - start);
808 break;
809
810 case jump:
811 extract_number_and_incr (&mcnt, &p);
812 printf ("/jump to %d", p + mcnt - start);
813 break;
814
815 case succeed_n:
816 extract_number_and_incr (&mcnt, &p);
817 p1 = p + mcnt;
818 extract_number_and_incr (&mcnt2, &p);
819 printf ("/succeed_n to %d, %d times", p1 - start, mcnt2);
820 break;
821
822 case jump_n:
823 extract_number_and_incr (&mcnt, &p);
824 p1 = p + mcnt;
825 extract_number_and_incr (&mcnt2, &p);
826 printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
827 break;
828
829 case set_number_at:
830 extract_number_and_incr (&mcnt, &p);
831 p1 = p + mcnt;
832 extract_number_and_incr (&mcnt2, &p);
833 printf ("/set_number_at location %d to %d", p1 - start, mcnt2);
834 break;
835
836 case wordbound:
837 printf ("/wordbound");
838 break;
839
840 case notwordbound:
841 printf ("/notwordbound");
842 break;
843
844 case wordbeg:
845 printf ("/wordbeg");
846 break;
847
848 case wordend:
849 printf ("/wordend");
850
851# ifdef emacs
852 case before_dot:
853 printf ("/before_dot");
854 break;
855
856 case at_dot:
857 printf ("/at_dot");
858 break;
859
860 case after_dot:
861 printf ("/after_dot");
862 break;
863
864 case syntaxspec:
865 printf ("/syntaxspec");
866 mcnt = *p++;
867 printf ("/%d", mcnt);
868 break;
869
870 case notsyntaxspec:
871 printf ("/notsyntaxspec");
872 mcnt = *p++;
873 printf ("/%d", mcnt);
874 break;
875# endif /* emacs */
876
877 case wordchar:
878 printf ("/wordchar");
879 break;
880
881 case notwordchar:
882 printf ("/notwordchar");
883 break;
884
885 case begbuf:
886 printf ("/begbuf");
887 break;
888
889 case endbuf:
890 printf ("/endbuf");
891 break;
892
893 default:
894 printf ("?%d", *(p-1));
895 }
896
897 putchar ('\n');
898 }
899
900 printf ("%d:\tend of pattern.\n", p - start);
901}
902
903
904void
905print_compiled_pattern (bufp)
906 struct re_pattern_buffer *bufp;
907{
908 unsigned char *buffer = bufp->buffer;
909
910 print_partial_compiled_pattern (buffer, buffer + bufp->used);
911 printf ("%ld bytes used/%ld bytes allocated.\n",
912 bufp->used, bufp->allocated);
913
914 if (bufp->fastmap_accurate && bufp->fastmap)
915 {
916 printf ("fastmap: ");
917 print_fastmap (bufp->fastmap);
918 }
919
920 printf ("re_nsub: %d\t", bufp->re_nsub);
921 printf ("regs_alloc: %d\t", bufp->regs_allocated);
922 printf ("can_be_null: %d\t", bufp->can_be_null);
923 printf ("newline_anchor: %d\n", bufp->newline_anchor);
924 printf ("no_sub: %d\t", bufp->no_sub);
925 printf ("not_bol: %d\t", bufp->not_bol);
926 printf ("not_eol: %d\t", bufp->not_eol);
927 printf ("syntax: %lx\n", bufp->syntax);
928 /* Perhaps we should print the translate table? */
929}
930
931
932void
933print_double_string (where, string1, size1, string2, size2)
934 const char *where;
935 const char *string1;
936 const char *string2;
937 int size1;
938 int size2;
939{
940 int this_char;
941
942 if (where == NULL)
943 printf ("(null)");
944 else
945 {
946 if (FIRST_STRING_P (where))
947 {
948 for (this_char = where - string1; this_char < size1; this_char++)
949 putchar (string1[this_char]);
950
951 where = string2;
952 }
953
954 for (this_char = where - string2; this_char < size2; this_char++)
955 putchar (string2[this_char]);
956 }
957}
958
959void
960printchar (c)
961 int c;
962{
963 putc (c, stderr);
964}
965
966#else /* not DEBUG */
967
968# undef assert
969# define assert(e)
970
971# define DEBUG_STATEMENT(e)
972# define DEBUG_PRINT1(x)
973# define DEBUG_PRINT2(x1, x2)
974# define DEBUG_PRINT3(x1, x2, x3)
975# define DEBUG_PRINT4(x1, x2, x3, x4)
976# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
977# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
978
979#endif /* not DEBUG */
David Lamparter6b0655a2014-06-04 06:53:35 +0200980
paul718e3742002-12-13 20:15:29 +0000981/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
982 also be assigned to arbitrarily: each pattern buffer stores its own
983 syntax, so it can be changed between regex compilations. */
984/* This has no initializer because initialized variables in Emacs
985 become read-only after dumping. */
986reg_syntax_t re_syntax_options;
987
988
989/* Specify the precise syntax of regexps for compilation. This provides
990 for compatibility for various utilities which historically have
991 different, incompatible syntaxes.
992
993 The argument SYNTAX is a bit mask comprised of the various bits
994 defined in regex.h. We return the old syntax. */
995
996reg_syntax_t
997re_set_syntax (syntax)
998 reg_syntax_t syntax;
999{
1000 reg_syntax_t ret = re_syntax_options;
1001
1002 re_syntax_options = syntax;
1003#ifdef DEBUG
1004 if (syntax & RE_DEBUG)
1005 debug = 1;
1006 else if (debug) /* was on but now is not */
1007 debug = 0;
1008#endif /* DEBUG */
1009 return ret;
1010}
1011#ifdef _LIBC
1012weak_alias (__re_set_syntax, re_set_syntax)
1013#endif
David Lamparter6b0655a2014-06-04 06:53:35 +02001014
paul718e3742002-12-13 20:15:29 +00001015/* This table gives an error message for each of the error codes listed
1016 in regex.h. Obviously the order here has to be same as there.
1017 POSIX doesn't require that we do anything for REG_NOERROR,
1018 but why not be nice? */
1019
1020static const char re_error_msgid[] =
1021 {
1022#define REG_NOERROR_IDX 0
1023 gettext_noop ("Success") /* REG_NOERROR */
1024 "\0"
1025#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
1026 gettext_noop ("No match") /* REG_NOMATCH */
1027 "\0"
1028#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
1029 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
1030 "\0"
1031#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
vincenteac314c2006-01-17 23:39:04 +00001032 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
paul718e3742002-12-13 20:15:29 +00001033 "\0"
1034#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
1035 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
1036 "\0"
1037#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
1038 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
1039 "\0"
1040#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
1041 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
1042 "\0"
1043#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
1044 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
1045 "\0"
1046#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
1047 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
1048 "\0"
1049#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
1050 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
1051 "\0"
1052#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
1053 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
1054 "\0"
1055#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
1056 gettext_noop ("Invalid range end") /* REG_ERANGE */
1057 "\0"
1058#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
1059 gettext_noop ("Memory exhausted") /* REG_ESPACE */
1060 "\0"
1061#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
1062 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
1063 "\0"
1064#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
1065 gettext_noop ("Premature end of regular expression") /* REG_EEND */
1066 "\0"
1067#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
1068 gettext_noop ("Regular expression too big") /* REG_ESIZE */
1069 "\0"
1070#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
1071 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1072 };
1073
1074static const size_t re_error_msgid_idx[] =
1075 {
1076 REG_NOERROR_IDX,
1077 REG_NOMATCH_IDX,
1078 REG_BADPAT_IDX,
1079 REG_ECOLLATE_IDX,
1080 REG_ECTYPE_IDX,
1081 REG_EESCAPE_IDX,
1082 REG_ESUBREG_IDX,
1083 REG_EBRACK_IDX,
1084 REG_EPAREN_IDX,
1085 REG_EBRACE_IDX,
1086 REG_BADBR_IDX,
1087 REG_ERANGE_IDX,
1088 REG_ESPACE_IDX,
1089 REG_BADRPT_IDX,
1090 REG_EEND_IDX,
1091 REG_ESIZE_IDX,
1092 REG_ERPAREN_IDX
1093 };
David Lamparter6b0655a2014-06-04 06:53:35 +02001094
paul718e3742002-12-13 20:15:29 +00001095/* Avoiding alloca during matching, to placate r_alloc. */
1096
1097/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1098 searching and matching functions should not call alloca. On some
1099 systems, alloca is implemented in terms of malloc, and if we're
1100 using the relocating allocator routines, then malloc could cause a
1101 relocation, which might (if the strings being searched are in the
1102 ralloc heap) shift the data out from underneath the regexp
1103 routines.
1104
1105 Here's another reason to avoid allocation: Emacs
1106 processes input from X in a signal handler; processing X input may
1107 call malloc; if input arrives while a matching routine is calling
1108 malloc, then we're scrod. But Emacs can't just block input while
1109 calling matching routines; then we don't notice interrupts when
1110 they come in. So, Emacs blocks input around all regexp calls
1111 except the matching calls, which it leaves unprotected, in the
1112 faith that they will not malloc. */
1113
1114/* Normally, this is fine. */
1115#define MATCH_MAY_ALLOCATE
1116
1117/* When using GNU C, we are not REALLY using the C alloca, no matter
1118 what config.h may say. So don't take precautions for it. */
1119#ifdef __GNUC__
1120# undef C_ALLOCA
1121#endif
1122
1123/* The match routines may not allocate if (1) they would do it with malloc
1124 and (2) it's not safe for them to use malloc.
1125 Note that if REL_ALLOC is defined, matching would not use malloc for the
1126 failure stack, but we would still use it for the register vectors;
1127 so REL_ALLOC should not affect this. */
1128#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1129# undef MATCH_MAY_ALLOCATE
1130#endif
1131
David Lamparter6b0655a2014-06-04 06:53:35 +02001132
paul718e3742002-12-13 20:15:29 +00001133/* Failure stack declarations and macros; both re_compile_fastmap and
1134 re_match_2 use a failure stack. These have to be macros because of
1135 REGEX_ALLOCATE_STACK. */
1136
1137
1138/* Number of failure points for which to initially allocate space
1139 when matching. If this number is exceeded, we allocate more
1140 space, so it is not a hard limit. */
1141#ifndef INIT_FAILURE_ALLOC
1142# define INIT_FAILURE_ALLOC 5
1143#endif
1144
1145/* Roughly the maximum number of failure points on the stack. Would be
1146 exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
1147 This is a variable only so users of regex can assign to it; we never
1148 change it ourselves. */
1149
1150#ifdef INT_IS_16BIT
1151
1152# if defined MATCH_MAY_ALLOCATE
1153/* 4400 was enough to cause a crash on Alpha OSF/1,
1154 whose default stack limit is 2mb. */
1155long int re_max_failures = 4000;
1156# else
1157long int re_max_failures = 2000;
1158# endif
1159
1160union fail_stack_elt
1161{
1162 unsigned char *pointer;
1163 long int integer;
1164};
1165
1166typedef union fail_stack_elt fail_stack_elt_t;
1167
1168typedef struct
1169{
1170 fail_stack_elt_t *stack;
1171 unsigned long int size;
1172 unsigned long int avail; /* Offset of next open position. */
1173} fail_stack_type;
1174
1175#else /* not INT_IS_16BIT */
1176
1177# if defined MATCH_MAY_ALLOCATE
1178/* 4400 was enough to cause a crash on Alpha OSF/1,
1179 whose default stack limit is 2mb. */
1180int re_max_failures = 20000;
1181# else
1182int re_max_failures = 2000;
1183# endif
1184
1185union fail_stack_elt
1186{
1187 unsigned char *pointer;
1188 int integer;
1189};
1190
1191typedef union fail_stack_elt fail_stack_elt_t;
1192
1193typedef struct
1194{
1195 fail_stack_elt_t *stack;
1196 unsigned size;
1197 unsigned avail; /* Offset of next open position. */
1198} fail_stack_type;
1199
1200#endif /* INT_IS_16BIT */
1201
1202#define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1203#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1204#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1205
1206
1207/* Define macros to initialize and free the failure stack.
1208 Do `return -2' if the alloc fails. */
1209
1210#ifdef MATCH_MAY_ALLOCATE
1211# define INIT_FAIL_STACK() \
1212 do { \
1213 fail_stack.stack = (fail_stack_elt_t *) \
1214 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1215 \
1216 if (fail_stack.stack == NULL) \
1217 return -2; \
1218 \
1219 fail_stack.size = INIT_FAILURE_ALLOC; \
1220 fail_stack.avail = 0; \
1221 } while (0)
1222
1223# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1224#else
1225# define INIT_FAIL_STACK() \
1226 do { \
1227 fail_stack.avail = 0; \
1228 } while (0)
1229
1230# define RESET_FAIL_STACK()
1231#endif
1232
1233
1234/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1235
1236 Return 1 if succeeds, and 0 if either ran out of memory
1237 allocating space for it or it was already too large.
1238
1239 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1240
1241#define DOUBLE_FAIL_STACK(fail_stack) \
1242 ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
1243 ? 0 \
1244 : ((fail_stack).stack = (fail_stack_elt_t *) \
1245 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1246 (fail_stack).size * sizeof (fail_stack_elt_t), \
1247 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1248 \
1249 (fail_stack).stack == NULL \
1250 ? 0 \
1251 : ((fail_stack).size <<= 1, \
1252 1)))
1253
1254
1255/* Push pointer POINTER on FAIL_STACK.
1256 Return 1 if was able to do so and 0 if ran out of memory allocating
1257 space to do so. */
1258#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1259 ((FAIL_STACK_FULL () \
1260 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1261 ? 0 \
1262 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1263 1))
1264
1265/* Push a pointer value onto the failure stack.
1266 Assumes the variable `fail_stack'. Probably should only
1267 be called from within `PUSH_FAILURE_POINT'. */
1268#define PUSH_FAILURE_POINTER(item) \
1269 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1270
1271/* This pushes an integer-valued item onto the failure stack.
1272 Assumes the variable `fail_stack'. Probably should only
1273 be called from within `PUSH_FAILURE_POINT'. */
1274#define PUSH_FAILURE_INT(item) \
1275 fail_stack.stack[fail_stack.avail++].integer = (item)
1276
1277/* Push a fail_stack_elt_t value onto the failure stack.
1278 Assumes the variable `fail_stack'. Probably should only
1279 be called from within `PUSH_FAILURE_POINT'. */
1280#define PUSH_FAILURE_ELT(item) \
1281 fail_stack.stack[fail_stack.avail++] = (item)
1282
1283/* These three POP... operations complement the three PUSH... operations.
1284 All assume that `fail_stack' is nonempty. */
1285#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1286#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1287#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1288
1289/* Used to omit pushing failure point id's when we're not debugging. */
1290#ifdef DEBUG
1291# define DEBUG_PUSH PUSH_FAILURE_INT
1292# define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1293#else
1294# define DEBUG_PUSH(item)
1295# define DEBUG_POP(item_addr)
1296#endif
1297
1298
1299/* Push the information about the state we will need
1300 if we ever fail back to it.
1301
1302 Requires variables fail_stack, regstart, regend, reg_info, and
1303 num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination'
1304 be declared.
1305
1306 Does `return FAILURE_CODE' if runs out of memory. */
1307
1308#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1309 do { \
1310 char *destination; \
1311 /* Must be int, so when we don't save any registers, the arithmetic \
1312 of 0 + -1 isn't done as unsigned. */ \
1313 /* Can't be int, since there is not a shred of a guarantee that int \
1314 is wide enough to hold a value of something to which pointer can \
1315 be assigned */ \
1316 active_reg_t this_reg; \
1317 \
1318 DEBUG_STATEMENT (failure_id++); \
1319 DEBUG_STATEMENT (nfailure_points_pushed++); \
1320 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1321 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1322 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1323 \
1324 DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \
1325 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1326 \
1327 /* Ensure we have enough space allocated for what we will push. */ \
1328 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1329 { \
1330 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1331 return failure_code; \
1332 \
1333 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1334 (fail_stack).size); \
1335 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1336 } \
1337 \
1338 /* Push the info, starting with the registers. */ \
1339 DEBUG_PRINT1 ("\n"); \
1340 \
1341 if (1) \
1342 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1343 this_reg++) \
1344 { \
1345 DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \
1346 DEBUG_STATEMENT (num_regs_pushed++); \
1347 \
1348 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1349 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1350 \
1351 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1352 PUSH_FAILURE_POINTER (regend[this_reg]); \
1353 \
1354 DEBUG_PRINT2 (" info: %p\n ", \
1355 reg_info[this_reg].word.pointer); \
1356 DEBUG_PRINT2 (" match_null=%d", \
1357 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1358 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1359 DEBUG_PRINT2 (" matched_something=%d", \
1360 MATCHED_SOMETHING (reg_info[this_reg])); \
1361 DEBUG_PRINT2 (" ever_matched=%d", \
1362 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1363 DEBUG_PRINT1 ("\n"); \
1364 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1365 } \
1366 \
1367 DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\
1368 PUSH_FAILURE_INT (lowest_active_reg); \
1369 \
1370 DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\
1371 PUSH_FAILURE_INT (highest_active_reg); \
1372 \
1373 DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \
1374 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1375 PUSH_FAILURE_POINTER (pattern_place); \
1376 \
1377 DEBUG_PRINT2 (" Pushing string %p: `", string_place); \
1378 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1379 size2); \
1380 DEBUG_PRINT1 ("'\n"); \
1381 PUSH_FAILURE_POINTER (string_place); \
1382 \
1383 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1384 DEBUG_PUSH (failure_id); \
1385 } while (0)
1386
1387/* This is the number of items that are pushed and popped on the stack
1388 for each register. */
1389#define NUM_REG_ITEMS 3
1390
1391/* Individual items aside from the registers. */
1392#ifdef DEBUG
1393# define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1394#else
1395# define NUM_NONREG_ITEMS 4
1396#endif
1397
1398/* We push at most this many items on the stack. */
1399/* We used to use (num_regs - 1), which is the number of registers
1400 this regexp will save; but that was changed to 5
1401 to avoid stack overflow for a regexp with lots of parens. */
1402#define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1403
1404/* We actually push this many items. */
1405#define NUM_FAILURE_ITEMS \
1406 (((0 \
1407 ? 0 : highest_active_reg - lowest_active_reg + 1) \
1408 * NUM_REG_ITEMS) \
1409 + NUM_NONREG_ITEMS)
1410
1411/* How many items can still be added to the stack without overflowing it. */
1412#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1413
1414
1415/* Pops what PUSH_FAIL_STACK pushes.
1416
1417 We restore into the parameters, all of which should be lvalues:
1418 STR -- the saved data position.
1419 PAT -- the saved pattern position.
1420 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1421 REGSTART, REGEND -- arrays of string positions.
1422 REG_INFO -- array of information about each subexpression.
1423
1424 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1425 `pend', `string1', `size1', `string2', and `size2'. */
1426
1427#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1428{ \
1429 DEBUG_STATEMENT (unsigned failure_id;) \
1430 active_reg_t this_reg; \
1431 const unsigned char *string_temp; \
1432 \
1433 assert (!FAIL_STACK_EMPTY ()); \
1434 \
1435 /* Remove failure points and point to how many regs pushed. */ \
1436 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1437 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1438 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1439 \
1440 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1441 \
1442 DEBUG_POP (&failure_id); \
1443 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
1444 \
1445 /* If the saved string location is NULL, it came from an \
1446 on_failure_keep_string_jump opcode, and we want to throw away the \
1447 saved NULL, thus retaining our current position in the string. */ \
1448 string_temp = POP_FAILURE_POINTER (); \
1449 if (string_temp != NULL) \
1450 str = (const char *) string_temp; \
1451 \
1452 DEBUG_PRINT2 (" Popping string %p: `", str); \
1453 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1454 DEBUG_PRINT1 ("'\n"); \
1455 \
1456 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1457 DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
1458 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1459 \
1460 /* Restore register info. */ \
1461 high_reg = (active_reg_t) POP_FAILURE_INT (); \
1462 DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \
1463 \
1464 low_reg = (active_reg_t) POP_FAILURE_INT (); \
1465 DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \
1466 \
1467 if (1) \
1468 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1469 { \
1470 DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \
1471 \
1472 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1473 DEBUG_PRINT2 (" info: %p\n", \
1474 reg_info[this_reg].word.pointer); \
1475 \
1476 regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1477 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1478 \
1479 regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1480 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1481 } \
1482 else \
1483 { \
1484 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
1485 { \
1486 reg_info[this_reg].word.integer = 0; \
1487 regend[this_reg] = 0; \
1488 regstart[this_reg] = 0; \
1489 } \
1490 highest_active_reg = high_reg; \
1491 } \
1492 \
1493 set_regs_matched_done = 0; \
1494 DEBUG_STATEMENT (nfailure_points_popped++); \
1495} /* POP_FAILURE_POINT */
1496
1497
David Lamparter6b0655a2014-06-04 06:53:35 +02001498
paul718e3742002-12-13 20:15:29 +00001499/* Structure for per-register (a.k.a. per-group) information.
1500 Other register information, such as the
1501 starting and ending positions (which are addresses), and the list of
1502 inner groups (which is a bits list) are maintained in separate
1503 variables.
1504
1505 We are making a (strictly speaking) nonportable assumption here: that
1506 the compiler will pack our bit fields into something that fits into
1507 the type of `word', i.e., is something that fits into one item on the
1508 failure stack. */
1509
1510
1511/* Declarations and macros for re_match_2. */
1512
1513typedef union
1514{
1515 fail_stack_elt_t word;
1516 struct
1517 {
1518 /* This field is one if this group can match the empty string,
1519 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1520#define MATCH_NULL_UNSET_VALUE 3
1521 unsigned match_null_string_p : 2;
1522 unsigned is_active : 1;
1523 unsigned matched_something : 1;
1524 unsigned ever_matched_something : 1;
1525 } bits;
1526} register_info_type;
1527
1528#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1529#define IS_ACTIVE(R) ((R).bits.is_active)
1530#define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1531#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1532
1533
1534/* Call this when have matched a real character; it sets `matched' flags
1535 for the subexpressions which we are currently inside. Also records
1536 that those subexprs have matched. */
1537#define SET_REGS_MATCHED() \
1538 do \
1539 { \
1540 if (!set_regs_matched_done) \
1541 { \
1542 active_reg_t r; \
1543 set_regs_matched_done = 1; \
1544 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1545 { \
1546 MATCHED_SOMETHING (reg_info[r]) \
1547 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1548 = 1; \
1549 } \
1550 } \
1551 } \
1552 while (0)
1553
1554/* Registers are set to a sentinel when they haven't yet matched. */
1555static char reg_unset_dummy;
1556#define REG_UNSET_VALUE (&reg_unset_dummy)
1557#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
David Lamparter6b0655a2014-06-04 06:53:35 +02001558
paul718e3742002-12-13 20:15:29 +00001559/* Subroutine declarations and macros for regex_compile. */
1560
1561static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size,
1562 reg_syntax_t syntax,
1563 struct re_pattern_buffer *bufp));
1564static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1565static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1566 int arg1, int arg2));
1567static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1568 int arg, unsigned char *end));
1569static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1570 int arg1, int arg2, unsigned char *end));
1571static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p,
1572 reg_syntax_t syntax));
1573static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend,
1574 reg_syntax_t syntax));
1575static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
1576 const char *pend,
1577 char *translate,
1578 reg_syntax_t syntax,
1579 unsigned char *b));
1580
1581/* Fetch the next character in the uncompiled pattern---translating it
1582 if necessary. Also cast from a signed character in the constant
1583 string passed to us by the user to an unsigned char that we can use
1584 as an array index (in, e.g., `translate'). */
1585#ifndef PATFETCH
1586# define PATFETCH(c) \
1587 do {if (p == pend) return REG_EEND; \
1588 c = (unsigned char) *p++; \
1589 if (translate) c = (unsigned char) translate[c]; \
1590 } while (0)
1591#endif
1592
1593/* Fetch the next character in the uncompiled pattern, with no
1594 translation. */
1595#define PATFETCH_RAW(c) \
1596 do {if (p == pend) return REG_EEND; \
1597 c = (unsigned char) *p++; \
1598 } while (0)
1599
1600/* Go backwards one character in the pattern. */
1601#define PATUNFETCH p--
1602
1603
1604/* If `translate' is non-null, return translate[D], else just D. We
1605 cast the subscript to translate because some data is declared as
1606 `char *', to avoid warnings when a string constant is passed. But
1607 when we use a character as a subscript we must make it unsigned. */
1608#ifndef TRANSLATE
1609# define TRANSLATE(d) \
1610 (translate ? (char) translate[(unsigned char) (d)] : (d))
1611#endif
1612
1613
1614/* Macros for outputting the compiled pattern into `buffer'. */
1615
1616/* If the buffer isn't allocated when it comes in, use this. */
1617#define INIT_BUF_SIZE 32
1618
1619/* Make sure we have at least N more bytes of space in buffer. */
1620#define GET_BUFFER_SPACE(n) \
1621 while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
1622 EXTEND_BUFFER ()
1623
1624/* Make sure we have one more byte of buffer space and then add C to it. */
1625#define BUF_PUSH(c) \
1626 do { \
1627 GET_BUFFER_SPACE (1); \
1628 *b++ = (unsigned char) (c); \
1629 } while (0)
1630
1631
1632/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1633#define BUF_PUSH_2(c1, c2) \
1634 do { \
1635 GET_BUFFER_SPACE (2); \
1636 *b++ = (unsigned char) (c1); \
1637 *b++ = (unsigned char) (c2); \
1638 } while (0)
1639
1640
1641/* As with BUF_PUSH_2, except for three bytes. */
1642#define BUF_PUSH_3(c1, c2, c3) \
1643 do { \
1644 GET_BUFFER_SPACE (3); \
1645 *b++ = (unsigned char) (c1); \
1646 *b++ = (unsigned char) (c2); \
1647 *b++ = (unsigned char) (c3); \
1648 } while (0)
1649
1650
1651/* Store a jump with opcode OP at LOC to location TO. We store a
1652 relative address offset by the three bytes the jump itself occupies. */
1653#define STORE_JUMP(op, loc, to) \
1654 store_op1 (op, loc, (int) ((to) - (loc) - 3))
1655
1656/* Likewise, for a two-argument jump. */
1657#define STORE_JUMP2(op, loc, to, arg) \
1658 store_op2 (op, loc, (int) ((to) - (loc) - 3), arg)
1659
1660/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1661#define INSERT_JUMP(op, loc, to) \
1662 insert_op1 (op, loc, (int) ((to) - (loc) - 3), b)
1663
1664/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1665#define INSERT_JUMP2(op, loc, to, arg) \
1666 insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b)
1667
1668
1669/* This is not an arbitrary limit: the arguments which represent offsets
1670 into the pattern are two bytes long. So if 2^16 bytes turns out to
1671 be too small, many things would have to change. */
1672/* Any other compiler which, like MSC, has allocation limit below 2^16
1673 bytes will have to use approach similar to what was done below for
1674 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1675 reallocating to 0 bytes. Such thing is not going to work too well.
1676 You have been warned!! */
vincenteac314c2006-01-17 23:39:04 +00001677#if defined _MSC_VER && !defined _WIN32
paul718e3742002-12-13 20:15:29 +00001678/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
1679 The REALLOC define eliminates a flurry of conversion warnings,
1680 but is not required. */
1681# define MAX_BUF_SIZE 65500L
1682# define REALLOC(p,s) realloc ((p), (size_t) (s))
1683#else
1684# define MAX_BUF_SIZE (1L << 16)
1685# define REALLOC(p,s) realloc ((p), (s))
1686#endif
1687
1688/* Extend the buffer by twice its current size via realloc and
1689 reset the pointers that pointed into the old block to point to the
1690 correct places in the new one. If extending the buffer results in it
1691 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1692#define EXTEND_BUFFER() \
1693 do { \
1694 unsigned char *old_buffer = bufp->buffer; \
1695 if (bufp->allocated == MAX_BUF_SIZE) \
1696 return REG_ESIZE; \
1697 bufp->allocated <<= 1; \
1698 if (bufp->allocated > MAX_BUF_SIZE) \
1699 bufp->allocated = MAX_BUF_SIZE; \
1700 bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\
1701 if (bufp->buffer == NULL) \
1702 return REG_ESPACE; \
1703 /* If the buffer moved, move all the pointers into it. */ \
1704 if (old_buffer != bufp->buffer) \
1705 { \
1706 b = (b - old_buffer) + bufp->buffer; \
1707 begalt = (begalt - old_buffer) + bufp->buffer; \
1708 if (fixup_alt_jump) \
1709 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1710 if (laststart) \
1711 laststart = (laststart - old_buffer) + bufp->buffer; \
1712 if (pending_exact) \
1713 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1714 } \
1715 } while (0)
1716
1717
1718/* Since we have one byte reserved for the register number argument to
1719 {start,stop}_memory, the maximum number of groups we can report
1720 things about is what fits in that byte. */
1721#define MAX_REGNUM 255
1722
1723/* But patterns can have more than `MAX_REGNUM' registers. We just
1724 ignore the excess. */
1725typedef unsigned regnum_t;
1726
1727
1728/* Macros for the compile stack. */
1729
1730/* Since offsets can go either forwards or backwards, this type needs to
1731 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1732/* int may be not enough when sizeof(int) == 2. */
1733typedef long pattern_offset_t;
1734
1735typedef struct
1736{
1737 pattern_offset_t begalt_offset;
1738 pattern_offset_t fixup_alt_jump;
1739 pattern_offset_t inner_group_offset;
1740 pattern_offset_t laststart_offset;
1741 regnum_t regnum;
1742} compile_stack_elt_t;
1743
1744
1745typedef struct
1746{
1747 compile_stack_elt_t *stack;
1748 unsigned size;
1749 unsigned avail; /* Offset of next open position. */
1750} compile_stack_type;
1751
1752
1753#define INIT_COMPILE_STACK_SIZE 32
1754
1755#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1756#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1757
1758/* The next available element. */
1759#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1760
1761
1762/* Set the bit for character C in a list. */
1763#define SET_LIST_BIT(c) \
1764 (b[((unsigned char) (c)) / BYTEWIDTH] \
1765 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1766
1767
1768/* Get the next unsigned number in the uncompiled pattern. */
1769#define GET_UNSIGNED_NUMBER(num) \
1770 { if (p != pend) \
1771 { \
1772 PATFETCH (c); \
1773 while (ISDIGIT (c)) \
1774 { \
1775 if (num < 0) \
1776 num = 0; \
1777 num = num * 10 + c - '0'; \
1778 if (p == pend) \
1779 break; \
1780 PATFETCH (c); \
1781 } \
1782 } \
1783 }
1784
1785#if defined _LIBC || WIDE_CHAR_SUPPORT
1786/* The GNU C library provides support for user-defined character classes
1787 and the functions from ISO C amendement 1. */
1788# ifdef CHARCLASS_NAME_MAX
1789# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
1790# else
1791/* This shouldn't happen but some implementation might still have this
1792 problem. Use a reasonable default value. */
1793# define CHAR_CLASS_MAX_LENGTH 256
1794# endif
1795
1796# ifdef _LIBC
1797# define IS_CHAR_CLASS(string) __wctype (string)
1798# else
1799# define IS_CHAR_CLASS(string) wctype (string)
1800# endif
1801#else
1802# define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1803
1804# define IS_CHAR_CLASS(string) \
1805 (STREQ (string, "alpha") || STREQ (string, "upper") \
1806 || STREQ (string, "lower") || STREQ (string, "digit") \
1807 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1808 || STREQ (string, "space") || STREQ (string, "print") \
1809 || STREQ (string, "punct") || STREQ (string, "graph") \
1810 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1811#endif
David Lamparter6b0655a2014-06-04 06:53:35 +02001812
paul718e3742002-12-13 20:15:29 +00001813#ifndef MATCH_MAY_ALLOCATE
1814
1815/* If we cannot allocate large objects within re_match_2_internal,
1816 we make the fail stack and register vectors global.
1817 The fail stack, we grow to the maximum size when a regexp
1818 is compiled.
1819 The register vectors, we adjust in size each time we
1820 compile a regexp, according to the number of registers it needs. */
1821
1822static fail_stack_type fail_stack;
1823
1824/* Size with which the following vectors are currently allocated.
1825 That is so we can make them bigger as needed,
1826 but never make them smaller. */
1827static int regs_allocated_size;
1828
1829static const char ** regstart, ** regend;
1830static const char ** old_regstart, ** old_regend;
1831static const char **best_regstart, **best_regend;
1832static register_info_type *reg_info;
1833static const char **reg_dummy;
1834static register_info_type *reg_info_dummy;
1835
1836/* Make the register vectors big enough for NUM_REGS registers,
1837 but don't make them smaller. */
1838
1839static
1840regex_grow_registers (num_regs)
1841 int num_regs;
1842{
1843 if (num_regs > regs_allocated_size)
1844 {
1845 RETALLOC_IF (regstart, num_regs, const char *);
1846 RETALLOC_IF (regend, num_regs, const char *);
1847 RETALLOC_IF (old_regstart, num_regs, const char *);
1848 RETALLOC_IF (old_regend, num_regs, const char *);
1849 RETALLOC_IF (best_regstart, num_regs, const char *);
1850 RETALLOC_IF (best_regend, num_regs, const char *);
1851 RETALLOC_IF (reg_info, num_regs, register_info_type);
1852 RETALLOC_IF (reg_dummy, num_regs, const char *);
1853 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1854
1855 regs_allocated_size = num_regs;
1856 }
1857}
1858
1859#endif /* not MATCH_MAY_ALLOCATE */
David Lamparter6b0655a2014-06-04 06:53:35 +02001860
paul718e3742002-12-13 20:15:29 +00001861static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
1862 compile_stack,
1863 regnum_t regnum));
1864
1865/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1866 Returns one of error codes defined in `regex.h', or zero for success.
1867
1868 Assumes the `allocated' (and perhaps `buffer') and `translate'
1869 fields are set in BUFP on entry.
1870
1871 If it succeeds, results are put in BUFP (if it returns an error, the
1872 contents of BUFP are undefined):
1873 `buffer' is the compiled pattern;
1874 `syntax' is set to SYNTAX;
1875 `used' is set to the length of the compiled pattern;
1876 `fastmap_accurate' is zero;
1877 `re_nsub' is the number of subexpressions in PATTERN;
1878 `not_bol' and `not_eol' are zero;
1879
1880 The `fastmap' and `newline_anchor' fields are neither
1881 examined nor set. */
1882
1883/* Return, freeing storage we allocated. */
1884#define FREE_STACK_RETURN(value) \
1885 return (free (compile_stack.stack), value)
1886
1887static reg_errcode_t
1888regex_compile (pattern, size, syntax, bufp)
1889 const char *pattern;
1890 size_t size;
1891 reg_syntax_t syntax;
1892 struct re_pattern_buffer *bufp;
1893{
1894 /* We fetch characters from PATTERN here. Even though PATTERN is
1895 `char *' (i.e., signed), we declare these variables as unsigned, so
1896 they can be reliably used as array indices. */
1897 register unsigned char c, c1;
1898
1899 /* A random temporary spot in PATTERN. */
1900 const char *p1;
1901
1902 /* Points to the end of the buffer, where we should append. */
1903 register unsigned char *b;
1904
1905 /* Keeps track of unclosed groups. */
1906 compile_stack_type compile_stack;
1907
1908 /* Points to the current (ending) position in the pattern. */
1909 const char *p = pattern;
1910 const char *pend = pattern + size;
1911
1912 /* How to translate the characters in the pattern. */
1913 RE_TRANSLATE_TYPE translate = bufp->translate;
1914
1915 /* Address of the count-byte of the most recently inserted `exactn'
1916 command. This makes it possible to tell if a new exact-match
1917 character can be added to that command or if the character requires
1918 a new `exactn' command. */
1919 unsigned char *pending_exact = 0;
1920
1921 /* Address of start of the most recently finished expression.
1922 This tells, e.g., postfix * where to find the start of its
1923 operand. Reset at the beginning of groups and alternatives. */
1924 unsigned char *laststart = 0;
1925
1926 /* Address of beginning of regexp, or inside of last group. */
1927 unsigned char *begalt;
1928
1929 /* Place in the uncompiled pattern (i.e., the {) to
1930 which to go back if the interval is invalid. */
1931 const char *beg_interval;
1932
1933 /* Address of the place where a forward jump should go to the end of
1934 the containing expression. Each alternative of an `or' -- except the
1935 last -- ends with a forward jump of this sort. */
1936 unsigned char *fixup_alt_jump = 0;
1937
1938 /* Counts open-groups as they are encountered. Remembered for the
1939 matching close-group on the compile stack, so the same register
1940 number is put in the stop_memory as the start_memory. */
1941 regnum_t regnum = 0;
1942
1943#ifdef DEBUG
1944 DEBUG_PRINT1 ("\nCompiling pattern: ");
1945 if (debug)
1946 {
1947 unsigned debug_count;
1948
1949 for (debug_count = 0; debug_count < size; debug_count++)
1950 putchar (pattern[debug_count]);
1951 putchar ('\n');
1952 }
1953#endif /* DEBUG */
1954
1955 /* Initialize the compile stack. */
1956 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1957 if (compile_stack.stack == NULL)
1958 return REG_ESPACE;
1959
1960 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1961 compile_stack.avail = 0;
1962
1963 /* Initialize the pattern buffer. */
1964 bufp->syntax = syntax;
1965 bufp->fastmap_accurate = 0;
1966 bufp->not_bol = bufp->not_eol = 0;
1967
1968 /* Set `used' to zero, so that if we return an error, the pattern
1969 printer (for debugging) will think there's no pattern. We reset it
1970 at the end. */
1971 bufp->used = 0;
1972
1973 /* Always count groups, whether or not bufp->no_sub is set. */
1974 bufp->re_nsub = 0;
1975
1976#if !defined emacs && !defined SYNTAX_TABLE
1977 /* Initialize the syntax table. */
1978 init_syntax_once ();
1979#endif
1980
1981 if (bufp->allocated == 0)
1982 {
1983 if (bufp->buffer)
1984 { /* If zero allocated, but buffer is non-null, try to realloc
1985 enough space. This loses if buffer's address is bogus, but
1986 that is the user's responsibility. */
1987 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1988 }
1989 else
1990 { /* Caller did not allocate a buffer. Do it for them. */
1991 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1992 }
1993 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
1994
1995 bufp->allocated = INIT_BUF_SIZE;
1996 }
1997
1998 begalt = b = bufp->buffer;
1999
2000 /* Loop through the uncompiled pattern until we're at the end. */
2001 while (p != pend)
2002 {
2003 PATFETCH (c);
2004
2005 switch (c)
2006 {
2007 case '^':
2008 {
2009 if ( /* If at start of pattern, it's an operator. */
2010 p == pattern + 1
2011 /* If context independent, it's an operator. */
2012 || syntax & RE_CONTEXT_INDEP_ANCHORS
2013 /* Otherwise, depends on what's come before. */
2014 || at_begline_loc_p (pattern, p, syntax))
2015 BUF_PUSH (begline);
2016 else
2017 goto normal_char;
2018 }
2019 break;
2020
2021
2022 case '$':
2023 {
2024 if ( /* If at end of pattern, it's an operator. */
2025 p == pend
2026 /* If context independent, it's an operator. */
2027 || syntax & RE_CONTEXT_INDEP_ANCHORS
2028 /* Otherwise, depends on what's next. */
2029 || at_endline_loc_p (p, pend, syntax))
2030 BUF_PUSH (endline);
2031 else
2032 goto normal_char;
2033 }
2034 break;
2035
2036
2037 case '+':
2038 case '?':
2039 if ((syntax & RE_BK_PLUS_QM)
2040 || (syntax & RE_LIMITED_OPS))
2041 goto normal_char;
2042 handle_plus:
2043 case '*':
2044 /* If there is no previous pattern... */
2045 if (!laststart)
2046 {
2047 if (syntax & RE_CONTEXT_INVALID_OPS)
2048 FREE_STACK_RETURN (REG_BADRPT);
2049 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2050 goto normal_char;
2051 }
2052
2053 {
2054 /* Are we optimizing this jump? */
2055 boolean keep_string_p = false;
2056
2057 /* 1 means zero (many) matches is allowed. */
2058 char zero_times_ok = 0, many_times_ok = 0;
2059
2060 /* If there is a sequence of repetition chars, collapse it
2061 down to just one (the right one). We can't combine
2062 interval operators with these because of, e.g., `a{2}*',
2063 which should only match an even number of `a's. */
2064
2065 for (;;)
2066 {
2067 zero_times_ok |= c != '+';
2068 many_times_ok |= c != '?';
2069
2070 if (p == pend)
2071 break;
2072
2073 PATFETCH (c);
2074
2075 if (c == '*'
2076 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2077 ;
2078
2079 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2080 {
2081 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2082
2083 PATFETCH (c1);
2084 if (!(c1 == '+' || c1 == '?'))
2085 {
2086 PATUNFETCH;
2087 PATUNFETCH;
2088 break;
2089 }
2090
2091 c = c1;
2092 }
2093 else
2094 {
2095 PATUNFETCH;
2096 break;
2097 }
2098
2099 /* If we get here, we found another repeat character. */
2100 }
2101
2102 /* Star, etc. applied to an empty pattern is equivalent
2103 to an empty pattern. */
2104 if (!laststart)
2105 break;
2106
2107 /* Now we know whether or not zero matches is allowed
2108 and also whether or not two or more matches is allowed. */
2109 if (many_times_ok)
2110 { /* More than one repetition is allowed, so put in at the
2111 end a backward relative jump from `b' to before the next
2112 jump we're going to put in below (which jumps from
2113 laststart to after this jump).
2114
2115 But if we are at the `*' in the exact sequence `.*\n',
2116 insert an unconditional jump backwards to the .,
2117 instead of the beginning of the loop. This way we only
2118 push a failure point once, instead of every time
2119 through the loop. */
2120 assert (p - 1 > pattern);
2121
2122 /* Allocate the space for the jump. */
2123 GET_BUFFER_SPACE (3);
2124
2125 /* We know we are not at the first character of the pattern,
2126 because laststart was nonzero. And we've already
2127 incremented `p', by the way, to be the character after
2128 the `*'. Do we have to do something analogous here
2129 for null bytes, because of RE_DOT_NOT_NULL? */
2130 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2131 && zero_times_ok
2132 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2133 && !(syntax & RE_DOT_NEWLINE))
2134 { /* We have .*\n. */
2135 STORE_JUMP (jump, b, laststart);
2136 keep_string_p = true;
2137 }
2138 else
2139 /* Anything else. */
2140 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
2141
2142 /* We've added more stuff to the buffer. */
2143 b += 3;
2144 }
2145
2146 /* On failure, jump from laststart to b + 3, which will be the
2147 end of the buffer after this jump is inserted. */
2148 GET_BUFFER_SPACE (3);
2149 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2150 : on_failure_jump,
2151 laststart, b + 3);
2152 pending_exact = 0;
2153 b += 3;
2154
2155 if (!zero_times_ok)
2156 {
2157 /* At least one repetition is required, so insert a
2158 `dummy_failure_jump' before the initial
2159 `on_failure_jump' instruction of the loop. This
2160 effects a skip over that instruction the first time
2161 we hit that loop. */
2162 GET_BUFFER_SPACE (3);
2163 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2164 b += 3;
2165 }
2166 }
2167 break;
2168
2169
2170 case '.':
2171 laststart = b;
2172 BUF_PUSH (anychar);
2173 break;
2174
2175
2176 case '[':
2177 {
2178 boolean had_char_class = false;
2179
2180 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2181
2182 /* Ensure that we have enough space to push a charset: the
2183 opcode, the length count, and the bitset; 34 bytes in all. */
2184 GET_BUFFER_SPACE (34);
2185
2186 laststart = b;
2187
2188 /* We test `*p == '^' twice, instead of using an if
2189 statement, so we only need one BUF_PUSH. */
2190 BUF_PUSH (*p == '^' ? charset_not : charset);
2191 if (*p == '^')
2192 p++;
2193
2194 /* Remember the first position in the bracket expression. */
2195 p1 = p;
2196
2197 /* Push the number of bytes in the bitmap. */
2198 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2199
2200 /* Clear the whole map. */
pauld1724b62003-10-22 02:41:52 +00002201 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
paul718e3742002-12-13 20:15:29 +00002202
2203 /* charset_not matches newline according to a syntax bit. */
2204 if ((re_opcode_t) b[-2] == charset_not
2205 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2206 SET_LIST_BIT ('\n');
2207
2208 /* Read in characters and ranges, setting map bits. */
2209 for (;;)
2210 {
2211 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2212
2213 PATFETCH (c);
2214
2215 /* \ might escape characters inside [...] and [^...]. */
2216 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2217 {
2218 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2219
2220 PATFETCH (c1);
2221 SET_LIST_BIT (c1);
2222 continue;
2223 }
2224
2225 /* Could be the end of the bracket expression. If it's
2226 not (i.e., when the bracket expression is `[]' so
2227 far), the ']' character bit gets set way below. */
2228 if (c == ']' && p != p1 + 1)
2229 break;
2230
2231 /* Look ahead to see if it's a range when the last thing
2232 was a character class. */
2233 if (had_char_class && c == '-' && *p != ']')
2234 FREE_STACK_RETURN (REG_ERANGE);
2235
2236 /* Look ahead to see if it's a range when the last thing
2237 was a character: if this is a hyphen not at the
2238 beginning or the end of a list, then it's the range
2239 operator. */
2240 if (c == '-'
2241 && !(p - 2 >= pattern && p[-2] == '[')
2242 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2243 && *p != ']')
2244 {
2245 reg_errcode_t ret
2246 = compile_range (&p, pend, translate, syntax, b);
2247 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2248 }
2249
2250 else if (p[0] == '-' && p[1] != ']')
2251 { /* This handles ranges made up of characters only. */
2252 reg_errcode_t ret;
2253
2254 /* Move past the `-'. */
2255 PATFETCH (c1);
2256
2257 ret = compile_range (&p, pend, translate, syntax, b);
2258 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2259 }
2260
2261 /* See if we're at the beginning of a possible character
2262 class. */
2263
2264 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2265 { /* Leave room for the null. */
2266 char str[CHAR_CLASS_MAX_LENGTH + 1];
2267
2268 PATFETCH (c);
2269 c1 = 0;
2270
2271 /* If pattern is `[[:'. */
2272 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2273
2274 for (;;)
2275 {
2276 PATFETCH (c);
2277 if ((c == ':' && *p == ']') || p == pend)
2278 break;
2279 if (c1 < CHAR_CLASS_MAX_LENGTH)
2280 str[c1++] = c;
2281 else
2282 /* This is in any case an invalid class name. */
2283 str[0] = '\0';
2284 }
2285 str[c1] = '\0';
2286
2287 /* If isn't a word bracketed by `[:' and `:]':
2288 undo the ending character, the letters, and leave
2289 the leading `:' and `[' (but set bits for them). */
2290 if (c == ':' && *p == ']')
2291 {
2292#if defined _LIBC || WIDE_CHAR_SUPPORT
2293 boolean is_lower = STREQ (str, "lower");
2294 boolean is_upper = STREQ (str, "upper");
2295 wctype_t wt;
2296 int ch;
2297
2298 wt = IS_CHAR_CLASS (str);
2299 if (wt == 0)
2300 FREE_STACK_RETURN (REG_ECTYPE);
2301
2302 /* Throw away the ] at the end of the character
2303 class. */
2304 PATFETCH (c);
2305
2306 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2307
2308 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
2309 {
2310# ifdef _LIBC
2311 if (__iswctype (__btowc (ch), wt))
2312 SET_LIST_BIT (ch);
2313# else
2314 if (iswctype (btowc (ch), wt))
2315 SET_LIST_BIT (ch);
2316# endif
2317
2318 if (translate && (is_upper || is_lower)
2319 && (ISUPPER (ch) || ISLOWER (ch)))
2320 SET_LIST_BIT (ch);
2321 }
2322
2323 had_char_class = true;
2324#else
2325 int ch;
2326 boolean is_alnum = STREQ (str, "alnum");
2327 boolean is_alpha = STREQ (str, "alpha");
2328 boolean is_blank = STREQ (str, "blank");
2329 boolean is_cntrl = STREQ (str, "cntrl");
2330 boolean is_digit = STREQ (str, "digit");
2331 boolean is_graph = STREQ (str, "graph");
2332 boolean is_lower = STREQ (str, "lower");
2333 boolean is_print = STREQ (str, "print");
2334 boolean is_punct = STREQ (str, "punct");
2335 boolean is_space = STREQ (str, "space");
2336 boolean is_upper = STREQ (str, "upper");
2337 boolean is_xdigit = STREQ (str, "xdigit");
2338
2339 if (!IS_CHAR_CLASS (str))
2340 FREE_STACK_RETURN (REG_ECTYPE);
2341
2342 /* Throw away the ] at the end of the character
2343 class. */
2344 PATFETCH (c);
2345
2346 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2347
2348 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2349 {
2350 /* This was split into 3 if's to
2351 avoid an arbitrary limit in some compiler. */
2352 if ( (is_alnum && ISALNUM (ch))
2353 || (is_alpha && ISALPHA (ch))
2354 || (is_blank && ISBLANK (ch))
2355 || (is_cntrl && ISCNTRL (ch)))
2356 SET_LIST_BIT (ch);
2357 if ( (is_digit && ISDIGIT (ch))
2358 || (is_graph && ISGRAPH (ch))
2359 || (is_lower && ISLOWER (ch))
2360 || (is_print && ISPRINT (ch)))
2361 SET_LIST_BIT (ch);
2362 if ( (is_punct && ISPUNCT (ch))
2363 || (is_space && ISSPACE (ch))
2364 || (is_upper && ISUPPER (ch))
2365 || (is_xdigit && ISXDIGIT (ch)))
2366 SET_LIST_BIT (ch);
2367 if ( translate && (is_upper || is_lower)
2368 && (ISUPPER (ch) || ISLOWER (ch)))
2369 SET_LIST_BIT (ch);
2370 }
2371 had_char_class = true;
2372#endif /* libc || wctype.h */
2373 }
2374 else
2375 {
2376 c1++;
2377 while (c1--)
2378 PATUNFETCH;
2379 SET_LIST_BIT ('[');
2380 SET_LIST_BIT (':');
2381 had_char_class = false;
2382 }
2383 }
2384 else
2385 {
2386 had_char_class = false;
2387 SET_LIST_BIT (c);
2388 }
2389 }
2390
2391 /* Discard any (non)matching list bytes that are all 0 at the
2392 end of the map. Decrease the map-length byte too. */
2393 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2394 b[-1]--;
2395 b += b[-1];
2396 }
2397 break;
2398
2399
2400 case '(':
2401 if (syntax & RE_NO_BK_PARENS)
2402 goto handle_open;
2403 else
2404 goto normal_char;
2405
2406
2407 case ')':
2408 if (syntax & RE_NO_BK_PARENS)
2409 goto handle_close;
2410 else
2411 goto normal_char;
2412
2413
2414 case '\n':
2415 if (syntax & RE_NEWLINE_ALT)
2416 goto handle_alt;
2417 else
2418 goto normal_char;
2419
2420
2421 case '|':
2422 if (syntax & RE_NO_BK_VBAR)
2423 goto handle_alt;
2424 else
2425 goto normal_char;
2426
2427
2428 case '{':
2429 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2430 goto handle_interval;
2431 else
2432 goto normal_char;
2433
2434
2435 case '\\':
2436 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2437
2438 /* Do not translate the character after the \, so that we can
2439 distinguish, e.g., \B from \b, even if we normally would
2440 translate, e.g., B to b. */
2441 PATFETCH_RAW (c);
2442
2443 switch (c)
2444 {
2445 case '(':
2446 if (syntax & RE_NO_BK_PARENS)
2447 goto normal_backslash;
2448
2449 handle_open:
2450 bufp->re_nsub++;
2451 regnum++;
2452
2453 if (COMPILE_STACK_FULL)
2454 {
2455 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2456 compile_stack_elt_t);
2457 if (compile_stack.stack == NULL) return REG_ESPACE;
2458
2459 compile_stack.size <<= 1;
2460 }
2461
2462 /* These are the values to restore when we hit end of this
2463 group. They are all relative offsets, so that if the
2464 whole pattern moves because of realloc, they will still
2465 be valid. */
2466 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2467 COMPILE_STACK_TOP.fixup_alt_jump
2468 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2469 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2470 COMPILE_STACK_TOP.regnum = regnum;
2471
2472 /* We will eventually replace the 0 with the number of
2473 groups inner to this one. But do not push a
2474 start_memory for groups beyond the last one we can
2475 represent in the compiled pattern. */
2476 if (regnum <= MAX_REGNUM)
2477 {
2478 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
2479 BUF_PUSH_3 (start_memory, regnum, 0);
2480 }
2481
2482 compile_stack.avail++;
2483
2484 fixup_alt_jump = 0;
2485 laststart = 0;
2486 begalt = b;
2487 /* If we've reached MAX_REGNUM groups, then this open
2488 won't actually generate any code, so we'll have to
2489 clear pending_exact explicitly. */
2490 pending_exact = 0;
2491 break;
2492
2493
2494 case ')':
2495 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2496
2497 if (COMPILE_STACK_EMPTY)
2498 {
2499 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2500 goto normal_backslash;
2501 else
2502 FREE_STACK_RETURN (REG_ERPAREN);
2503 }
2504
2505 handle_close:
2506 if (fixup_alt_jump)
2507 { /* Push a dummy failure point at the end of the
2508 alternative for a possible future
2509 `pop_failure_jump' to pop. See comments at
2510 `push_dummy_failure' in `re_match_2'. */
2511 BUF_PUSH (push_dummy_failure);
2512
2513 /* We allocated space for this jump when we assigned
2514 to `fixup_alt_jump', in the `handle_alt' case below. */
2515 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
2516 }
2517
2518 /* See similar code for backslashed left paren above. */
2519 if (COMPILE_STACK_EMPTY)
2520 {
2521 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2522 goto normal_char;
2523 else
2524 FREE_STACK_RETURN (REG_ERPAREN);
2525 }
2526
2527 /* Since we just checked for an empty stack above, this
2528 ``can't happen''. */
2529 assert (compile_stack.avail != 0);
2530 {
2531 /* We don't just want to restore into `regnum', because
2532 later groups should continue to be numbered higher,
2533 as in `(ab)c(de)' -- the second group is #2. */
2534 regnum_t this_group_regnum;
2535
2536 compile_stack.avail--;
2537 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2538 fixup_alt_jump
2539 = COMPILE_STACK_TOP.fixup_alt_jump
2540 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2541 : 0;
2542 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2543 this_group_regnum = COMPILE_STACK_TOP.regnum;
2544 /* If we've reached MAX_REGNUM groups, then this open
2545 won't actually generate any code, so we'll have to
2546 clear pending_exact explicitly. */
2547 pending_exact = 0;
2548
2549 /* We're at the end of the group, so now we know how many
2550 groups were inside this one. */
2551 if (this_group_regnum <= MAX_REGNUM)
2552 {
2553 unsigned char *inner_group_loc
2554 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2555
2556 *inner_group_loc = regnum - this_group_regnum;
2557 BUF_PUSH_3 (stop_memory, this_group_regnum,
2558 regnum - this_group_regnum);
2559 }
2560 }
2561 break;
2562
2563
2564 case '|': /* `\|'. */
2565 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2566 goto normal_backslash;
2567 handle_alt:
2568 if (syntax & RE_LIMITED_OPS)
2569 goto normal_char;
2570
2571 /* Insert before the previous alternative a jump which
2572 jumps to this alternative if the former fails. */
2573 GET_BUFFER_SPACE (3);
2574 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2575 pending_exact = 0;
2576 b += 3;
2577
2578 /* The alternative before this one has a jump after it
2579 which gets executed if it gets matched. Adjust that
2580 jump so it will jump to this alternative's analogous
2581 jump (put in below, which in turn will jump to the next
2582 (if any) alternative's such jump, etc.). The last such
2583 jump jumps to the correct final destination. A picture:
2584 _____ _____
2585 | | | |
2586 | v | v
2587 a | b | c
2588
2589 If we are at `b', then fixup_alt_jump right now points to a
2590 three-byte space after `a'. We'll put in the jump, set
2591 fixup_alt_jump to right after `b', and leave behind three
2592 bytes which we'll fill in when we get to after `c'. */
2593
2594 if (fixup_alt_jump)
2595 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2596
2597 /* Mark and leave space for a jump after this alternative,
2598 to be filled in later either by next alternative or
2599 when know we're at the end of a series of alternatives. */
2600 fixup_alt_jump = b;
2601 GET_BUFFER_SPACE (3);
2602 b += 3;
2603
2604 laststart = 0;
2605 begalt = b;
2606 break;
2607
2608
2609 case '{':
2610 /* If \{ is a literal. */
2611 if (!(syntax & RE_INTERVALS)
2612 /* If we're at `\{' and it's not the open-interval
2613 operator. */
2614 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2615 || (p - 2 == pattern && p == pend))
2616 goto normal_backslash;
2617
2618 handle_interval:
2619 {
2620 /* If got here, then the syntax allows intervals. */
2621
2622 /* At least (most) this many matches must be made. */
2623 int lower_bound = -1, upper_bound = -1;
2624
2625 beg_interval = p - 1;
2626
2627 if (p == pend)
2628 {
2629 if (syntax & RE_NO_BK_BRACES)
2630 goto unfetch_interval;
2631 else
2632 FREE_STACK_RETURN (REG_EBRACE);
2633 }
2634
2635 GET_UNSIGNED_NUMBER (lower_bound);
2636
2637 if (c == ',')
2638 {
2639 GET_UNSIGNED_NUMBER (upper_bound);
2640 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2641 }
2642 else
2643 /* Interval such as `{1}' => match exactly once. */
2644 upper_bound = lower_bound;
2645
2646 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2647 || lower_bound > upper_bound)
2648 {
2649 if (syntax & RE_NO_BK_BRACES)
2650 goto unfetch_interval;
2651 else
2652 FREE_STACK_RETURN (REG_BADBR);
2653 }
2654
2655 if (!(syntax & RE_NO_BK_BRACES))
2656 {
2657 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2658
2659 PATFETCH (c);
2660 }
2661
2662 if (c != '}')
2663 {
2664 if (syntax & RE_NO_BK_BRACES)
2665 goto unfetch_interval;
2666 else
2667 FREE_STACK_RETURN (REG_BADBR);
2668 }
2669
2670 /* We just parsed a valid interval. */
2671
2672 /* If it's invalid to have no preceding re. */
2673 if (!laststart)
2674 {
2675 if (syntax & RE_CONTEXT_INVALID_OPS)
2676 FREE_STACK_RETURN (REG_BADRPT);
2677 else if (syntax & RE_CONTEXT_INDEP_OPS)
2678 laststart = b;
2679 else
2680 goto unfetch_interval;
2681 }
2682
2683 /* If the upper bound is zero, don't want to succeed at
2684 all; jump from `laststart' to `b + 3', which will be
2685 the end of the buffer after we insert the jump. */
2686 if (upper_bound == 0)
2687 {
2688 GET_BUFFER_SPACE (3);
2689 INSERT_JUMP (jump, laststart, b + 3);
2690 b += 3;
2691 }
2692
2693 /* Otherwise, we have a nontrivial interval. When
2694 we're all done, the pattern will look like:
2695 set_number_at <jump count> <upper bound>
2696 set_number_at <succeed_n count> <lower bound>
2697 succeed_n <after jump addr> <succeed_n count>
2698 <body of loop>
2699 jump_n <succeed_n addr> <jump count>
2700 (The upper bound and `jump_n' are omitted if
2701 `upper_bound' is 1, though.) */
2702 else
2703 { /* If the upper bound is > 1, we need to insert
2704 more at the end of the loop. */
2705 unsigned nbytes = 10 + (upper_bound > 1) * 10;
2706
2707 GET_BUFFER_SPACE (nbytes);
2708
2709 /* Initialize lower bound of the `succeed_n', even
2710 though it will be set during matching by its
2711 attendant `set_number_at' (inserted next),
2712 because `re_compile_fastmap' needs to know.
2713 Jump to the `jump_n' we might insert below. */
2714 INSERT_JUMP2 (succeed_n, laststart,
2715 b + 5 + (upper_bound > 1) * 5,
2716 lower_bound);
2717 b += 5;
2718
2719 /* Code to initialize the lower bound. Insert
2720 before the `succeed_n'. The `5' is the last two
2721 bytes of this `set_number_at', plus 3 bytes of
2722 the following `succeed_n'. */
2723 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2724 b += 5;
2725
2726 if (upper_bound > 1)
2727 { /* More than one repetition is allowed, so
2728 append a backward jump to the `succeed_n'
2729 that starts this interval.
2730
2731 When we've reached this during matching,
2732 we'll have matched the interval once, so
2733 jump back only `upper_bound - 1' times. */
2734 STORE_JUMP2 (jump_n, b, laststart + 5,
2735 upper_bound - 1);
2736 b += 5;
2737
2738 /* The location we want to set is the second
2739 parameter of the `jump_n'; that is `b-2' as
2740 an absolute address. `laststart' will be
2741 the `set_number_at' we're about to insert;
2742 `laststart+3' the number to set, the source
2743 for the relative address. But we are
2744 inserting into the middle of the pattern --
2745 so everything is getting moved up by 5.
2746 Conclusion: (b - 2) - (laststart + 3) + 5,
2747 i.e., b - laststart.
2748
2749 We insert this at the beginning of the loop
2750 so that if we fail during matching, we'll
2751 reinitialize the bounds. */
2752 insert_op2 (set_number_at, laststart, b - laststart,
2753 upper_bound - 1, b);
2754 b += 5;
2755 }
2756 }
2757 pending_exact = 0;
2758 beg_interval = NULL;
2759 }
2760 break;
2761
2762 unfetch_interval:
2763 /* If an invalid interval, match the characters as literals. */
2764 assert (beg_interval);
2765 p = beg_interval;
2766 beg_interval = NULL;
2767
2768 /* normal_char and normal_backslash need `c'. */
2769 PATFETCH (c);
2770
2771 if (!(syntax & RE_NO_BK_BRACES))
2772 {
2773 if (p > pattern && p[-1] == '\\')
2774 goto normal_backslash;
2775 }
2776 goto normal_char;
2777
2778#ifdef emacs
2779 /* There is no way to specify the before_dot and after_dot
2780 operators. rms says this is ok. --karl */
2781 case '=':
2782 BUF_PUSH (at_dot);
2783 break;
2784
2785 case 's':
2786 laststart = b;
2787 PATFETCH (c);
2788 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2789 break;
2790
2791 case 'S':
2792 laststart = b;
2793 PATFETCH (c);
2794 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2795 break;
2796#endif /* emacs */
2797
2798
2799 case 'w':
2800 if (syntax & RE_NO_GNU_OPS)
2801 goto normal_char;
2802 laststart = b;
2803 BUF_PUSH (wordchar);
2804 break;
2805
2806
2807 case 'W':
2808 if (syntax & RE_NO_GNU_OPS)
2809 goto normal_char;
2810 laststart = b;
2811 BUF_PUSH (notwordchar);
2812 break;
2813
2814
2815 case '<':
2816 if (syntax & RE_NO_GNU_OPS)
2817 goto normal_char;
2818 BUF_PUSH (wordbeg);
2819 break;
2820
2821 case '>':
2822 if (syntax & RE_NO_GNU_OPS)
2823 goto normal_char;
2824 BUF_PUSH (wordend);
2825 break;
2826
2827 case 'b':
2828 if (syntax & RE_NO_GNU_OPS)
2829 goto normal_char;
2830 BUF_PUSH (wordbound);
2831 break;
2832
2833 case 'B':
2834 if (syntax & RE_NO_GNU_OPS)
2835 goto normal_char;
2836 BUF_PUSH (notwordbound);
2837 break;
2838
2839 case '`':
2840 if (syntax & RE_NO_GNU_OPS)
2841 goto normal_char;
2842 BUF_PUSH (begbuf);
2843 break;
2844
2845 case '\'':
2846 if (syntax & RE_NO_GNU_OPS)
2847 goto normal_char;
2848 BUF_PUSH (endbuf);
2849 break;
2850
2851 case '1': case '2': case '3': case '4': case '5':
2852 case '6': case '7': case '8': case '9':
2853 if (syntax & RE_NO_BK_REFS)
2854 goto normal_char;
2855
2856 c1 = c - '0';
2857
2858 if (c1 > regnum)
2859 FREE_STACK_RETURN (REG_ESUBREG);
2860
2861 /* Can't back reference to a subexpression if inside of it. */
2862 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
2863 goto normal_char;
2864
2865 laststart = b;
2866 BUF_PUSH_2 (duplicate, c1);
2867 break;
2868
2869
2870 case '+':
2871 case '?':
2872 if (syntax & RE_BK_PLUS_QM)
2873 goto handle_plus;
2874 else
2875 goto normal_backslash;
2876
2877 default:
2878 normal_backslash:
2879 /* You might think it would be useful for \ to mean
2880 not to translate; but if we don't translate it
2881 it will never match anything. */
2882 c = TRANSLATE (c);
2883 goto normal_char;
2884 }
2885 break;
2886
2887
2888 default:
2889 /* Expects the character in `c'. */
2890 normal_char:
2891 /* If no exactn currently being built. */
2892 if (!pending_exact
2893
2894 /* If last exactn not at current position. */
2895 || pending_exact + *pending_exact + 1 != b
2896
2897 /* We have only one byte following the exactn for the count. */
2898 || *pending_exact == (1 << BYTEWIDTH) - 1
2899
2900 /* If followed by a repetition operator. */
2901 || *p == '*' || *p == '^'
2902 || ((syntax & RE_BK_PLUS_QM)
2903 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
2904 : (*p == '+' || *p == '?'))
2905 || ((syntax & RE_INTERVALS)
2906 && ((syntax & RE_NO_BK_BRACES)
2907 ? *p == '{'
2908 : (p[0] == '\\' && p[1] == '{'))))
2909 {
2910 /* Start building a new exactn. */
2911
2912 laststart = b;
2913
2914 BUF_PUSH_2 (exactn, 0);
2915 pending_exact = b - 1;
2916 }
2917
2918 BUF_PUSH (c);
2919 (*pending_exact)++;
2920 break;
2921 } /* switch (c) */
2922 } /* while p != pend */
2923
2924
2925 /* Through the pattern now. */
2926
2927 if (fixup_alt_jump)
2928 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2929
2930 if (!COMPILE_STACK_EMPTY)
2931 FREE_STACK_RETURN (REG_EPAREN);
2932
2933 /* If we don't want backtracking, force success
2934 the first time we reach the end of the compiled pattern. */
2935 if (syntax & RE_NO_POSIX_BACKTRACKING)
2936 BUF_PUSH (succeed);
2937
2938 free (compile_stack.stack);
2939
2940 /* We have succeeded; set the length of the buffer. */
2941 bufp->used = b - bufp->buffer;
2942
2943#ifdef DEBUG
2944 if (debug)
2945 {
2946 DEBUG_PRINT1 ("\nCompiled pattern: \n");
2947 print_compiled_pattern (bufp);
2948 }
2949#endif /* DEBUG */
2950
2951#ifndef MATCH_MAY_ALLOCATE
2952 /* Initialize the failure stack to the largest possible stack. This
2953 isn't necessary unless we're trying to avoid calling alloca in
2954 the search and match routines. */
2955 {
2956 int num_regs = bufp->re_nsub + 1;
2957
2958 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
2959 is strictly greater than re_max_failures, the largest possible stack
2960 is 2 * re_max_failures failure points. */
2961 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
2962 {
2963 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
2964
2965# ifdef emacs
2966 if (! fail_stack.stack)
2967 fail_stack.stack
2968 = (fail_stack_elt_t *) xmalloc (fail_stack.size
2969 * sizeof (fail_stack_elt_t));
2970 else
2971 fail_stack.stack
2972 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
2973 (fail_stack.size
2974 * sizeof (fail_stack_elt_t)));
2975# else /* not emacs */
2976 if (! fail_stack.stack)
2977 fail_stack.stack
2978 = (fail_stack_elt_t *) malloc (fail_stack.size
2979 * sizeof (fail_stack_elt_t));
2980 else
2981 fail_stack.stack
2982 = (fail_stack_elt_t *) realloc (fail_stack.stack,
2983 (fail_stack.size
2984 * sizeof (fail_stack_elt_t)));
2985# endif /* not emacs */
2986 }
2987
2988 regex_grow_registers (num_regs);
2989 }
2990#endif /* not MATCH_MAY_ALLOCATE */
2991
2992 return REG_NOERROR;
2993} /* regex_compile */
David Lamparter6b0655a2014-06-04 06:53:35 +02002994
paul718e3742002-12-13 20:15:29 +00002995/* Subroutines for `regex_compile'. */
2996
2997/* Store OP at LOC followed by two-byte integer parameter ARG. */
2998
2999static void
3000store_op1 (op, loc, arg)
3001 re_opcode_t op;
3002 unsigned char *loc;
3003 int arg;
3004{
3005 *loc = (unsigned char) op;
3006 STORE_NUMBER (loc + 1, arg);
3007}
3008
3009
3010/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3011
3012static void
3013store_op2 (op, loc, arg1, arg2)
3014 re_opcode_t op;
3015 unsigned char *loc;
3016 int arg1, arg2;
3017{
3018 *loc = (unsigned char) op;
3019 STORE_NUMBER (loc + 1, arg1);
3020 STORE_NUMBER (loc + 3, arg2);
3021}
3022
3023
3024/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3025 for OP followed by two-byte integer parameter ARG. */
3026
3027static void
3028insert_op1 (op, loc, arg, end)
3029 re_opcode_t op;
3030 unsigned char *loc;
3031 int arg;
3032 unsigned char *end;
3033{
3034 register unsigned char *pfrom = end;
3035 register unsigned char *pto = end + 3;
3036
3037 while (pfrom != loc)
3038 *--pto = *--pfrom;
3039
3040 store_op1 (op, loc, arg);
3041}
3042
3043
3044/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3045
3046static void
3047insert_op2 (op, loc, arg1, arg2, end)
3048 re_opcode_t op;
3049 unsigned char *loc;
3050 int arg1, arg2;
3051 unsigned char *end;
3052{
3053 register unsigned char *pfrom = end;
3054 register unsigned char *pto = end + 5;
3055
3056 while (pfrom != loc)
3057 *--pto = *--pfrom;
3058
3059 store_op2 (op, loc, arg1, arg2);
3060}
3061
3062
3063/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3064 after an alternative or a begin-subexpression. We assume there is at
3065 least one character before the ^. */
3066
3067static boolean
3068at_begline_loc_p (pattern, p, syntax)
3069 const char *pattern, *p;
3070 reg_syntax_t syntax;
3071{
3072 const char *prev = p - 2;
3073 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3074
3075 return
3076 /* After a subexpression? */
3077 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3078 /* After an alternative? */
3079 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3080}
3081
3082
3083/* The dual of at_begline_loc_p. This one is for $. We assume there is
3084 at least one character after the $, i.e., `P < PEND'. */
3085
3086static boolean
3087at_endline_loc_p (p, pend, syntax)
3088 const char *p, *pend;
3089 reg_syntax_t syntax;
3090{
3091 const char *next = p;
3092 boolean next_backslash = *next == '\\';
3093 const char *next_next = p + 1 < pend ? p + 1 : 0;
3094
3095 return
3096 /* Before a subexpression? */
3097 (syntax & RE_NO_BK_PARENS ? *next == ')'
3098 : next_backslash && next_next && *next_next == ')')
3099 /* Before an alternative? */
3100 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3101 : next_backslash && next_next && *next_next == '|');
3102}
3103
3104
3105/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3106 false if it's not. */
3107
3108static boolean
3109group_in_compile_stack (compile_stack, regnum)
3110 compile_stack_type compile_stack;
3111 regnum_t regnum;
3112{
3113 int this_element;
3114
3115 for (this_element = compile_stack.avail - 1;
3116 this_element >= 0;
3117 this_element--)
3118 if (compile_stack.stack[this_element].regnum == regnum)
3119 return true;
3120
3121 return false;
3122}
3123
3124
3125/* Read the ending character of a range (in a bracket expression) from the
3126 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3127 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3128 Then we set the translation of all bits between the starting and
3129 ending characters (inclusive) in the compiled pattern B.
3130
3131 Return an error code.
3132
3133 We use these short variable names so we can use the same macros as
3134 `regex_compile' itself. */
3135
3136static reg_errcode_t
3137compile_range (p_ptr, pend, translate, syntax, b)
3138 const char **p_ptr, *pend;
3139 RE_TRANSLATE_TYPE translate;
3140 reg_syntax_t syntax;
3141 unsigned char *b;
3142{
3143 unsigned this_char;
3144
3145 const char *p = *p_ptr;
3146 unsigned int range_start, range_end;
3147
3148 if (p == pend)
3149 return REG_ERANGE;
3150
3151 /* Even though the pattern is a signed `char *', we need to fetch
3152 with unsigned char *'s; if the high bit of the pattern character
3153 is set, the range endpoints will be negative if we fetch using a
3154 signed char *.
3155
3156 We also want to fetch the endpoints without translating them; the
3157 appropriate translation is done in the bit-setting loop below. */
3158 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3159 range_start = ((const unsigned char *) p)[-2];
3160 range_end = ((const unsigned char *) p)[0];
3161
3162 /* Have to increment the pointer into the pattern string, so the
3163 caller isn't still at the ending character. */
3164 (*p_ptr)++;
3165
3166 /* If the start is after the end, the range is empty. */
3167 if (range_start > range_end)
3168 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3169
3170 /* Here we see why `this_char' has to be larger than an `unsigned
3171 char' -- the range is inclusive, so if `range_end' == 0xff
3172 (assuming 8-bit characters), we would otherwise go into an infinite
3173 loop, since all characters <= 0xff. */
3174 for (this_char = range_start; this_char <= range_end; this_char++)
3175 {
3176 SET_LIST_BIT (TRANSLATE (this_char));
3177 }
3178
3179 return REG_NOERROR;
3180}
David Lamparter6b0655a2014-06-04 06:53:35 +02003181
paul718e3742002-12-13 20:15:29 +00003182/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3183 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3184 characters can start a string that matches the pattern. This fastmap
3185 is used by re_search to skip quickly over impossible starting points.
3186
3187 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3188 area as BUFP->fastmap.
3189
3190 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3191 the pattern buffer.
3192
3193 Returns 0 if we succeed, -2 if an internal error. */
3194
3195int
3196re_compile_fastmap (bufp)
3197 struct re_pattern_buffer *bufp;
3198{
3199 int j, k;
3200#ifdef MATCH_MAY_ALLOCATE
3201 fail_stack_type fail_stack;
3202#endif
3203#ifndef REGEX_MALLOC
3204 char *destination;
3205#endif
3206
3207 register char *fastmap = bufp->fastmap;
3208 unsigned char *pattern = bufp->buffer;
3209 unsigned char *p = pattern;
3210 register unsigned char *pend = pattern + bufp->used;
3211
3212#ifdef REL_ALLOC
3213 /* This holds the pointer to the failure stack, when
3214 it is allocated relocatably. */
3215 fail_stack_elt_t *failure_stack_ptr;
3216#endif
3217
3218 /* Assume that each path through the pattern can be null until
3219 proven otherwise. We set this false at the bottom of switch
3220 statement, to which we get only if a particular path doesn't
3221 match the empty string. */
3222 boolean path_can_be_null = true;
3223
3224 /* We aren't doing a `succeed_n' to begin with. */
3225 boolean succeed_n_p = false;
3226
3227 assert (fastmap != NULL && p != NULL);
3228
3229 INIT_FAIL_STACK ();
pauld1724b62003-10-22 02:41:52 +00003230 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
paul718e3742002-12-13 20:15:29 +00003231 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3232 bufp->can_be_null = 0;
3233
3234 while (1)
3235 {
3236 if (p == pend || *p == succeed)
3237 {
3238 /* We have reached the (effective) end of pattern. */
3239 if (!FAIL_STACK_EMPTY ())
3240 {
3241 bufp->can_be_null |= path_can_be_null;
3242
3243 /* Reset for next path. */
3244 path_can_be_null = true;
3245
3246 p = fail_stack.stack[--fail_stack.avail].pointer;
3247
3248 continue;
3249 }
3250 else
3251 break;
3252 }
3253
3254 /* We should never be about to go beyond the end of the pattern. */
3255 assert (p < pend);
3256
3257 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3258 {
3259
3260 /* I guess the idea here is to simply not bother with a fastmap
3261 if a backreference is used, since it's too hard to figure out
3262 the fastmap for the corresponding group. Setting
3263 `can_be_null' stops `re_search_2' from using the fastmap, so
3264 that is all we do. */
3265 case duplicate:
3266 bufp->can_be_null = 1;
3267 goto done;
3268
3269
3270 /* Following are the cases which match a character. These end
3271 with `break'. */
3272
3273 case exactn:
3274 fastmap[p[1]] = 1;
3275 break;
3276
3277
3278 case charset:
3279 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3280 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3281 fastmap[j] = 1;
3282 break;
3283
3284
3285 case charset_not:
3286 /* Chars beyond end of map must be allowed. */
3287 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3288 fastmap[j] = 1;
3289
3290 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3291 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3292 fastmap[j] = 1;
3293 break;
3294
3295
3296 case wordchar:
3297 for (j = 0; j < (1 << BYTEWIDTH); j++)
3298 if (SYNTAX (j) == Sword)
3299 fastmap[j] = 1;
3300 break;
3301
3302
3303 case notwordchar:
3304 for (j = 0; j < (1 << BYTEWIDTH); j++)
3305 if (SYNTAX (j) != Sword)
3306 fastmap[j] = 1;
3307 break;
3308
3309
3310 case anychar:
3311 {
3312 int fastmap_newline = fastmap['\n'];
3313
3314 /* `.' matches anything ... */
3315 for (j = 0; j < (1 << BYTEWIDTH); j++)
3316 fastmap[j] = 1;
3317
3318 /* ... except perhaps newline. */
3319 if (!(bufp->syntax & RE_DOT_NEWLINE))
3320 fastmap['\n'] = fastmap_newline;
3321
3322 /* Return if we have already set `can_be_null'; if we have,
3323 then the fastmap is irrelevant. Something's wrong here. */
3324 else if (bufp->can_be_null)
3325 goto done;
3326
3327 /* Otherwise, have to check alternative paths. */
3328 break;
3329 }
3330
3331#ifdef emacs
3332 case syntaxspec:
3333 k = *p++;
3334 for (j = 0; j < (1 << BYTEWIDTH); j++)
3335 if (SYNTAX (j) == (enum syntaxcode) k)
3336 fastmap[j] = 1;
3337 break;
3338
3339
3340 case notsyntaxspec:
3341 k = *p++;
3342 for (j = 0; j < (1 << BYTEWIDTH); j++)
3343 if (SYNTAX (j) != (enum syntaxcode) k)
3344 fastmap[j] = 1;
3345 break;
3346
3347
3348 /* All cases after this match the empty string. These end with
3349 `continue'. */
3350
3351
3352 case before_dot:
3353 case at_dot:
3354 case after_dot:
3355 continue;
3356#endif /* emacs */
3357
3358
3359 case no_op:
3360 case begline:
3361 case endline:
3362 case begbuf:
3363 case endbuf:
3364 case wordbound:
3365 case notwordbound:
3366 case wordbeg:
3367 case wordend:
3368 case push_dummy_failure:
3369 continue;
3370
3371
3372 case jump_n:
3373 case pop_failure_jump:
3374 case maybe_pop_jump:
3375 case jump:
3376 case jump_past_alt:
3377 case dummy_failure_jump:
3378 EXTRACT_NUMBER_AND_INCR (j, p);
3379 p += j;
3380 if (j > 0)
3381 continue;
3382
3383 /* Jump backward implies we just went through the body of a
3384 loop and matched nothing. Opcode jumped to should be
3385 `on_failure_jump' or `succeed_n'. Just treat it like an
3386 ordinary jump. For a * loop, it has pushed its failure
3387 point already; if so, discard that as redundant. */
3388 if ((re_opcode_t) *p != on_failure_jump
3389 && (re_opcode_t) *p != succeed_n)
3390 continue;
3391
3392 p++;
3393 EXTRACT_NUMBER_AND_INCR (j, p);
3394 p += j;
3395
3396 /* If what's on the stack is where we are now, pop it. */
3397 if (!FAIL_STACK_EMPTY ()
3398 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3399 fail_stack.avail--;
3400
3401 continue;
3402
3403
3404 case on_failure_jump:
3405 case on_failure_keep_string_jump:
3406 handle_on_failure_jump:
3407 EXTRACT_NUMBER_AND_INCR (j, p);
3408
3409 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3410 end of the pattern. We don't want to push such a point,
3411 since when we restore it above, entering the switch will
3412 increment `p' past the end of the pattern. We don't need
3413 to push such a point since we obviously won't find any more
3414 fastmap entries beyond `pend'. Such a pattern can match
3415 the null string, though. */
3416 if (p + j < pend)
3417 {
3418 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3419 {
3420 RESET_FAIL_STACK ();
3421 return -2;
3422 }
3423 }
3424 else
3425 bufp->can_be_null = 1;
3426
3427 if (succeed_n_p)
3428 {
3429 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3430 succeed_n_p = false;
3431 }
3432
3433 continue;
3434
3435
3436 case succeed_n:
3437 /* Get to the number of times to succeed. */
3438 p += 2;
3439
3440 /* Increment p past the n for when k != 0. */
3441 EXTRACT_NUMBER_AND_INCR (k, p);
3442 if (k == 0)
3443 {
3444 p -= 4;
3445 succeed_n_p = true; /* Spaghetti code alert. */
3446 goto handle_on_failure_jump;
3447 }
3448 continue;
3449
3450
3451 case set_number_at:
3452 p += 4;
3453 continue;
3454
3455
3456 case start_memory:
3457 case stop_memory:
3458 p += 2;
3459 continue;
3460
3461
3462 default:
3463 abort (); /* We have listed all the cases. */
3464 } /* switch *p++ */
3465
3466 /* Getting here means we have found the possible starting
3467 characters for one path of the pattern -- and that the empty
3468 string does not match. We need not follow this path further.
3469 Instead, look at the next alternative (remembered on the
3470 stack), or quit if no more. The test at the top of the loop
3471 does these things. */
3472 path_can_be_null = false;
3473 p = pend;
3474 } /* while p */
3475
3476 /* Set `can_be_null' for the last path (also the first path, if the
3477 pattern is empty). */
3478 bufp->can_be_null |= path_can_be_null;
3479
3480 done:
3481 RESET_FAIL_STACK ();
3482 return 0;
3483} /* re_compile_fastmap */
3484#ifdef _LIBC
3485weak_alias (__re_compile_fastmap, re_compile_fastmap)
3486#endif
David Lamparter6b0655a2014-06-04 06:53:35 +02003487
paul718e3742002-12-13 20:15:29 +00003488/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3489 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3490 this memory for recording register information. STARTS and ENDS
3491 must be allocated using the malloc library routine, and must each
3492 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3493
3494 If NUM_REGS == 0, then subsequent matches should allocate their own
3495 register data.
3496
3497 Unless this function is called, the first search or match using
3498 PATTERN_BUFFER will allocate its own register data, without
3499 freeing the old data. */
3500
3501void
3502re_set_registers (bufp, regs, num_regs, starts, ends)
3503 struct re_pattern_buffer *bufp;
3504 struct re_registers *regs;
3505 unsigned num_regs;
3506 regoff_t *starts, *ends;
3507{
3508 if (num_regs)
3509 {
3510 bufp->regs_allocated = REGS_REALLOCATE;
3511 regs->num_regs = num_regs;
3512 regs->start = starts;
3513 regs->end = ends;
3514 }
3515 else
3516 {
3517 bufp->regs_allocated = REGS_UNALLOCATED;
3518 regs->num_regs = 0;
3519 regs->start = regs->end = (regoff_t *) 0;
3520 }
3521}
3522#ifdef _LIBC
3523weak_alias (__re_set_registers, re_set_registers)
3524#endif
David Lamparter6b0655a2014-06-04 06:53:35 +02003525
paul718e3742002-12-13 20:15:29 +00003526/* Searching routines. */
3527
3528/* Like re_search_2, below, but only one string is specified, and
3529 doesn't let you say where to stop matching. */
3530
3531int
3532re_search (bufp, string, size, startpos, range, regs)
3533 struct re_pattern_buffer *bufp;
3534 const char *string;
3535 int size, startpos, range;
3536 struct re_registers *regs;
3537{
3538 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3539 regs, size);
3540}
3541#ifdef _LIBC
3542weak_alias (__re_search, re_search)
3543#endif
3544
3545
3546/* Using the compiled pattern in BUFP->buffer, first tries to match the
3547 virtual concatenation of STRING1 and STRING2, starting first at index
3548 STARTPOS, then at STARTPOS + 1, and so on.
3549
3550 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3551
3552 RANGE is how far to scan while trying to match. RANGE = 0 means try
3553 only at STARTPOS; in general, the last start tried is STARTPOS +
3554 RANGE.
3555
3556 In REGS, return the indices of the virtual concatenation of STRING1
3557 and STRING2 that matched the entire BUFP->buffer and its contained
3558 subexpressions.
3559
3560 Do not consider matching one past the index STOP in the virtual
3561 concatenation of STRING1 and STRING2.
3562
3563 We return either the position in the strings at which the match was
3564 found, -1 if no match, or -2 if error (such as failure
3565 stack overflow). */
3566
3567int
3568re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
3569 struct re_pattern_buffer *bufp;
3570 const char *string1, *string2;
3571 int size1, size2;
3572 int startpos;
3573 int range;
3574 struct re_registers *regs;
3575 int stop;
3576{
3577 int val;
3578 register char *fastmap = bufp->fastmap;
3579 register RE_TRANSLATE_TYPE translate = bufp->translate;
3580 int total_size = size1 + size2;
3581 int endpos = startpos + range;
3582
3583 /* Check for out-of-range STARTPOS. */
3584 if (startpos < 0 || startpos > total_size)
3585 return -1;
3586
3587 /* Fix up RANGE if it might eventually take us outside
3588 the virtual concatenation of STRING1 and STRING2.
3589 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3590 if (endpos < 0)
3591 range = 0 - startpos;
3592 else if (endpos > total_size)
3593 range = total_size - startpos;
3594
3595 /* If the search isn't to be a backwards one, don't waste time in a
3596 search for a pattern that must be anchored. */
3597 if (bufp->used > 0 && range > 0
3598 && ((re_opcode_t) bufp->buffer[0] == begbuf
3599 /* `begline' is like `begbuf' if it cannot match at newlines. */
3600 || ((re_opcode_t) bufp->buffer[0] == begline
3601 && !bufp->newline_anchor)))
3602 {
3603 if (startpos > 0)
3604 return -1;
3605 else
3606 range = 1;
3607 }
3608
3609#ifdef emacs
3610 /* In a forward search for something that starts with \=.
3611 don't keep searching past point. */
3612 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3613 {
3614 range = PT - startpos;
3615 if (range <= 0)
3616 return -1;
3617 }
3618#endif /* emacs */
3619
3620 /* Update the fastmap now if not correct already. */
3621 if (fastmap && !bufp->fastmap_accurate)
3622 if (re_compile_fastmap (bufp) == -2)
3623 return -2;
3624
3625 /* Loop through the string, looking for a place to start matching. */
3626 for (;;)
3627 {
3628 /* If a fastmap is supplied, skip quickly over characters that
3629 cannot be the start of a match. If the pattern can match the
3630 null string, however, we don't need to skip characters; we want
3631 the first null string. */
3632 if (fastmap && startpos < total_size && !bufp->can_be_null)
3633 {
3634 if (range > 0) /* Searching forwards. */
3635 {
3636 register const char *d;
3637 register int lim = 0;
3638 int irange = range;
3639
3640 if (startpos < size1 && startpos + range >= size1)
3641 lim = range - (size1 - startpos);
3642
3643 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
3644
3645 /* Written out as an if-else to avoid testing `translate'
3646 inside the loop. */
3647 if (translate)
3648 while (range > lim
3649 && !fastmap[(unsigned char)
3650 translate[(unsigned char) *d++]])
3651 range--;
3652 else
3653 while (range > lim && !fastmap[(unsigned char) *d++])
3654 range--;
3655
3656 startpos += irange - range;
3657 }
3658 else /* Searching backwards. */
3659 {
3660 register char c = (size1 == 0 || startpos >= size1
3661 ? string2[startpos - size1]
3662 : string1[startpos]);
3663
3664 if (!fastmap[(unsigned char) TRANSLATE (c)])
3665 goto advance;
3666 }
3667 }
3668
3669 /* If can't match the null string, and that's all we have left, fail. */
3670 if (range >= 0 && startpos == total_size && fastmap
3671 && !bufp->can_be_null)
3672 return -1;
3673
3674 val = re_match_2_internal (bufp, string1, size1, string2, size2,
3675 startpos, regs, stop);
3676#ifndef REGEX_MALLOC
3677# ifdef C_ALLOCA
3678 alloca (0);
3679# endif
3680#endif
3681
3682 if (val >= 0)
3683 return startpos;
3684
3685 if (val == -2)
3686 return -2;
3687
3688 advance:
3689 if (!range)
3690 break;
3691 else if (range > 0)
3692 {
3693 range--;
3694 startpos++;
3695 }
3696 else
3697 {
3698 range++;
3699 startpos--;
3700 }
3701 }
3702 return -1;
3703} /* re_search_2 */
3704#ifdef _LIBC
3705weak_alias (__re_search_2, re_search_2)
3706#endif
David Lamparter6b0655a2014-06-04 06:53:35 +02003707
paul718e3742002-12-13 20:15:29 +00003708/* This converts PTR, a pointer into one of the search strings `string1'
3709 and `string2' into an offset from the beginning of that string. */
3710#define POINTER_TO_OFFSET(ptr) \
3711 (FIRST_STRING_P (ptr) \
3712 ? ((regoff_t) ((ptr) - string1)) \
3713 : ((regoff_t) ((ptr) - string2 + size1)))
3714
3715/* Macros for dealing with the split strings in re_match_2. */
3716
3717#define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3718
3719/* Call before fetching a character with *d. This switches over to
3720 string2 if necessary. */
3721#define PREFETCH() \
3722 while (d == dend) \
3723 { \
3724 /* End of string2 => fail. */ \
3725 if (dend == end_match_2) \
3726 goto fail; \
3727 /* End of string1 => advance to string2. */ \
3728 d = string2; \
3729 dend = end_match_2; \
3730 }
3731
3732
3733/* Test if at very beginning or at very end of the virtual concatenation
3734 of `string1' and `string2'. If only one string, it's `string2'. */
3735#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3736#define AT_STRINGS_END(d) ((d) == end2)
3737
3738
3739/* Test if D points to a character which is word-constituent. We have
3740 two special cases to check for: if past the end of string1, look at
3741 the first character in string2; and if before the beginning of
3742 string2, look at the last character in string1. */
3743#define WORDCHAR_P(d) \
3744 (SYNTAX ((d) == end1 ? *string2 \
3745 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3746 == Sword)
3747
3748/* Disabled due to a compiler bug -- see comment at case wordbound */
3749#if 0
3750/* Test if the character before D and the one at D differ with respect
3751 to being word-constituent. */
3752#define AT_WORD_BOUNDARY(d) \
3753 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3754 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3755#endif
3756
3757/* Free everything we malloc. */
3758#ifdef MATCH_MAY_ALLOCATE
3759# define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
3760# define FREE_VARIABLES() \
3761 do { \
3762 REGEX_FREE_STACK (fail_stack.stack); \
3763 FREE_VAR (regstart); \
3764 FREE_VAR (regend); \
3765 FREE_VAR (old_regstart); \
3766 FREE_VAR (old_regend); \
3767 FREE_VAR (best_regstart); \
3768 FREE_VAR (best_regend); \
3769 FREE_VAR (reg_info); \
3770 FREE_VAR (reg_dummy); \
3771 FREE_VAR (reg_info_dummy); \
3772 } while (0)
3773#else
3774# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
3775#endif /* not MATCH_MAY_ALLOCATE */
3776
3777/* These values must meet several constraints. They must not be valid
3778 register values; since we have a limit of 255 registers (because
3779 we use only one byte in the pattern for the register number), we can
3780 use numbers larger than 255. They must differ by 1, because of
3781 NUM_FAILURE_ITEMS above. And the value for the lowest register must
3782 be larger than the value for the highest register, so we do not try
3783 to actually save any registers when none are active. */
3784#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3785#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
David Lamparter6b0655a2014-06-04 06:53:35 +02003786
paul718e3742002-12-13 20:15:29 +00003787/* Matching routines. */
3788
3789#ifndef emacs /* Emacs never uses this. */
3790/* re_match is like re_match_2 except it takes only a single string. */
3791
3792int
3793re_match (bufp, string, size, pos, regs)
3794 struct re_pattern_buffer *bufp;
3795 const char *string;
3796 int size, pos;
3797 struct re_registers *regs;
3798{
3799 int result = re_match_2_internal (bufp, NULL, 0, string, size,
3800 pos, regs, size);
3801# ifndef REGEX_MALLOC
3802# ifdef C_ALLOCA
3803 alloca (0);
3804# endif
3805# endif
3806 return result;
3807}
3808# ifdef _LIBC
3809weak_alias (__re_match, re_match)
3810# endif
3811#endif /* not emacs */
3812
3813static boolean group_match_null_string_p _RE_ARGS ((unsigned char **p,
3814 unsigned char *end,
3815 register_info_type *reg_info));
3816static boolean alt_match_null_string_p _RE_ARGS ((unsigned char *p,
3817 unsigned char *end,
3818 register_info_type *reg_info));
3819static boolean common_op_match_null_string_p _RE_ARGS ((unsigned char **p,
3820 unsigned char *end,
3821 register_info_type *reg_info));
3822static int bcmp_translate _RE_ARGS ((const char *s1, const char *s2,
3823 int len, char *translate));
3824
3825/* re_match_2 matches the compiled pattern in BUFP against the
3826 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3827 and SIZE2, respectively). We start matching at POS, and stop
3828 matching at STOP.
3829
3830 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
3831 store offsets for the substring each group matched in REGS. See the
3832 documentation for exactly how many groups we fill.
3833
3834 We return -1 if no match, -2 if an internal error (such as the
3835 failure stack overflowing). Otherwise, we return the length of the
3836 matched substring. */
3837
3838int
3839re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
3840 struct re_pattern_buffer *bufp;
3841 const char *string1, *string2;
3842 int size1, size2;
3843 int pos;
3844 struct re_registers *regs;
3845 int stop;
3846{
3847 int result = re_match_2_internal (bufp, string1, size1, string2, size2,
3848 pos, regs, stop);
3849#ifndef REGEX_MALLOC
3850# ifdef C_ALLOCA
3851 alloca (0);
3852# endif
3853#endif
3854 return result;
3855}
3856#ifdef _LIBC
3857weak_alias (__re_match_2, re_match_2)
3858#endif
3859
3860/* This is a separate function so that we can force an alloca cleanup
3861 afterwards. */
3862static int
3863re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
3864 struct re_pattern_buffer *bufp;
3865 const char *string1, *string2;
3866 int size1, size2;
3867 int pos;
3868 struct re_registers *regs;
3869 int stop;
3870{
3871 /* General temporaries. */
3872 int mcnt;
3873 unsigned char *p1;
3874
3875 /* Just past the end of the corresponding string. */
3876 const char *end1, *end2;
3877
3878 /* Pointers into string1 and string2, just past the last characters in
3879 each to consider matching. */
3880 const char *end_match_1, *end_match_2;
3881
3882 /* Where we are in the data, and the end of the current string. */
3883 const char *d, *dend;
3884
3885 /* Where we are in the pattern, and the end of the pattern. */
3886 unsigned char *p = bufp->buffer;
3887 register unsigned char *pend = p + bufp->used;
3888
3889 /* Mark the opcode just after a start_memory, so we can test for an
3890 empty subpattern when we get to the stop_memory. */
3891 unsigned char *just_past_start_mem = 0;
3892
3893 /* We use this to map every character in the string. */
3894 RE_TRANSLATE_TYPE translate = bufp->translate;
3895
3896 /* Failure point stack. Each place that can handle a failure further
3897 down the line pushes a failure point on this stack. It consists of
3898 restart, regend, and reg_info for all registers corresponding to
3899 the subexpressions we're currently inside, plus the number of such
3900 registers, and, finally, two char *'s. The first char * is where
3901 to resume scanning the pattern; the second one is where to resume
3902 scanning the strings. If the latter is zero, the failure point is
3903 a ``dummy''; if a failure happens and the failure point is a dummy,
3904 it gets discarded and the next next one is tried. */
3905#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
3906 fail_stack_type fail_stack;
3907#endif
3908#ifdef DEBUG
3909 static unsigned failure_id;
3910 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3911#endif
3912
3913#ifdef REL_ALLOC
3914 /* This holds the pointer to the failure stack, when
3915 it is allocated relocatably. */
3916 fail_stack_elt_t *failure_stack_ptr;
3917#endif
3918
3919 /* We fill all the registers internally, independent of what we
3920 return, for use in backreferences. The number here includes
3921 an element for register zero. */
3922 size_t num_regs = bufp->re_nsub + 1;
3923
3924 /* The currently active registers. */
3925 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3926 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3927
3928 /* Information on the contents of registers. These are pointers into
3929 the input strings; they record just what was matched (on this
3930 attempt) by a subexpression part of the pattern, that is, the
3931 regnum-th regstart pointer points to where in the pattern we began
3932 matching and the regnum-th regend points to right after where we
3933 stopped matching the regnum-th subexpression. (The zeroth register
3934 keeps track of what the whole pattern matches.) */
3935#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3936 const char **regstart, **regend;
3937#endif
3938
3939 /* If a group that's operated upon by a repetition operator fails to
3940 match anything, then the register for its start will need to be
3941 restored because it will have been set to wherever in the string we
3942 are when we last see its open-group operator. Similarly for a
3943 register's end. */
3944#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3945 const char **old_regstart, **old_regend;
3946#endif
3947
3948 /* The is_active field of reg_info helps us keep track of which (possibly
3949 nested) subexpressions we are currently in. The matched_something
3950 field of reg_info[reg_num] helps us tell whether or not we have
3951 matched any of the pattern so far this time through the reg_num-th
3952 subexpression. These two fields get reset each time through any
3953 loop their register is in. */
3954#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
3955 register_info_type *reg_info;
3956#endif
3957
3958 /* The following record the register info as found in the above
3959 variables when we find a match better than any we've seen before.
3960 This happens as we backtrack through the failure points, which in
3961 turn happens only if we have not yet matched the entire string. */
3962 unsigned best_regs_set = false;
3963#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3964 const char **best_regstart, **best_regend;
3965#endif
3966
3967 /* Logically, this is `best_regend[0]'. But we don't want to have to
3968 allocate space for that if we're not allocating space for anything
3969 else (see below). Also, we never need info about register 0 for
3970 any of the other register vectors, and it seems rather a kludge to
3971 treat `best_regend' differently than the rest. So we keep track of
3972 the end of the best match so far in a separate variable. We
3973 initialize this to NULL so that when we backtrack the first time
3974 and need to test it, it's not garbage. */
3975 const char *match_end = NULL;
3976
3977 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
3978 int set_regs_matched_done = 0;
3979
3980 /* Used when we pop values we don't care about. */
3981#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3982 const char **reg_dummy;
3983 register_info_type *reg_info_dummy;
3984#endif
3985
3986#ifdef DEBUG
3987 /* Counts the total number of registers pushed. */
3988 unsigned num_regs_pushed = 0;
3989#endif
3990
3991 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3992
3993 INIT_FAIL_STACK ();
3994
3995#ifdef MATCH_MAY_ALLOCATE
3996 /* Do not bother to initialize all the register variables if there are
3997 no groups in the pattern, as it takes a fair amount of time. If
3998 there are groups, we include space for register 0 (the whole
3999 pattern), even though we never use it, since it simplifies the
4000 array indexing. We should fix this. */
4001 if (bufp->re_nsub)
4002 {
4003 regstart = REGEX_TALLOC (num_regs, const char *);
4004 regend = REGEX_TALLOC (num_regs, const char *);
4005 old_regstart = REGEX_TALLOC (num_regs, const char *);
4006 old_regend = REGEX_TALLOC (num_regs, const char *);
4007 best_regstart = REGEX_TALLOC (num_regs, const char *);
4008 best_regend = REGEX_TALLOC (num_regs, const char *);
4009 reg_info = REGEX_TALLOC (num_regs, register_info_type);
4010 reg_dummy = REGEX_TALLOC (num_regs, const char *);
4011 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
4012
4013 if (!(regstart && regend && old_regstart && old_regend && reg_info
4014 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
4015 {
4016 FREE_VARIABLES ();
4017 return -2;
4018 }
4019 }
4020 else
4021 {
4022 /* We must initialize all our variables to NULL, so that
4023 `FREE_VARIABLES' doesn't try to free them. */
4024 regstart = regend = old_regstart = old_regend = best_regstart
4025 = best_regend = reg_dummy = NULL;
4026 reg_info = reg_info_dummy = (register_info_type *) NULL;
4027 }
4028#endif /* MATCH_MAY_ALLOCATE */
4029
4030 /* The starting position is bogus. */
4031 if (pos < 0 || pos > size1 + size2)
4032 {
4033 FREE_VARIABLES ();
4034 return -1;
4035 }
4036
4037 /* Initialize subexpression text positions to -1 to mark ones that no
4038 start_memory/stop_memory has been seen for. Also initialize the
4039 register information struct. */
4040 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
4041 {
4042 regstart[mcnt] = regend[mcnt]
4043 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4044
4045 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
4046 IS_ACTIVE (reg_info[mcnt]) = 0;
4047 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4048 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4049 }
4050
4051 /* We move `string1' into `string2' if the latter's empty -- but not if
4052 `string1' is null. */
4053 if (size2 == 0 && string1 != NULL)
4054 {
4055 string2 = string1;
4056 size2 = size1;
4057 string1 = 0;
4058 size1 = 0;
4059 }
4060 end1 = string1 + size1;
4061 end2 = string2 + size2;
4062
4063 /* Compute where to stop matching, within the two strings. */
4064 if (stop <= size1)
4065 {
4066 end_match_1 = string1 + stop;
4067 end_match_2 = string2;
4068 }
4069 else
4070 {
4071 end_match_1 = end1;
4072 end_match_2 = string2 + stop - size1;
4073 }
4074
4075 /* `p' scans through the pattern as `d' scans through the data.
4076 `dend' is the end of the input string that `d' points within. `d'
4077 is advanced into the following input string whenever necessary, but
4078 this happens before fetching; therefore, at the beginning of the
4079 loop, `d' can be pointing at the end of a string, but it cannot
4080 equal `string2'. */
4081 if (size1 > 0 && pos <= size1)
4082 {
4083 d = string1 + pos;
4084 dend = end_match_1;
4085 }
4086 else
4087 {
4088 d = string2 + pos - size1;
4089 dend = end_match_2;
4090 }
4091
4092 DEBUG_PRINT1 ("The compiled pattern is:\n");
4093 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4094 DEBUG_PRINT1 ("The string to match is: `");
4095 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4096 DEBUG_PRINT1 ("'\n");
4097
4098 /* This loops over pattern commands. It exits by returning from the
4099 function if the match is complete, or it drops through if the match
4100 fails at this starting point in the input data. */
4101 for (;;)
4102 {
4103#ifdef _LIBC
4104 DEBUG_PRINT2 ("\n%p: ", p);
4105#else
4106 DEBUG_PRINT2 ("\n0x%x: ", p);
4107#endif
4108
4109 if (p == pend)
4110 { /* End of pattern means we might have succeeded. */
4111 DEBUG_PRINT1 ("end of pattern ... ");
4112
4113 /* If we haven't matched the entire string, and we want the
4114 longest match, try backtracking. */
4115 if (d != end_match_2)
4116 {
4117 /* 1 if this match ends in the same string (string1 or string2)
4118 as the best previous match. */
4119 boolean same_str_p = (FIRST_STRING_P (match_end)
4120 == MATCHING_IN_FIRST_STRING);
4121 /* 1 if this match is the best seen so far. */
4122 boolean best_match_p;
4123
4124 /* AIX compiler got confused when this was combined
4125 with the previous declaration. */
4126 if (same_str_p)
4127 best_match_p = d > match_end;
4128 else
4129 best_match_p = !MATCHING_IN_FIRST_STRING;
4130
4131 DEBUG_PRINT1 ("backtracking.\n");
4132
4133 if (!FAIL_STACK_EMPTY ())
4134 { /* More failure points to try. */
4135
4136 /* If exceeds best match so far, save it. */
4137 if (!best_regs_set || best_match_p)
4138 {
4139 best_regs_set = true;
4140 match_end = d;
4141
4142 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4143
4144 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
4145 {
4146 best_regstart[mcnt] = regstart[mcnt];
4147 best_regend[mcnt] = regend[mcnt];
4148 }
4149 }
4150 goto fail;
4151 }
4152
4153 /* If no failure points, don't restore garbage. And if
4154 last match is real best match, don't restore second
4155 best one. */
4156 else if (best_regs_set && !best_match_p)
4157 {
4158 restore_best_regs:
4159 /* Restore best match. It may happen that `dend ==
4160 end_match_1' while the restored d is in string2.
4161 For example, the pattern `x.*y.*z' against the
4162 strings `x-' and `y-z-', if the two strings are
4163 not consecutive in memory. */
4164 DEBUG_PRINT1 ("Restoring best registers.\n");
4165
4166 d = match_end;
4167 dend = ((d >= string1 && d <= end1)
4168 ? end_match_1 : end_match_2);
4169
4170 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
4171 {
4172 regstart[mcnt] = best_regstart[mcnt];
4173 regend[mcnt] = best_regend[mcnt];
4174 }
4175 }
4176 } /* d != end_match_2 */
4177
4178 succeed_label:
4179 DEBUG_PRINT1 ("Accepting match.\n");
4180
4181 /* If caller wants register contents data back, do it. */
4182 if (regs && !bufp->no_sub)
4183 {
4184 /* Have the register data arrays been allocated? */
4185 if (bufp->regs_allocated == REGS_UNALLOCATED)
4186 { /* No. So allocate them with malloc. We need one
4187 extra element beyond `num_regs' for the `-1' marker
4188 GNU code uses. */
4189 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4190 regs->start = TALLOC (regs->num_regs, regoff_t);
4191 regs->end = TALLOC (regs->num_regs, regoff_t);
4192 if (regs->start == NULL || regs->end == NULL)
4193 {
4194 FREE_VARIABLES ();
4195 return -2;
4196 }
4197 bufp->regs_allocated = REGS_REALLOCATE;
4198 }
4199 else if (bufp->regs_allocated == REGS_REALLOCATE)
4200 { /* Yes. If we need more elements than were already
4201 allocated, reallocate them. If we need fewer, just
4202 leave it alone. */
4203 if (regs->num_regs < num_regs + 1)
4204 {
4205 regs->num_regs = num_regs + 1;
4206 RETALLOC (regs->start, regs->num_regs, regoff_t);
4207 RETALLOC (regs->end, regs->num_regs, regoff_t);
4208 if (regs->start == NULL || regs->end == NULL)
4209 {
4210 FREE_VARIABLES ();
4211 return -2;
4212 }
4213 }
4214 }
4215 else
4216 {
4217 /* These braces fend off a "empty body in an else-statement"
4218 warning under GCC when assert expands to nothing. */
4219 assert (bufp->regs_allocated == REGS_FIXED);
4220 }
4221
4222 /* Convert the pointer data in `regstart' and `regend' to
4223 indices. Register zero has to be set differently,
4224 since we haven't kept track of any info for it. */
4225 if (regs->num_regs > 0)
4226 {
4227 regs->start[0] = pos;
4228 regs->end[0] = (MATCHING_IN_FIRST_STRING
4229 ? ((regoff_t) (d - string1))
4230 : ((regoff_t) (d - string2 + size1)));
4231 }
4232
4233 /* Go through the first `min (num_regs, regs->num_regs)'
4234 registers, since that is all we initialized. */
4235 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
4236 mcnt++)
4237 {
4238 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4239 regs->start[mcnt] = regs->end[mcnt] = -1;
4240 else
4241 {
4242 regs->start[mcnt]
4243 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4244 regs->end[mcnt]
4245 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4246 }
4247 }
4248
4249 /* If the regs structure we return has more elements than
4250 were in the pattern, set the extra elements to -1. If
4251 we (re)allocated the registers, this is the case,
4252 because we always allocate enough to have at least one
4253 -1 at the end. */
4254 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
4255 regs->start[mcnt] = regs->end[mcnt] = -1;
4256 } /* regs && !bufp->no_sub */
4257
4258 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4259 nfailure_points_pushed, nfailure_points_popped,
4260 nfailure_points_pushed - nfailure_points_popped);
4261 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4262
4263 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4264 ? string1
4265 : string2 - size1);
4266
4267 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4268
4269 FREE_VARIABLES ();
4270 return mcnt;
4271 }
4272
4273 /* Otherwise match next pattern command. */
4274 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4275 {
4276 /* Ignore these. Used to ignore the n of succeed_n's which
4277 currently have n == 0. */
4278 case no_op:
4279 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4280 break;
4281
4282 case succeed:
4283 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4284 goto succeed_label;
4285
4286 /* Match the next n pattern characters exactly. The following
4287 byte in the pattern defines n, and the n bytes after that
4288 are the characters to match. */
4289 case exactn:
4290 mcnt = *p++;
4291 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4292
4293 /* This is written out as an if-else so we don't waste time
4294 testing `translate' inside the loop. */
4295 if (translate)
4296 {
4297 do
4298 {
4299 PREFETCH ();
4300 if ((unsigned char) translate[(unsigned char) *d++]
4301 != (unsigned char) *p++)
4302 goto fail;
4303 }
4304 while (--mcnt);
4305 }
4306 else
4307 {
4308 do
4309 {
4310 PREFETCH ();
4311 if (*d++ != (char) *p++) goto fail;
4312 }
4313 while (--mcnt);
4314 }
4315 SET_REGS_MATCHED ();
4316 break;
4317
4318
4319 /* Match any character except possibly a newline or a null. */
4320 case anychar:
4321 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4322
4323 PREFETCH ();
4324
4325 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
4326 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
4327 goto fail;
4328
4329 SET_REGS_MATCHED ();
4330 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4331 d++;
4332 break;
4333
4334
4335 case charset:
4336 case charset_not:
4337 {
4338 register unsigned char c;
4339 boolean not = (re_opcode_t) *(p - 1) == charset_not;
4340
4341 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4342
4343 PREFETCH ();
4344 c = TRANSLATE (*d); /* The character to match. */
4345
4346 /* Cast to `unsigned' instead of `unsigned char' in case the
4347 bit list is a full 32 bytes long. */
4348 if (c < (unsigned) (*p * BYTEWIDTH)
4349 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4350 not = !not;
4351
4352 p += 1 + *p;
4353
4354 if (!not) goto fail;
4355
4356 SET_REGS_MATCHED ();
4357 d++;
4358 break;
4359 }
4360
4361
4362 /* The beginning of a group is represented by start_memory.
4363 The arguments are the register number in the next byte, and the
4364 number of groups inner to this one in the next. The text
4365 matched within the group is recorded (in the internal
4366 registers data structure) under the register number. */
4367 case start_memory:
4368 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4369
4370 /* Find out if this group can match the empty string. */
4371 p1 = p; /* To send to group_match_null_string_p. */
4372
4373 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4374 REG_MATCH_NULL_STRING_P (reg_info[*p])
4375 = group_match_null_string_p (&p1, pend, reg_info);
4376
4377 /* Save the position in the string where we were the last time
4378 we were at this open-group operator in case the group is
4379 operated upon by a repetition operator, e.g., with `(a*)*b'
4380 against `ab'; then we want to ignore where we are now in
4381 the string in case this attempt to match fails. */
4382 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4383 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4384 : regstart[*p];
4385 DEBUG_PRINT2 (" old_regstart: %d\n",
4386 POINTER_TO_OFFSET (old_regstart[*p]));
4387
4388 regstart[*p] = d;
4389 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4390
4391 IS_ACTIVE (reg_info[*p]) = 1;
4392 MATCHED_SOMETHING (reg_info[*p]) = 0;
4393
4394 /* Clear this whenever we change the register activity status. */
4395 set_regs_matched_done = 0;
4396
4397 /* This is the new highest active register. */
4398 highest_active_reg = *p;
4399
4400 /* If nothing was active before, this is the new lowest active
4401 register. */
4402 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4403 lowest_active_reg = *p;
4404
4405 /* Move past the register number and inner group count. */
4406 p += 2;
4407 just_past_start_mem = p;
4408
4409 break;
4410
4411
4412 /* The stop_memory opcode represents the end of a group. Its
4413 arguments are the same as start_memory's: the register
4414 number, and the number of inner groups. */
4415 case stop_memory:
4416 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4417
4418 /* We need to save the string position the last time we were at
4419 this close-group operator in case the group is operated
4420 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4421 against `aba'; then we want to ignore where we are now in
4422 the string in case this attempt to match fails. */
4423 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4424 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4425 : regend[*p];
4426 DEBUG_PRINT2 (" old_regend: %d\n",
4427 POINTER_TO_OFFSET (old_regend[*p]));
4428
4429 regend[*p] = d;
4430 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4431
4432 /* This register isn't active anymore. */
4433 IS_ACTIVE (reg_info[*p]) = 0;
4434
4435 /* Clear this whenever we change the register activity status. */
4436 set_regs_matched_done = 0;
4437
4438 /* If this was the only register active, nothing is active
4439 anymore. */
4440 if (lowest_active_reg == highest_active_reg)
4441 {
4442 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4443 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4444 }
4445 else
4446 { /* We must scan for the new highest active register, since
4447 it isn't necessarily one less than now: consider
4448 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
4449 new highest active register is 1. */
4450 unsigned char r = *p - 1;
4451 while (r > 0 && !IS_ACTIVE (reg_info[r]))
4452 r--;
4453
4454 /* If we end up at register zero, that means that we saved
4455 the registers as the result of an `on_failure_jump', not
4456 a `start_memory', and we jumped to past the innermost
4457 `stop_memory'. For example, in ((.)*) we save
4458 registers 1 and 2 as a result of the *, but when we pop
4459 back to the second ), we are at the stop_memory 1.
4460 Thus, nothing is active. */
4461 if (r == 0)
4462 {
4463 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4464 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4465 }
4466 else
4467 highest_active_reg = r;
4468 }
4469
4470 /* If just failed to match something this time around with a
4471 group that's operated on by a repetition operator, try to
4472 force exit from the ``loop'', and restore the register
4473 information for this group that we had before trying this
4474 last match. */
4475 if ((!MATCHED_SOMETHING (reg_info[*p])
4476 || just_past_start_mem == p - 1)
4477 && (p + 2) < pend)
4478 {
4479 boolean is_a_jump_n = false;
4480
4481 p1 = p + 2;
4482 mcnt = 0;
4483 switch ((re_opcode_t) *p1++)
4484 {
4485 case jump_n:
4486 is_a_jump_n = true;
4487 case pop_failure_jump:
4488 case maybe_pop_jump:
4489 case jump:
4490 case dummy_failure_jump:
4491 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4492 if (is_a_jump_n)
4493 p1 += 2;
4494 break;
4495
4496 default:
4497 /* do nothing */ ;
4498 }
4499 p1 += mcnt;
4500
4501 /* If the next operation is a jump backwards in the pattern
4502 to an on_failure_jump right before the start_memory
4503 corresponding to this stop_memory, exit from the loop
4504 by forcing a failure after pushing on the stack the
4505 on_failure_jump's jump in the pattern, and d. */
4506 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
4507 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
4508 {
4509 /* If this group ever matched anything, then restore
4510 what its registers were before trying this last
4511 failed match, e.g., with `(a*)*b' against `ab' for
4512 regstart[1], and, e.g., with `((a*)*(b*)*)*'
4513 against `aba' for regend[3].
4514
4515 Also restore the registers for inner groups for,
4516 e.g., `((a*)(b*))*' against `aba' (register 3 would
4517 otherwise get trashed). */
4518
4519 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
4520 {
4521 unsigned r;
4522
4523 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
4524
4525 /* Restore this and inner groups' (if any) registers. */
4526 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
4527 r++)
4528 {
4529 regstart[r] = old_regstart[r];
4530
4531 /* xx why this test? */
4532 if (old_regend[r] >= regstart[r])
4533 regend[r] = old_regend[r];
4534 }
4535 }
4536 p1++;
4537 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4538 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
4539
4540 goto fail;
4541 }
4542 }
4543
4544 /* Move past the register number and the inner group count. */
4545 p += 2;
4546 break;
4547
4548
4549 /* \<digit> has been turned into a `duplicate' command which is
4550 followed by the numeric value of <digit> as the register number. */
4551 case duplicate:
4552 {
4553 register const char *d2, *dend2;
4554 int regno = *p++; /* Get which register to match against. */
4555 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
4556
4557 /* Can't back reference a group which we've never matched. */
4558 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
4559 goto fail;
4560
4561 /* Where in input to try to start matching. */
4562 d2 = regstart[regno];
4563
4564 /* Where to stop matching; if both the place to start and
4565 the place to stop matching are in the same string, then
4566 set to the place to stop, otherwise, for now have to use
4567 the end of the first string. */
4568
4569 dend2 = ((FIRST_STRING_P (regstart[regno])
4570 == FIRST_STRING_P (regend[regno]))
4571 ? regend[regno] : end_match_1);
4572 for (;;)
4573 {
4574 /* If necessary, advance to next segment in register
4575 contents. */
4576 while (d2 == dend2)
4577 {
4578 if (dend2 == end_match_2) break;
4579 if (dend2 == regend[regno]) break;
4580
4581 /* End of string1 => advance to string2. */
4582 d2 = string2;
4583 dend2 = regend[regno];
4584 }
4585 /* At end of register contents => success */
4586 if (d2 == dend2) break;
4587
4588 /* If necessary, advance to next segment in data. */
4589 PREFETCH ();
4590
4591 /* How many characters left in this segment to match. */
4592 mcnt = dend - d;
4593
4594 /* Want how many consecutive characters we can match in
4595 one shot, so, if necessary, adjust the count. */
4596 if (mcnt > dend2 - d2)
4597 mcnt = dend2 - d2;
4598
4599 /* Compare that many; failure if mismatch, else move
4600 past them. */
4601 if (translate
4602 ? bcmp_translate (d, d2, mcnt, translate)
4603 : memcmp (d, d2, mcnt))
4604 goto fail;
4605 d += mcnt, d2 += mcnt;
4606
4607 /* Do this because we've match some characters. */
4608 SET_REGS_MATCHED ();
4609 }
4610 }
4611 break;
4612
4613
4614 /* begline matches the empty string at the beginning of the string
4615 (unless `not_bol' is set in `bufp'), and, if
4616 `newline_anchor' is set, after newlines. */
4617 case begline:
4618 DEBUG_PRINT1 ("EXECUTING begline.\n");
4619
4620 if (AT_STRINGS_BEG (d))
4621 {
4622 if (!bufp->not_bol) break;
4623 }
4624 else if (d[-1] == '\n' && bufp->newline_anchor)
4625 {
4626 break;
4627 }
4628 /* In all other cases, we fail. */
4629 goto fail;
4630
4631
4632 /* endline is the dual of begline. */
4633 case endline:
4634 DEBUG_PRINT1 ("EXECUTING endline.\n");
4635
4636 if (AT_STRINGS_END (d))
4637 {
4638 if (!bufp->not_eol) break;
4639 }
4640
4641 /* We have to ``prefetch'' the next character. */
4642 else if ((d == end1 ? *string2 : *d) == '\n'
4643 && bufp->newline_anchor)
4644 {
4645 break;
4646 }
4647 goto fail;
4648
4649
4650 /* Match at the very beginning of the data. */
4651 case begbuf:
4652 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
4653 if (AT_STRINGS_BEG (d))
4654 break;
4655 goto fail;
4656
4657
4658 /* Match at the very end of the data. */
4659 case endbuf:
4660 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
4661 if (AT_STRINGS_END (d))
4662 break;
4663 goto fail;
4664
4665
4666 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
4667 pushes NULL as the value for the string on the stack. Then
4668 `pop_failure_point' will keep the current value for the
4669 string, instead of restoring it. To see why, consider
4670 matching `foo\nbar' against `.*\n'. The .* matches the foo;
4671 then the . fails against the \n. But the next thing we want
4672 to do is match the \n against the \n; if we restored the
4673 string value, we would be back at the foo.
4674
4675 Because this is used only in specific cases, we don't need to
4676 check all the things that `on_failure_jump' does, to make
4677 sure the right things get saved on the stack. Hence we don't
4678 share its code. The only reason to push anything on the
4679 stack at all is that otherwise we would have to change
4680 `anychar's code to do something besides goto fail in this
4681 case; that seems worse than this. */
4682 case on_failure_keep_string_jump:
4683 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
4684
4685 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4686#ifdef _LIBC
4687 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
4688#else
4689 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
4690#endif
4691
4692 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
4693 break;
4694
4695
4696 /* Uses of on_failure_jump:
4697
4698 Each alternative starts with an on_failure_jump that points
4699 to the beginning of the next alternative. Each alternative
4700 except the last ends with a jump that in effect jumps past
4701 the rest of the alternatives. (They really jump to the
4702 ending jump of the following alternative, because tensioning
4703 these jumps is a hassle.)
4704
4705 Repeats start with an on_failure_jump that points past both
4706 the repetition text and either the following jump or
4707 pop_failure_jump back to this on_failure_jump. */
4708 case on_failure_jump:
4709 on_failure:
4710 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
4711
4712 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4713#ifdef _LIBC
4714 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
4715#else
4716 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
4717#endif
4718
4719 /* If this on_failure_jump comes right before a group (i.e.,
4720 the original * applied to a group), save the information
4721 for that group and all inner ones, so that if we fail back
4722 to this point, the group's information will be correct.
4723 For example, in \(a*\)*\1, we need the preceding group,
4724 and in \(zz\(a*\)b*\)\2, we need the inner group. */
4725
4726 /* We can't use `p' to check ahead because we push
4727 a failure point to `p + mcnt' after we do this. */
4728 p1 = p;
4729
4730 /* We need to skip no_op's before we look for the
4731 start_memory in case this on_failure_jump is happening as
4732 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
4733 against aba. */
4734 while (p1 < pend && (re_opcode_t) *p1 == no_op)
4735 p1++;
4736
4737 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
4738 {
4739 /* We have a new highest active register now. This will
4740 get reset at the start_memory we are about to get to,
4741 but we will have saved all the registers relevant to
4742 this repetition op, as described above. */
4743 highest_active_reg = *(p1 + 1) + *(p1 + 2);
4744 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4745 lowest_active_reg = *(p1 + 1);
4746 }
4747
4748 DEBUG_PRINT1 (":\n");
4749 PUSH_FAILURE_POINT (p + mcnt, d, -2);
4750 break;
4751
4752
4753 /* A smart repeat ends with `maybe_pop_jump'.
4754 We change it to either `pop_failure_jump' or `jump'. */
4755 case maybe_pop_jump:
4756 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4757 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
4758 {
4759 register unsigned char *p2 = p;
4760
4761 /* Compare the beginning of the repeat with what in the
4762 pattern follows its end. If we can establish that there
4763 is nothing that they would both match, i.e., that we
4764 would have to backtrack because of (as in, e.g., `a*a')
4765 then we can change to pop_failure_jump, because we'll
4766 never have to backtrack.
4767
4768 This is not true in the case of alternatives: in
4769 `(a|ab)*' we do need to backtrack to the `ab' alternative
4770 (e.g., if the string was `ab'). But instead of trying to
4771 detect that here, the alternative has put on a dummy
4772 failure point which is what we will end up popping. */
4773
4774 /* Skip over open/close-group commands.
4775 If what follows this loop is a ...+ construct,
4776 look at what begins its body, since we will have to
4777 match at least one of that. */
4778 while (1)
4779 {
4780 if (p2 + 2 < pend
4781 && ((re_opcode_t) *p2 == stop_memory
4782 || (re_opcode_t) *p2 == start_memory))
4783 p2 += 3;
4784 else if (p2 + 6 < pend
4785 && (re_opcode_t) *p2 == dummy_failure_jump)
4786 p2 += 6;
4787 else
4788 break;
4789 }
4790
4791 p1 = p + mcnt;
4792 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4793 to the `maybe_finalize_jump' of this case. Examine what
4794 follows. */
4795
4796 /* If we're at the end of the pattern, we can change. */
4797 if (p2 == pend)
4798 {
4799 /* Consider what happens when matching ":\(.*\)"
4800 against ":/". I don't really understand this code
4801 yet. */
4802 p[-3] = (unsigned char) pop_failure_jump;
4803 DEBUG_PRINT1
4804 (" End of pattern: change to `pop_failure_jump'.\n");
4805 }
4806
4807 else if ((re_opcode_t) *p2 == exactn
4808 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4809 {
4810 register unsigned char c
4811 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4812
4813 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4814 {
4815 p[-3] = (unsigned char) pop_failure_jump;
4816 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4817 c, p1[5]);
4818 }
4819
4820 else if ((re_opcode_t) p1[3] == charset
4821 || (re_opcode_t) p1[3] == charset_not)
4822 {
4823 int not = (re_opcode_t) p1[3] == charset_not;
4824
4825 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4826 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4827 not = !not;
4828
4829 /* `not' is equal to 1 if c would match, which means
4830 that we can't change to pop_failure_jump. */
4831 if (!not)
4832 {
4833 p[-3] = (unsigned char) pop_failure_jump;
4834 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4835 }
4836 }
4837 }
4838 else if ((re_opcode_t) *p2 == charset)
4839 {
4840#ifdef DEBUG
4841 register unsigned char c
4842 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4843#endif
4844
4845#if 0
4846 if ((re_opcode_t) p1[3] == exactn
4847 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
4848 && (p2[2 + p1[5] / BYTEWIDTH]
4849 & (1 << (p1[5] % BYTEWIDTH)))))
4850#else
4851 if ((re_opcode_t) p1[3] == exactn
4852 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4]
4853 && (p2[2 + p1[4] / BYTEWIDTH]
4854 & (1 << (p1[4] % BYTEWIDTH)))))
4855#endif
4856 {
4857 p[-3] = (unsigned char) pop_failure_jump;
4858 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4859 c, p1[5]);
4860 }
4861
4862 else if ((re_opcode_t) p1[3] == charset_not)
4863 {
4864 int idx;
4865 /* We win if the charset_not inside the loop
4866 lists every character listed in the charset after. */
4867 for (idx = 0; idx < (int) p2[1]; idx++)
4868 if (! (p2[2 + idx] == 0
4869 || (idx < (int) p1[4]
4870 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
4871 break;
4872
4873 if (idx == p2[1])
4874 {
4875 p[-3] = (unsigned char) pop_failure_jump;
4876 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4877 }
4878 }
4879 else if ((re_opcode_t) p1[3] == charset)
4880 {
4881 int idx;
4882 /* We win if the charset inside the loop
4883 has no overlap with the one after the loop. */
4884 for (idx = 0;
4885 idx < (int) p2[1] && idx < (int) p1[4];
4886 idx++)
4887 if ((p2[2 + idx] & p1[5 + idx]) != 0)
4888 break;
4889
4890 if (idx == p2[1] || idx == p1[4])
4891 {
4892 p[-3] = (unsigned char) pop_failure_jump;
4893 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4894 }
4895 }
4896 }
4897 }
4898 p -= 2; /* Point at relative address again. */
4899 if ((re_opcode_t) p[-1] != pop_failure_jump)
4900 {
4901 p[-1] = (unsigned char) jump;
4902 DEBUG_PRINT1 (" Match => jump.\n");
4903 goto unconditional_jump;
4904 }
4905 /* Note fall through. */
4906
4907
4908 /* The end of a simple repeat has a pop_failure_jump back to
4909 its matching on_failure_jump, where the latter will push a
4910 failure point. The pop_failure_jump takes off failure
4911 points put on by this pop_failure_jump's matching
4912 on_failure_jump; we got through the pattern to here from the
4913 matching on_failure_jump, so didn't fail. */
4914 case pop_failure_jump:
4915 {
4916 /* We need to pass separate storage for the lowest and
4917 highest registers, even though we don't care about the
4918 actual values. Otherwise, we will restore only one
4919 register from the stack, since lowest will == highest in
4920 `pop_failure_point'. */
4921 active_reg_t dummy_low_reg, dummy_high_reg;
4922 unsigned char *pdummy;
4923 const char *sdummy;
4924
4925 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4926 POP_FAILURE_POINT (sdummy, pdummy,
4927 dummy_low_reg, dummy_high_reg,
4928 reg_dummy, reg_dummy, reg_info_dummy);
4929 }
4930 /* Note fall through. */
4931
4932 unconditional_jump:
4933#ifdef _LIBC
4934 DEBUG_PRINT2 ("\n%p: ", p);
4935#else
4936 DEBUG_PRINT2 ("\n0x%x: ", p);
4937#endif
4938 /* Note fall through. */
4939
4940 /* Unconditionally jump (without popping any failure points). */
4941 case jump:
4942 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4943 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4944 p += mcnt; /* Do the jump. */
4945#ifdef _LIBC
4946 DEBUG_PRINT2 ("(to %p).\n", p);
4947#else
4948 DEBUG_PRINT2 ("(to 0x%x).\n", p);
4949#endif
4950 break;
4951
4952
4953 /* We need this opcode so we can detect where alternatives end
4954 in `group_match_null_string_p' et al. */
4955 case jump_past_alt:
4956 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4957 goto unconditional_jump;
4958
4959
4960 /* Normally, the on_failure_jump pushes a failure point, which
4961 then gets popped at pop_failure_jump. We will end up at
4962 pop_failure_jump, also, and with a pattern of, say, `a+', we
4963 are skipping over the on_failure_jump, so we have to push
4964 something meaningless for pop_failure_jump to pop. */
4965 case dummy_failure_jump:
4966 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4967 /* It doesn't matter what we push for the string here. What
4968 the code at `fail' tests is the value for the pattern. */
4969 PUSH_FAILURE_POINT (NULL, NULL, -2);
4970 goto unconditional_jump;
4971
4972
4973 /* At the end of an alternative, we need to push a dummy failure
4974 point in case we are followed by a `pop_failure_jump', because
4975 we don't want the failure point for the alternative to be
4976 popped. For example, matching `(a|ab)*' against `aab'
4977 requires that we match the `ab' alternative. */
4978 case push_dummy_failure:
4979 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4980 /* See comments just above at `dummy_failure_jump' about the
4981 two zeroes. */
4982 PUSH_FAILURE_POINT (NULL, NULL, -2);
4983 break;
4984
4985 /* Have to succeed matching what follows at least n times.
4986 After that, handle like `on_failure_jump'. */
4987 case succeed_n:
4988 EXTRACT_NUMBER (mcnt, p + 2);
4989 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4990
4991 assert (mcnt >= 0);
4992 /* Originally, this is how many times we HAVE to succeed. */
4993 if (mcnt > 0)
4994 {
4995 mcnt--;
4996 p += 2;
4997 STORE_NUMBER_AND_INCR (p, mcnt);
4998#ifdef _LIBC
4999 DEBUG_PRINT3 (" Setting %p to %d.\n", p - 2, mcnt);
5000#else
5001 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - 2, mcnt);
5002#endif
5003 }
5004 else if (mcnt == 0)
5005 {
5006#ifdef _LIBC
5007 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2);
5008#else
5009 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
5010#endif
5011 p[2] = (unsigned char) no_op;
5012 p[3] = (unsigned char) no_op;
5013 goto on_failure;
5014 }
5015 break;
5016
5017 case jump_n:
5018 EXTRACT_NUMBER (mcnt, p + 2);
5019 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5020
5021 /* Originally, this is how many times we CAN jump. */
5022 if (mcnt)
5023 {
5024 mcnt--;
5025 STORE_NUMBER (p + 2, mcnt);
5026#ifdef _LIBC
5027 DEBUG_PRINT3 (" Setting %p to %d.\n", p + 2, mcnt);
5028#else
5029 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + 2, mcnt);
5030#endif
5031 goto unconditional_jump;
5032 }
5033 /* If don't have to jump any more, skip over the rest of command. */
5034 else
5035 p += 4;
5036 break;
5037
5038 case set_number_at:
5039 {
5040 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5041
5042 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5043 p1 = p + mcnt;
5044 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5045#ifdef _LIBC
5046 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
5047#else
5048 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
5049#endif
5050 STORE_NUMBER (p1, mcnt);
5051 break;
5052 }
5053
5054#if 0
5055 /* The DEC Alpha C compiler 3.x generates incorrect code for the
5056 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
5057 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
5058 macro and introducing temporary variables works around the bug. */
5059
5060 case wordbound:
5061 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5062 if (AT_WORD_BOUNDARY (d))
5063 break;
5064 goto fail;
5065
5066 case notwordbound:
5067 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5068 if (AT_WORD_BOUNDARY (d))
5069 goto fail;
5070 break;
5071#else
5072 case wordbound:
5073 {
5074 boolean prevchar, thischar;
5075
5076 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5077 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5078 break;
5079
5080 prevchar = WORDCHAR_P (d - 1);
5081 thischar = WORDCHAR_P (d);
5082 if (prevchar != thischar)
5083 break;
5084 goto fail;
5085 }
5086
5087 case notwordbound:
5088 {
5089 boolean prevchar, thischar;
5090
5091 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5092 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5093 goto fail;
5094
5095 prevchar = WORDCHAR_P (d - 1);
5096 thischar = WORDCHAR_P (d);
5097 if (prevchar != thischar)
5098 goto fail;
5099 break;
5100 }
5101#endif
5102
5103 case wordbeg:
5104 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5105 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
5106 break;
5107 goto fail;
5108
5109 case wordend:
5110 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5111 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
5112 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
5113 break;
5114 goto fail;
5115
5116#ifdef emacs
5117 case before_dot:
5118 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5119 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
5120 goto fail;
5121 break;
5122
5123 case at_dot:
5124 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5125 if (PTR_CHAR_POS ((unsigned char *) d) != point)
5126 goto fail;
5127 break;
5128
5129 case after_dot:
5130 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5131 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
5132 goto fail;
5133 break;
5134
5135 case syntaxspec:
5136 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5137 mcnt = *p++;
5138 goto matchsyntax;
5139
5140 case wordchar:
5141 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5142 mcnt = (int) Sword;
5143 matchsyntax:
5144 PREFETCH ();
5145 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
5146 d++;
5147 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
5148 goto fail;
5149 SET_REGS_MATCHED ();
5150 break;
5151
5152 case notsyntaxspec:
5153 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5154 mcnt = *p++;
5155 goto matchnotsyntax;
5156
5157 case notwordchar:
5158 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5159 mcnt = (int) Sword;
5160 matchnotsyntax:
5161 PREFETCH ();
5162 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
5163 d++;
5164 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
5165 goto fail;
5166 SET_REGS_MATCHED ();
5167 break;
5168
5169#else /* not emacs */
5170 case wordchar:
5171 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5172 PREFETCH ();
5173 if (!WORDCHAR_P (d))
5174 goto fail;
5175 SET_REGS_MATCHED ();
5176 d++;
5177 break;
5178
5179 case notwordchar:
5180 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5181 PREFETCH ();
5182 if (WORDCHAR_P (d))
5183 goto fail;
5184 SET_REGS_MATCHED ();
5185 d++;
5186 break;
5187#endif /* not emacs */
5188
5189 default:
5190 abort ();
5191 }
5192 continue; /* Successfully executed one pattern command; keep going. */
5193
5194
5195 /* We goto here if a matching operation fails. */
5196 fail:
5197 if (!FAIL_STACK_EMPTY ())
5198 { /* A restart point is known. Restore to that state. */
5199 DEBUG_PRINT1 ("\nFAIL:\n");
5200 POP_FAILURE_POINT (d, p,
5201 lowest_active_reg, highest_active_reg,
5202 regstart, regend, reg_info);
5203
5204 /* If this failure point is a dummy, try the next one. */
5205 if (!p)
5206 goto fail;
5207
5208 /* If we failed to the end of the pattern, don't examine *p. */
5209 assert (p <= pend);
5210 if (p < pend)
5211 {
5212 boolean is_a_jump_n = false;
5213
5214 /* If failed to a backwards jump that's part of a repetition
5215 loop, need to pop this failure point and use the next one. */
5216 switch ((re_opcode_t) *p)
5217 {
5218 case jump_n:
5219 is_a_jump_n = true;
5220 case maybe_pop_jump:
5221 case pop_failure_jump:
5222 case jump:
5223 p1 = p + 1;
5224 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5225 p1 += mcnt;
5226
5227 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5228 || (!is_a_jump_n
5229 && (re_opcode_t) *p1 == on_failure_jump))
5230 goto fail;
5231 break;
5232 default:
5233 /* do nothing */ ;
5234 }
5235 }
5236
5237 if (d >= string1 && d <= end1)
5238 dend = end_match_1;
5239 }
5240 else
5241 break; /* Matching at this starting point really fails. */
5242 } /* for (;;) */
5243
5244 if (best_regs_set)
5245 goto restore_best_regs;
5246
5247 FREE_VARIABLES ();
5248
5249 return -1; /* Failure to match. */
5250} /* re_match_2 */
David Lamparter6b0655a2014-06-04 06:53:35 +02005251
paul718e3742002-12-13 20:15:29 +00005252/* Subroutine definitions for re_match_2. */
5253
5254
5255/* We are passed P pointing to a register number after a start_memory.
5256
5257 Return true if the pattern up to the corresponding stop_memory can
5258 match the empty string, and false otherwise.
5259
5260 If we find the matching stop_memory, sets P to point to one past its number.
5261 Otherwise, sets P to an undefined byte less than or equal to END.
5262
5263 We don't handle duplicates properly (yet). */
5264
5265static boolean
5266group_match_null_string_p (p, end, reg_info)
5267 unsigned char **p, *end;
5268 register_info_type *reg_info;
5269{
5270 int mcnt;
5271 /* Point to after the args to the start_memory. */
5272 unsigned char *p1 = *p + 2;
5273
5274 while (p1 < end)
5275 {
5276 /* Skip over opcodes that can match nothing, and return true or
5277 false, as appropriate, when we get to one that can't, or to the
5278 matching stop_memory. */
5279
5280 switch ((re_opcode_t) *p1)
5281 {
5282 /* Could be either a loop or a series of alternatives. */
5283 case on_failure_jump:
5284 p1++;
5285 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5286
5287 /* If the next operation is not a jump backwards in the
5288 pattern. */
5289
5290 if (mcnt >= 0)
5291 {
5292 /* Go through the on_failure_jumps of the alternatives,
5293 seeing if any of the alternatives cannot match nothing.
5294 The last alternative starts with only a jump,
5295 whereas the rest start with on_failure_jump and end
5296 with a jump, e.g., here is the pattern for `a|b|c':
5297
5298 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
5299 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
5300 /exactn/1/c
5301
5302 So, we have to first go through the first (n-1)
5303 alternatives and then deal with the last one separately. */
5304
5305
5306 /* Deal with the first (n-1) alternatives, which start
5307 with an on_failure_jump (see above) that jumps to right
5308 past a jump_past_alt. */
5309
5310 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
5311 {
5312 /* `mcnt' holds how many bytes long the alternative
5313 is, including the ending `jump_past_alt' and
5314 its number. */
5315
5316 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
5317 reg_info))
5318 return false;
5319
5320 /* Move to right after this alternative, including the
5321 jump_past_alt. */
5322 p1 += mcnt;
5323
5324 /* Break if it's the beginning of an n-th alternative
5325 that doesn't begin with an on_failure_jump. */
5326 if ((re_opcode_t) *p1 != on_failure_jump)
5327 break;
5328
5329 /* Still have to check that it's not an n-th
5330 alternative that starts with an on_failure_jump. */
5331 p1++;
5332 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5333 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
5334 {
5335 /* Get to the beginning of the n-th alternative. */
5336 p1 -= 3;
5337 break;
5338 }
5339 }
5340
5341 /* Deal with the last alternative: go back and get number
5342 of the `jump_past_alt' just before it. `mcnt' contains
5343 the length of the alternative. */
5344 EXTRACT_NUMBER (mcnt, p1 - 2);
5345
5346 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
5347 return false;
5348
5349 p1 += mcnt; /* Get past the n-th alternative. */
5350 } /* if mcnt > 0 */
5351 break;
5352
5353
5354 case stop_memory:
5355 assert (p1[1] == **p);
5356 *p = p1 + 2;
5357 return true;
5358
5359
5360 default:
5361 if (!common_op_match_null_string_p (&p1, end, reg_info))
5362 return false;
5363 }
5364 } /* while p1 < end */
5365
5366 return false;
5367} /* group_match_null_string_p */
5368
5369
5370/* Similar to group_match_null_string_p, but doesn't deal with alternatives:
5371 It expects P to be the first byte of a single alternative and END one
5372 byte past the last. The alternative can contain groups. */
5373
5374static boolean
5375alt_match_null_string_p (p, end, reg_info)
5376 unsigned char *p, *end;
5377 register_info_type *reg_info;
5378{
5379 int mcnt;
5380 unsigned char *p1 = p;
5381
5382 while (p1 < end)
5383 {
5384 /* Skip over opcodes that can match nothing, and break when we get
5385 to one that can't. */
5386
5387 switch ((re_opcode_t) *p1)
5388 {
5389 /* It's a loop. */
5390 case on_failure_jump:
5391 p1++;
5392 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5393 p1 += mcnt;
5394 break;
5395
5396 default:
5397 if (!common_op_match_null_string_p (&p1, end, reg_info))
5398 return false;
5399 }
5400 } /* while p1 < end */
5401
5402 return true;
5403} /* alt_match_null_string_p */
5404
5405
5406/* Deals with the ops common to group_match_null_string_p and
5407 alt_match_null_string_p.
5408
5409 Sets P to one after the op and its arguments, if any. */
5410
5411static boolean
5412common_op_match_null_string_p (p, end, reg_info)
5413 unsigned char **p, *end;
5414 register_info_type *reg_info;
5415{
5416 int mcnt;
5417 boolean ret;
5418 int reg_no;
5419 unsigned char *p1 = *p;
5420
5421 switch ((re_opcode_t) *p1++)
5422 {
5423 case no_op:
5424 case begline:
5425 case endline:
5426 case begbuf:
5427 case endbuf:
5428 case wordbeg:
5429 case wordend:
5430 case wordbound:
5431 case notwordbound:
5432#ifdef emacs
5433 case before_dot:
5434 case at_dot:
5435 case after_dot:
5436#endif
5437 break;
5438
5439 case start_memory:
5440 reg_no = *p1;
5441 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
5442 ret = group_match_null_string_p (&p1, end, reg_info);
5443
5444 /* Have to set this here in case we're checking a group which
5445 contains a group and a back reference to it. */
5446
5447 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
5448 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
5449
5450 if (!ret)
5451 return false;
5452 break;
5453
5454 /* If this is an optimized succeed_n for zero times, make the jump. */
5455 case jump:
5456 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5457 if (mcnt >= 0)
5458 p1 += mcnt;
5459 else
5460 return false;
5461 break;
5462
5463 case succeed_n:
5464 /* Get to the number of times to succeed. */
5465 p1 += 2;
5466 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5467
5468 if (mcnt == 0)
5469 {
5470 p1 -= 4;
5471 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5472 p1 += mcnt;
5473 }
5474 else
5475 return false;
5476 break;
5477
5478 case duplicate:
5479 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
5480 return false;
5481 break;
5482
5483 case set_number_at:
5484 p1 += 4;
5485
5486 default:
5487 /* All other opcodes mean we cannot match the empty string. */
5488 return false;
5489 }
5490
5491 *p = p1;
5492 return true;
5493} /* common_op_match_null_string_p */
5494
5495
5496/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5497 bytes; nonzero otherwise. */
5498
5499static int
5500bcmp_translate (s1, s2, len, translate)
5501 const char *s1, *s2;
5502 register int len;
5503 RE_TRANSLATE_TYPE translate;
5504{
5505 register const unsigned char *p1 = (const unsigned char *) s1;
5506 register const unsigned char *p2 = (const unsigned char *) s2;
5507 while (len)
5508 {
5509 if (translate[*p1++] != translate[*p2++]) return 1;
5510 len--;
5511 }
5512 return 0;
5513}
David Lamparter6b0655a2014-06-04 06:53:35 +02005514
paul718e3742002-12-13 20:15:29 +00005515/* Entry points for GNU code. */
5516
5517/* re_compile_pattern is the GNU regular expression compiler: it
5518 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5519 Returns 0 if the pattern was valid, otherwise an error string.
5520
5521 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5522 are set in BUFP on entry.
5523
5524 We call regex_compile to do the actual compilation. */
5525
5526const char *
5527re_compile_pattern (pattern, length, bufp)
5528 const char *pattern;
5529 size_t length;
5530 struct re_pattern_buffer *bufp;
5531{
5532 reg_errcode_t ret;
5533
5534 /* GNU code is written to assume at least RE_NREGS registers will be set
5535 (and at least one extra will be -1). */
5536 bufp->regs_allocated = REGS_UNALLOCATED;
5537
5538 /* And GNU code determines whether or not to get register information
5539 by passing null for the REGS argument to re_match, etc., not by
5540 setting no_sub. */
5541 bufp->no_sub = 0;
5542
5543 /* Match anchors at newline. */
5544 bufp->newline_anchor = 1;
5545
5546 ret = regex_compile (pattern, length, re_syntax_options, bufp);
5547
5548 if (!ret)
5549 return NULL;
5550 return gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
5551}
5552#ifdef _LIBC
5553weak_alias (__re_compile_pattern, re_compile_pattern)
5554#endif
David Lamparter6b0655a2014-06-04 06:53:35 +02005555
paul718e3742002-12-13 20:15:29 +00005556/* Entry points compatible with 4.2 BSD regex library. We don't define
5557 them unless specifically requested. */
5558
5559#if defined _REGEX_RE_COMP || defined _LIBC
5560
5561/* BSD has one and only one pattern buffer. */
5562static struct re_pattern_buffer re_comp_buf;
5563
5564char *
5565#ifdef _LIBC
5566/* Make these definitions weak in libc, so POSIX programs can redefine
5567 these names if they don't use our functions, and still use
5568 regcomp/regexec below without link errors. */
5569weak_function
5570#endif
5571re_comp (s)
5572 const char *s;
5573{
5574 reg_errcode_t ret;
5575
5576 if (!s)
5577 {
5578 if (!re_comp_buf.buffer)
5579 return gettext ("No previous regular expression");
5580 return 0;
5581 }
5582
5583 if (!re_comp_buf.buffer)
5584 {
5585 re_comp_buf.buffer = (unsigned char *) malloc (200);
5586 if (re_comp_buf.buffer == NULL)
5587 return (char *) gettext (re_error_msgid
5588 + re_error_msgid_idx[(int) REG_ESPACE]);
5589 re_comp_buf.allocated = 200;
5590
5591 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5592 if (re_comp_buf.fastmap == NULL)
5593 return (char *) gettext (re_error_msgid
5594 + re_error_msgid_idx[(int) REG_ESPACE]);
5595 }
5596
5597 /* Since `re_exec' always passes NULL for the `regs' argument, we
5598 don't need to initialize the pattern buffer fields which affect it. */
5599
5600 /* Match anchors at newlines. */
5601 re_comp_buf.newline_anchor = 1;
5602
5603 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5604
5605 if (!ret)
5606 return NULL;
5607
5608 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5609 return (char *) gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
5610}
5611
5612
5613int
5614#ifdef _LIBC
5615weak_function
5616#endif
5617re_exec (s)
5618 const char *s;
5619{
5620 const int len = strlen (s);
5621 return
5622 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
5623}
5624
5625#endif /* _REGEX_RE_COMP */
David Lamparter6b0655a2014-06-04 06:53:35 +02005626
paul718e3742002-12-13 20:15:29 +00005627/* POSIX.2 functions. Don't define these for Emacs. */
5628
5629#ifndef emacs
5630
5631/* regcomp takes a regular expression as a string and compiles it.
5632
5633 PREG is a regex_t *. We do not expect any fields to be initialized,
5634 since POSIX says we shouldn't. Thus, we set
5635
5636 `buffer' to the compiled pattern;
5637 `used' to the length of the compiled pattern;
5638 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5639 REG_EXTENDED bit in CFLAGS is set; otherwise, to
5640 RE_SYNTAX_POSIX_BASIC;
5641 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
5642 `fastmap' to an allocated space for the fastmap;
5643 `fastmap_accurate' to zero;
5644 `re_nsub' to the number of subexpressions in PATTERN.
5645
5646 PATTERN is the address of the pattern string.
5647
5648 CFLAGS is a series of bits which affect compilation.
5649
5650 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5651 use POSIX basic syntax.
5652
5653 If REG_NEWLINE is set, then . and [^...] don't match newline.
5654 Also, regexec will try a match beginning after every newline.
5655
5656 If REG_ICASE is set, then we considers upper- and lowercase
5657 versions of letters to be equivalent when matching.
5658
5659 If REG_NOSUB is set, then when PREG is passed to regexec, that
5660 routine will report only success or failure, and nothing about the
5661 registers.
5662
5663 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
5664 the return codes and their meanings.) */
5665
5666int
5667regcomp (preg, pattern, cflags)
5668 regex_t *preg;
5669 const char *pattern;
5670 int cflags;
5671{
5672 reg_errcode_t ret;
5673 reg_syntax_t syntax
5674 = (cflags & REG_EXTENDED) ?
5675 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
5676
5677 /* regex_compile will allocate the space for the compiled pattern. */
5678 preg->buffer = 0;
5679 preg->allocated = 0;
5680 preg->used = 0;
5681
5682 /* Try to allocate space for the fastmap. */
5683 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5684
5685 if (cflags & REG_ICASE)
5686 {
5687 unsigned i;
5688
5689 preg->translate
5690 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
5691 * sizeof (*(RE_TRANSLATE_TYPE)0));
5692 if (preg->translate == NULL)
5693 return (int) REG_ESPACE;
5694
5695 /* Map uppercase characters to corresponding lowercase ones. */
5696 for (i = 0; i < CHAR_SET_SIZE; i++)
5697 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
5698 }
5699 else
5700 preg->translate = NULL;
5701
5702 /* If REG_NEWLINE is set, newlines are treated differently. */
5703 if (cflags & REG_NEWLINE)
5704 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
5705 syntax &= ~RE_DOT_NEWLINE;
5706 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
5707 /* It also changes the matching behavior. */
5708 preg->newline_anchor = 1;
5709 }
5710 else
5711 preg->newline_anchor = 0;
5712
5713 preg->no_sub = !!(cflags & REG_NOSUB);
5714
5715 /* POSIX says a null character in the pattern terminates it, so we
5716 can use strlen here in compiling the pattern. */
5717 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
5718
5719 /* POSIX doesn't distinguish between an unmatched open-group and an
5720 unmatched close-group: both are REG_EPAREN. */
5721 if (ret == REG_ERPAREN) ret = REG_EPAREN;
5722
5723 if (ret == REG_NOERROR && preg->fastmap)
5724 {
5725 /* Compute the fastmap now, since regexec cannot modify the pattern
5726 buffer. */
5727 if (re_compile_fastmap (preg) == -2)
5728 {
5729 /* Some error occured while computing the fastmap, just forget
5730 about it. */
5731 free (preg->fastmap);
5732 preg->fastmap = NULL;
5733 }
5734 }
5735
5736 return (int) ret;
5737}
5738#ifdef _LIBC
5739weak_alias (__regcomp, regcomp)
5740#endif
5741
5742
5743/* regexec searches for a given pattern, specified by PREG, in the
5744 string STRING.
5745
5746 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
5747 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
5748 least NMATCH elements, and we set them to the offsets of the
5749 corresponding matched substrings.
5750
5751 EFLAGS specifies `execution flags' which affect matching: if
5752 REG_NOTBOL is set, then ^ does not match at the beginning of the
5753 string; if REG_NOTEOL is set, then $ does not match at the end.
5754
5755 We return 0 if we find a match and REG_NOMATCH if not. */
5756
5757int
5758regexec (preg, string, nmatch, pmatch, eflags)
5759 const regex_t *preg;
5760 const char *string;
5761 size_t nmatch;
5762 regmatch_t pmatch[];
5763 int eflags;
5764{
5765 int ret;
5766 struct re_registers regs;
5767 regex_t private_preg;
5768 int len = strlen (string);
5769 boolean want_reg_info = !preg->no_sub && nmatch > 0;
5770
5771 private_preg = *preg;
5772
5773 private_preg.not_bol = !!(eflags & REG_NOTBOL);
5774 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5775
5776 /* The user has told us exactly how many registers to return
5777 information about, via `nmatch'. We have to pass that on to the
5778 matching routines. */
5779 private_preg.regs_allocated = REGS_FIXED;
5780
5781 if (want_reg_info)
5782 {
5783 regs.num_regs = nmatch;
5784 regs.start = TALLOC (nmatch * 2, regoff_t);
5785 if (regs.start == NULL)
5786 return (int) REG_NOMATCH;
5787 regs.end = regs.start + nmatch;
5788 }
5789
5790 /* Perform the searching operation. */
5791 ret = re_search (&private_preg, string, len,
5792 /* start: */ 0, /* range: */ len,
5793 want_reg_info ? &regs : (struct re_registers *) 0);
5794
5795 /* Copy the register information to the POSIX structure. */
5796 if (want_reg_info)
5797 {
5798 if (ret >= 0)
5799 {
5800 unsigned r;
5801
5802 for (r = 0; r < nmatch; r++)
5803 {
5804 pmatch[r].rm_so = regs.start[r];
5805 pmatch[r].rm_eo = regs.end[r];
5806 }
5807 }
5808
5809 /* If we needed the temporary register info, free the space now. */
5810 free (regs.start);
5811 }
5812
5813 /* We want zero return to mean success, unlike `re_search'. */
5814 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
5815}
5816#ifdef _LIBC
5817weak_alias (__regexec, regexec)
5818#endif
5819
5820
5821/* Returns a message corresponding to an error code, ERRCODE, returned
5822 from either regcomp or regexec. We don't use PREG here. */
5823
5824size_t
vincenteac314c2006-01-17 23:39:04 +00005825regerror (err, preg, errbuf, errbuf_size)
5826 int err;
paul718e3742002-12-13 20:15:29 +00005827 const regex_t *preg;
5828 char *errbuf;
5829 size_t errbuf_size;
5830{
5831 const char *msg;
5832 size_t msg_size;
5833
vincenteac314c2006-01-17 23:39:04 +00005834 if (err < 0
5835 || err >= (int) (sizeof (re_error_msgid_idx)
paul718e3742002-12-13 20:15:29 +00005836 / sizeof (re_error_msgid_idx[0])))
5837 /* Only error codes returned by the rest of the code should be passed
5838 to this routine. If we are given anything else, or if other regex
5839 code generates an invalid error code, then the program has a bug.
5840 Dump core so we can fix it. */
5841 abort ();
5842
vincenteac314c2006-01-17 23:39:04 +00005843 msg = gettext (re_error_msgid + re_error_msgid_idx[err]);
paul718e3742002-12-13 20:15:29 +00005844
5845 msg_size = strlen (msg) + 1; /* Includes the null. */
5846
5847 if (errbuf_size != 0)
5848 {
5849 if (msg_size > errbuf_size)
5850 {
5851#if defined HAVE_MEMPCPY || defined _LIBC
5852 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
5853#else
5854 memcpy (errbuf, msg, errbuf_size - 1);
5855 errbuf[errbuf_size - 1] = 0;
5856#endif
5857 }
5858 else
5859 memcpy (errbuf, msg, msg_size);
5860 }
5861
5862 return msg_size;
5863}
5864#ifdef _LIBC
5865weak_alias (__regerror, regerror)
5866#endif
5867
5868
5869/* Free dynamically allocated space used by PREG. */
5870
5871void
5872regfree (preg)
5873 regex_t *preg;
5874{
5875 if (preg->buffer != NULL)
5876 free (preg->buffer);
5877 preg->buffer = NULL;
5878
5879 preg->allocated = 0;
5880 preg->used = 0;
5881
5882 if (preg->fastmap != NULL)
5883 free (preg->fastmap);
5884 preg->fastmap = NULL;
5885 preg->fastmap_accurate = 0;
5886
5887 if (preg->translate != NULL)
5888 free (preg->translate);
5889 preg->translate = NULL;
5890}
5891#ifdef _LIBC
5892weak_alias (__regfree, regfree)
5893#endif
5894
5895#endif /* not emacs */