Annotation of src/lib/libc/regex/regcomp.c, Revision 1.41
1.41 ! christos 1: /* $NetBSD: regcomp.c,v 1.40 2021/02/24 18:13:21 christos Exp $ */
1.6 cgd 2:
1.5 cgd 3: /*-
1.39 christos 4: * SPDX-License-Identifier: BSD-3-Clause
5: *
6: * Copyright (c) 1992, 1993, 1994 Henry Spencer.
1.5 cgd 7: * Copyright (c) 1992, 1993, 1994
8: * The Regents of the University of California. All rights reserved.
9: *
1.39 christos 10: * Copyright (c) 2011 The FreeBSD Foundation
11: * All rights reserved.
12: * Portions of this software were developed by David Chisnall
13: * under sponsorship from the FreeBSD Foundation.
14: *
1.5 cgd 15: * This code is derived from software contributed to Berkeley by
16: * Henry Spencer.
17: *
18: * Redistribution and use in source and binary forms, with or without
19: * modification, are permitted provided that the following conditions
20: * are met:
21: * 1. Redistributions of source code must retain the above copyright
22: * notice, this list of conditions and the following disclaimer.
23: * 2. Redistributions in binary form must reproduce the above copyright
24: * notice, this list of conditions and the following disclaimer in the
25: * documentation and/or other materials provided with the distribution.
1.18 agc 26: * 3. Neither the name of the University nor the names of its contributors
27: * may be used to endorse or promote products derived from this software
28: * without specific prior written permission.
29: *
30: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40: * SUCH DAMAGE.
41: *
42: * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
43: */
44:
1.7 christos 45: #include <sys/cdefs.h>
1.6 cgd 46: #if 0
1.5 cgd 47: static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
1.39 christos 48: __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $");
1.6 cgd 49: #endif
1.41 ! christos 50: __RCSID("$NetBSD: regcomp.c,v 1.40 2021/02/24 18:13:21 christos Exp $");
1.39 christos 51:
52: #define _OPENBSD_SOURCE
53: #define REGEX_GNU_EXTENSIONS
1.5 cgd 54:
1.8 jtc 55: #include "namespace.h"
1.1 jtc 56: #include <sys/types.h>
1.39 christos 57: #include <stdio.h>
58: #include <string.h>
1.1 jtc 59: #include <ctype.h>
60: #include <limits.h>
61: #include <stdlib.h>
1.28 junyoung 62: #include <regex.h>
1.39 christos 63: #include <stdbool.h>
64: #include <wchar.h>
65: #include <wctype.h>
1.8 jtc 66:
67: #ifdef __weak_alias
1.16 mycroft 68: __weak_alias(regcomp,_regcomp)
1.8 jtc 69: #endif
1.1 jtc 70:
1.39 christos 71: #ifdef REGEX_LIBC_COLLATE
72: #include "collate.h"
73: #endif
74:
1.1 jtc 75: #include "utils.h"
76: #include "regex2.h"
77:
78: #include "cname.h"
79:
80: /*
1.39 christos 81: * Branching context, used to keep track of branch state for all of the branch-
82: * aware functions. In addition to keeping track of branch positions for the
83: * p_branch_* functions, we use this to simplify some clumsiness in BREs for
84: * detection of whether ^ is acting as an anchor or being used erroneously and
85: * also for whether we're in a sub-expression or not.
86: */
87: struct branchc {
88: sopno start;
89: sopno back;
90: sopno fwd;
91:
92: int nbranch;
93: int nchain;
94: bool outer;
95: bool terminate;
96: };
97:
98: /*
1.1 jtc 99: * parse structure, passed up and down to avoid global variables and
100: * other clumsinesses
101: */
102: struct parse {
1.21 yamt 103: const char *next; /* next character in RE */
104: const char *end; /* end of string (-> NUL normally) */
1.1 jtc 105: int error; /* has an error been seen? */
1.39 christos 106: int gnuext;
1.1 jtc 107: sop *strip; /* malloced strip */
108: sopno ssize; /* malloced strip size (allocated) */
109: sopno slen; /* malloced strip length (used) */
1.33 christos 110: size_t ncsalloc; /* number of csets allocated */
1.1 jtc 111: struct re_guts *g;
112: # define NPAREN 10 /* we need to remember () 1-9 for back refs */
113: sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
114: sopno pend[NPAREN]; /* -> ) ([0] unused) */
1.39 christos 115: bool allowbranch; /* can this expression branch? */
116: bool bre; /* convenience; is this a BRE? */
117: int pflags; /* other parsing flags -- legacy escapes? */
118: bool (*parse_expr)(struct parse *, struct branchc *);
119: void (*pre_parse)(struct parse *, struct branchc *);
120: void (*post_parse)(struct parse *, struct branchc *);
1.1 jtc 121: };
122:
1.39 christos 123: #define PFLAG_LEGACY_ESC 0x00000001
124:
1.5 cgd 125: /* ========= begin header generated by ./mkh ========= */
126: #ifdef __cplusplus
127: extern "C" {
128: #endif
129:
130: /* === regcomp.c === */
1.39 christos 131: static bool p_ere_exp(struct parse *p, struct branchc *bc);
1.26 junyoung 132: static void p_str(struct parse *p);
1.39 christos 133: static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
134: static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
135: static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
136: static bool p_branch_empty(struct parse *p, struct branchc *bc);
137: static bool p_branch_do(struct parse *p, struct branchc *bc);
138: static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
139: static void p_bre_post_parse(struct parse *p, struct branchc *bc);
140: static void p_re(struct parse *p, int end1, int end2);
141: static bool p_simp_re(struct parse *p, struct branchc *bc);
1.26 junyoung 142: static int p_count(struct parse *p);
143: static void p_bracket(struct parse *p);
1.39 christos 144: static int p_range_cmp(wchar_t c1, wchar_t c2);
1.26 junyoung 145: static void p_b_term(struct parse *p, cset *cs);
1.39 christos 146: #ifdef REGEX_GNU_EXTENSIONS
147: static int p_b_pseudoclass(struct parse *p, char c);
148: #endif
1.26 junyoung 149: static void p_b_cclass(struct parse *p, cset *cs);
1.39 christos 150: static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
1.26 junyoung 151: static void p_b_eclass(struct parse *p, cset *cs);
1.39 christos 152: static wint_t p_b_symbol(struct parse *p);
153: static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
154: static bool may_escape(struct parse *p, const wint_t ch);
155: static wint_t othercase(wint_t ch);
156: static void bothcases(struct parse *p, wint_t ch);
157: static void ordinary(struct parse *p, wint_t ch);
1.26 junyoung 158: static void nonnewline(struct parse *p);
1.39 christos 159: static void repeat(struct parse *p, sopno start, int from, int to);
1.26 junyoung 160: static int seterr(struct parse *p, int e);
161: static cset *allocset(struct parse *p);
162: static void freeset(struct parse *p, cset *cs);
1.39 christos 163: static void CHadd(struct parse *p, cset *cs, wint_t ch);
164: static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
165: static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
166: static wint_t singleton(cset *cs);
1.26 junyoung 167: static sopno dupl(struct parse *p, sopno start, sopno finish);
1.39 christos 168: static void doemit(struct parse *p, sop op, size_t opnd);
169: static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
170: static void dofwd(struct parse *p, sopno pos, sop value);
1.30 christos 171: static int enlarge(struct parse *p, sopno size);
1.26 junyoung 172: static void stripsnug(struct parse *p, struct re_guts *g);
173: static void findmust(struct parse *p, struct re_guts *g);
1.39 christos 174: static int altoffset(sop *scan, int offset);
175: static void computejumps(struct parse *p, struct re_guts *g);
176: static void computematchjumps(struct parse *p, struct re_guts *g);
1.26 junyoung 177: static sopno pluscount(struct parse *p, struct re_guts *g);
1.39 christos 178: static wint_t wgetnext(struct parse *p);
1.5 cgd 179:
180: #ifdef __cplusplus
181: }
182: #endif
183: /* ========= end header generated by ./mkh ========= */
1.1 jtc 184:
185: static char nuls[10]; /* place to point scanner in event of error */
186:
187: /*
188: * macros for use with parse structure
189: * BEWARE: these know that the parse structure is named `p' !!!
190: */
191: #define PEEK() (*p->next)
192: #define PEEK2() (*(p->next+1))
193: #define MORE() (p->next < p->end)
194: #define MORE2() (p->next+1 < p->end)
195: #define SEE(c) (MORE() && PEEK() == (c))
196: #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
1.39 christos 197: #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
1.1 jtc 198: #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
199: #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
1.39 christos 200: #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a))
1.1 jtc 201: #define NEXT() (p->next++)
202: #define NEXT2() (p->next += 2)
203: #define NEXTn(n) (p->next += (n))
204: #define GETNEXT() (*p->next++)
1.39 christos 205: #define WGETNEXT() wgetnext(p)
1.1 jtc 206: #define SETERROR(e) seterr(p, (e))
1.39 christos 207: #define REQUIRE(co, e) ((co) || SETERROR(e))
1.1 jtc 208: #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
1.39 christos 209: #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
1.1 jtc 210: #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
1.40 christos 211: #define EMIT(op, sopnd) doemit(p, (op), (sopnd))
212: #define INSERT(op, pos) doinsert(p, (op), HERE()-(pos)+1, pos)
1.12 drochner 213: #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
1.1 jtc 214: #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
215: #define HERE() (p->slen)
216: #define THERE() (p->slen - 1)
1.4 jtc 217: #define THERETHERE() (p->slen - 2)
1.1 jtc 218: #define DROP(n) (p->slen -= (n))
219:
1.39 christos 220: /* Macro used by computejump()/computematchjump() */
1.41 ! christos 221: #ifndef MIN
1.39 christos 222: #define MIN(a,b) ((a)<(b)?(a):(b))
1.41 ! christos 223: #endif
1.30 christos 224:
1.39 christos 225: static int /* 0 success, otherwise REG_something */
226: regcomp_internal(regex_t * __restrict preg,
227: const char * __restrict pattern,
228: int cflags, int pflags)
1.1 jtc 229: {
230: struct parse pa;
1.9 perry 231: struct re_guts *g;
232: struct parse *p = &pa;
233: int i;
234: size_t len;
1.39 christos 235: size_t maxlen;
1.3 jtc 236: #ifdef REDEBUG
237: # define GOODFLAGS(f) (f)
238: #else
239: # define GOODFLAGS(f) ((f)&~REG_DUMP)
240: #endif
1.1 jtc 241:
1.14 lukem 242: _DIAGASSERT(preg != NULL);
243: _DIAGASSERT(pattern != NULL);
244:
1.3 jtc 245: cflags = GOODFLAGS(cflags);
1.1 jtc 246: if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
247: return(REG_INVARG);
248:
249: if (cflags®_PEND) {
250: if (preg->re_endp < pattern)
251: return(REG_INVARG);
252: len = preg->re_endp - pattern;
253: } else
1.11 christos 254: len = strlen(pattern);
1.1 jtc 255:
256: /* do the mallocs early so failure handling is easy */
1.39 christos 257: g = malloc(sizeof(*g));
1.1 jtc 258: if (g == NULL)
259: return(REG_ESPACE);
1.39 christos 260: /*
261: * Limit the pattern space to avoid a 32-bit overflow on buffer
262: * extension. Also avoid any signed overflow in case of conversion
263: * so make the real limit based on a 31-bit overflow.
264: *
265: * Likely not applicable on 64-bit systems but handle the case
266: * generically (who are we to stop people from using ~715MB+
267: * patterns?).
268: */
1.40 christos 269: maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3;
1.39 christos 270: if (len >= maxlen) {
1.40 christos 271: free(g);
1.39 christos 272: return(REG_ESPACE);
273: }
1.40 christos 274: p->ssize = (sopno)(len / 2 * 3 + 1); /* ugh */
1.39 christos 275: assert(p->ssize >= len);
276:
277: p->strip = calloc(p->ssize, sizeof(*p->strip));
1.1 jtc 278: p->slen = 0;
279: if (p->strip == NULL) {
1.40 christos 280: free(g);
1.1 jtc 281: return(REG_ESPACE);
282: }
283:
284: /* set things up */
285: p->g = g;
1.39 christos 286: p->next = pattern; /* convenience; we do not modify it */
1.1 jtc 287: p->end = p->next + len;
288: p->error = 0;
289: p->ncsalloc = 0;
1.39 christos 290: p->pflags = pflags;
1.1 jtc 291: for (i = 0; i < NPAREN; i++) {
292: p->pbegin[i] = 0;
293: p->pend[i] = 0;
294: }
1.39 christos 295: #ifdef REGEX_GNU_EXTENSIONS
296: if ((cflags & REG_GNU) == 0) {
297: p->gnuext = false;
298: p->allowbranch = (cflags & REG_EXTENDED) != 0;
299: } else
300: p->gnuext = p->allowbranch = true;
301: #else
302: p->gnuext = false;
303: p->allowbranch = (cflags & REG_EXTENDED) != 0;
304: #endif
305: if (cflags & REG_EXTENDED) {
306: p->bre = false;
307: p->parse_expr = p_ere_exp;
308: p->pre_parse = NULL;
309: p->post_parse = NULL;
310: } else {
311: p->bre = true;
312: p->parse_expr = p_simp_re;
313: p->pre_parse = p_bre_pre_parse;
314: p->post_parse = p_bre_post_parse;
315: }
1.1 jtc 316: g->sets = NULL;
317: g->ncsets = 0;
318: g->cflags = cflags;
319: g->iflags = 0;
320: g->nbol = 0;
321: g->neol = 0;
322: g->must = NULL;
1.39 christos 323: g->moffset = -1;
324: g->charjump = NULL;
325: g->matchjump = NULL;
1.1 jtc 326: g->mlen = 0;
327: g->nsub = 0;
328: g->backrefs = 0;
329:
330: /* do it */
331: EMIT(OEND, 0);
332: g->firststate = THERE();
1.39 christos 333: if (cflags & REG_NOSPEC)
1.1 jtc 334: p_str(p);
335: else
1.39 christos 336: p_re(p, OUT, OUT);
1.1 jtc 337: EMIT(OEND, 0);
338: g->laststate = THERE();
339:
340: /* tidy up loose ends and fill things in */
341: stripsnug(p, g);
342: findmust(p, g);
1.39 christos 343: /* only use Boyer-Moore algorithm if the pattern is bigger
344: * than three characters
345: */
346: if(g->mlen > 3) {
347: computejumps(p, g);
348: computematchjumps(p, g);
349: if(g->matchjump == NULL && g->charjump != NULL) {
350: free(g->charjump);
351: g->charjump = NULL;
352: }
353: }
1.1 jtc 354: g->nplus = pluscount(p, g);
355: g->magic = MAGIC2;
356: preg->re_nsub = g->nsub;
357: preg->re_g = g;
358: preg->re_magic = MAGIC1;
359: #ifndef REDEBUG
360: /* not debugging, so can't rely on the assert() in regexec() */
361: if (g->iflags&BAD)
362: SETERROR(REG_ASSERT);
363: #endif
364:
365: /* win or lose, we're done */
366: if (p->error != 0) /* lose */
367: regfree(preg);
368: return(p->error);
369: }
370:
371: /*
1.39 christos 372: - regcomp - interface for parser and compilation
373: = extern int regcomp(regex_t *, const char *, int);
374: = #define REG_BASIC 0000
375: = #define REG_EXTENDED 0001
376: = #define REG_ICASE 0002
377: = #define REG_NOSUB 0004
378: = #define REG_NEWLINE 0010
379: = #define REG_NOSPEC 0020
380: = #define REG_PEND 0040
381: = #define REG_DUMP 0200
1.1 jtc 382: */
1.39 christos 383: int /* 0 success, otherwise REG_something */
384: regcomp(regex_t * __restrict preg,
385: const char * __restrict pattern,
386: int cflags)
1.1 jtc 387: {
1.30 christos 388:
1.39 christos 389: return (regcomp_internal(preg, pattern, cflags, 0));
1.1 jtc 390: }
391:
392: /*
1.39 christos 393: - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
394: - return whether we should terminate or not
395: == static bool p_ere_exp(struct parse *p);
1.1 jtc 396: */
1.39 christos 397: static bool
398: p_ere_exp(struct parse *p, struct branchc *bc)
1.1 jtc 399: {
1.9 perry 400: char c;
1.39 christos 401: wint_t wc;
1.9 perry 402: sopno pos;
403: int count;
404: int count2;
1.39 christos 405: #ifdef REGEX_GNU_EXTENSIONS
1.40 christos 406: size_t i;
1.39 christos 407: int handled;
408: #endif
1.9 perry 409: sopno subno;
1.1 jtc 410: int wascaret = 0;
411:
1.14 lukem 412: _DIAGASSERT(p != NULL);
413:
1.39 christos 414: (void)bc;
1.1 jtc 415: assert(MORE()); /* caller should have ensured this */
416: c = GETNEXT();
417:
1.39 christos 418: #ifdef REGEX_GNU_EXTENSIONS
419: handled = 0;
420: #endif
1.1 jtc 421: pos = HERE();
422: switch (c) {
423: case '(':
1.39 christos 424: (void)REQUIRE(MORE(), REG_EPAREN);
1.1 jtc 425: p->g->nsub++;
1.40 christos 426: subno = (sopno)p->g->nsub;
1.1 jtc 427: if (subno < NPAREN)
428: p->pbegin[subno] = HERE();
429: EMIT(OLPAREN, subno);
430: if (!SEE(')'))
1.39 christos 431: p_re(p, ')', IGN);
1.1 jtc 432: if (subno < NPAREN) {
433: p->pend[subno] = HERE();
434: assert(p->pend[subno] != 0);
435: }
436: EMIT(ORPAREN, subno);
1.39 christos 437: (void)MUSTEAT(')', REG_EPAREN);
1.1 jtc 438: break;
439: #ifndef POSIX_MISTAKE
440: case ')': /* happens only if no current unmatched ( */
441: /*
442: * You may ask, why the ifndef? Because I didn't notice
443: * this until slightly too late for 1003.2, and none of the
444: * other 1003.2 regular-expression reviewers noticed it at
445: * all. So an unmatched ) is legal POSIX, at least until
446: * we can get it fixed.
447: */
448: SETERROR(REG_EPAREN);
449: break;
450: #endif
451: case '^':
452: EMIT(OBOL, 0);
453: p->g->iflags |= USEBOL;
454: p->g->nbol++;
455: wascaret = 1;
456: break;
457: case '$':
458: EMIT(OEOL, 0);
459: p->g->iflags |= USEEOL;
460: p->g->neol++;
461: break;
462: case '|':
463: SETERROR(REG_EMPTY);
464: break;
465: case '*':
466: case '+':
467: case '?':
1.39 christos 468: case '{':
1.1 jtc 469: SETERROR(REG_BADRPT);
470: break;
471: case '.':
472: if (p->g->cflags®_NEWLINE)
473: nonnewline(p);
474: else
475: EMIT(OANY, 0);
476: break;
477: case '[':
478: p_bracket(p);
479: break;
480: case '\\':
1.39 christos 481: (void)REQUIRE(MORE(), REG_EESCAPE);
482: wc = WGETNEXT();
483: #ifdef REGEX_GNU_EXTENSIONS
484: if (p->gnuext) {
485: handled = 1;
486: switch (wc) {
487: case '`':
488: EMIT(OBOS, 0);
489: break;
490: case '\'':
491: EMIT(OEOS, 0);
492: break;
493: case 'B':
494: EMIT(ONWBND, 0);
495: break;
496: case 'b':
497: EMIT(OWBND, 0);
498: break;
499: case 'W':
500: case 'w':
501: case 'S':
502: case 's':
503: p_b_pseudoclass(p, wc);
504: break;
505: case '1':
506: case '2':
507: case '3':
508: case '4':
509: case '5':
510: case '6':
511: case '7':
512: case '8':
513: case '9':
514: i = wc - '0';
515: assert(i < NPAREN);
516: if (p->pend[i] != 0) {
517: assert(i <= p->g->nsub);
518: EMIT(OBACK_, i);
519: assert(p->pbegin[i] != 0);
520: assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
521: assert(OP(p->strip[p->pend[i]]) == ORPAREN);
522: (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
523: EMIT(O_BACK, i);
524: } else
525: SETERROR(REG_ESUBREG);
526: p->g->backrefs = 1;
527: break;
528: default:
529: handled = 0;
530: }
531: /* Don't proceed to the POSIX bits if we've already handled it */
532: if (handled)
533: break;
534: }
535: #endif
536: switch (wc) {
537: case '<':
538: EMIT(OBOW, 0);
539: break;
540: case '>':
541: EMIT(OEOW, 0);
542: break;
543: default:
544: if (may_escape(p, wc))
545: ordinary(p, wc);
546: else
547: SETERROR(REG_EESCAPE);
548: break;
549: }
1.1 jtc 550: break;
551: default:
1.38 christos 552: if (p->error != 0)
1.39 christos 553: return (false);
554: p->next--;
555: wc = WGETNEXT();
556: ordinary(p, wc);
1.1 jtc 557: break;
558: }
559:
560: if (!MORE())
1.39 christos 561: return (false);
1.1 jtc 562: c = PEEK();
563: /* we call { a repetition if followed by a digit */
1.39 christos 564: if (!( c == '*' || c == '+' || c == '?' || c == '{'))
565: return (false); /* no repetition, we're done */
566: else if (c == '{')
567: (void)REQUIRE(MORE2() && \
568: (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
1.1 jtc 569: NEXT();
570:
1.39 christos 571: (void)REQUIRE(!wascaret, REG_BADRPT);
1.1 jtc 572: switch (c) {
573: case '*': /* implemented as +? */
1.4 jtc 574: /* this case does not require the (y|) trick, noKLUDGE */
1.1 jtc 575: INSERT(OPLUS_, pos);
576: ASTERN(O_PLUS, pos);
577: INSERT(OQUEST_, pos);
578: ASTERN(O_QUEST, pos);
579: break;
580: case '+':
581: INSERT(OPLUS_, pos);
582: ASTERN(O_PLUS, pos);
583: break;
584: case '?':
1.4 jtc 585: /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
586: INSERT(OCH_, pos); /* offset slightly wrong */
587: ASTERN(OOR1, pos); /* this one's right */
588: AHEAD(pos); /* fix the OCH_ */
589: EMIT(OOR2, 0); /* offset very wrong... */
590: AHEAD(THERE()); /* ...so fix it */
591: ASTERN(O_CH, THERETHERE());
1.1 jtc 592: break;
593: case '{':
594: count = p_count(p);
595: if (EAT(',')) {
1.39 christos 596: if (isdigit((uch)PEEK())) {
1.1 jtc 597: count2 = p_count(p);
1.39 christos 598: (void)REQUIRE(count <= count2, REG_BADBR);
1.1 jtc 599: } else /* single number with comma */
600: count2 = INFINITY;
601: } else /* just a single number */
602: count2 = count;
1.39 christos 603: repeat(p, pos, count, count2);
1.1 jtc 604: if (!EAT('}')) { /* error heuristics */
605: while (MORE() && PEEK() != '}')
606: NEXT();
1.39 christos 607: (void)REQUIRE(MORE(), REG_EBRACE);
1.1 jtc 608: SETERROR(REG_BADBR);
609: }
610: break;
611: }
612:
613: if (!MORE())
1.39 christos 614: return (false);
1.1 jtc 615: c = PEEK();
616: if (!( c == '*' || c == '+' || c == '?' ||
1.39 christos 617: (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
618: return (false);
1.1 jtc 619: SETERROR(REG_BADRPT);
1.39 christos 620: return (false);
1.1 jtc 621: }
622:
623: /*
624: - p_str - string (no metacharacters) "parser"
1.9 perry 625: == static void p_str(struct parse *p);
1.1 jtc 626: */
627: static void
1.39 christos 628: p_str(struct parse *p)
1.1 jtc 629: {
1.39 christos 630: (void)REQUIRE(MORE(), REG_EMPTY);
631: while (MORE())
632: ordinary(p, WGETNEXT());
633: }
1.14 lukem 634:
1.39 christos 635: /*
636: * Eat consecutive branch delimiters for the kind of expression that we are
637: * parsing, return the number of delimiters that we ate.
638: */
639: static int
640: p_branch_eat_delim(struct parse *p, struct branchc *bc)
641: {
642: int nskip;
1.14 lukem 643:
1.39 christos 644: (void)bc;
645: nskip = 0;
646: while (EATSPEC('|'))
647: ++nskip;
648: return (nskip);
1.1 jtc 649: }
650:
651: /*
1.39 christos 652: * Insert necessary branch book-keeping operations. This emits a
653: * bogus 'next' offset, since we still have more to parse
1.1 jtc 654: */
655: static void
1.39 christos 656: p_branch_ins_offset(struct parse *p, struct branchc *bc)
1.9 perry 657: {
1.1 jtc 658:
1.39 christos 659: if (bc->nbranch == 0) {
660: INSERT(OCH_, bc->start); /* offset is wrong */
661: bc->fwd = bc->start;
662: bc->back = bc->start;
663: }
664:
665: ASTERN(OOR1, bc->back);
666: bc->back = THERE();
667: AHEAD(bc->fwd); /* fix previous offset */
668: bc->fwd = HERE();
669: EMIT(OOR2, 0); /* offset is very wrong */
670: ++bc->nbranch;
671: }
672:
673: /*
674: * Fix the offset of the tail branch, if we actually had any branches.
675: * This is to correct the bogus placeholder offset that we use.
676: */
677: static void
678: p_branch_fix_tail(struct parse *p, struct branchc *bc)
679: {
1.14 lukem 680:
1.39 christos 681: /* Fix bogus offset at the tail if we actually have branches */
682: if (bc->nbranch > 0) {
683: AHEAD(bc->fwd);
684: ASTERN(O_CH, bc->back);
1.30 christos 685: }
1.39 christos 686: }
687:
688: /*
689: * Signal to the parser that an empty branch has been encountered; this will,
690: * in the future, be used to allow for more permissive behavior with empty
691: * branches. The return value should indicate whether parsing may continue
692: * or not.
693: */
694: static bool
695: p_branch_empty(struct parse *p, struct branchc *bc)
696: {
697:
698: (void)bc;
699: SETERROR(REG_EMPTY);
700: return (false);
701: }
702:
703: /*
704: * Take care of any branching requirements. This includes inserting the
705: * appropriate branching instructions as well as eating all of the branch
706: * delimiters until we either run out of pattern or need to parse more pattern.
707: */
708: static bool
709: p_branch_do(struct parse *p, struct branchc *bc)
710: {
711: int ate = 0;
712:
713: ate = p_branch_eat_delim(p, bc);
714: if (ate == 0)
715: return (false);
716: else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
717: /*
718: * Halt parsing only if we have an empty branch and p_branch_empty
719: * indicates that we must not continue. In the future, this will not
720: * necessarily be an error.
721: */
722: return (false);
723: p_branch_ins_offset(p, bc);
724:
725: return (true);
726: }
1.30 christos 727:
1.39 christos 728: static void
729: p_bre_pre_parse(struct parse *p, struct branchc *bc)
730: {
1.14 lukem 731:
1.40 christos 732: (void)bc;
1.39 christos 733: /*
734: * Does not move cleanly into expression parser because of
735: * ordinary interpration of * at the beginning position of
736: * an expression.
737: */
1.1 jtc 738: if (EAT('^')) {
739: EMIT(OBOL, 0);
740: p->g->iflags |= USEBOL;
741: p->g->nbol++;
742: }
1.39 christos 743: }
744:
745: static void
746: p_bre_post_parse(struct parse *p, struct branchc *bc)
747: {
748:
749: /* Expression is terminating due to EOL token */
750: if (bc->terminate) {
1.1 jtc 751: DROP(1);
752: EMIT(OEOL, 0);
753: p->g->iflags |= USEEOL;
754: p->g->neol++;
755: }
1.39 christos 756: }
1.1 jtc 757:
1.39 christos 758: /*
759: - p_re - Top level parser, concatenation and BRE anchoring
760: == static void p_re(struct parse *p, int end1, int end2);
761: * Giving end1 as OUT essentially eliminates the end1/end2 check.
762: *
763: * This implementation is a bit of a kludge, in that a trailing $ is first
764: * taken as an ordinary character and then revised to be an anchor.
765: * The amount of lookahead needed to avoid this kludge is excessive.
766: */
767: static void
768: p_re(struct parse *p,
769: int end1, /* first terminating character */
770: int end2) /* second terminating character; ignored for EREs */
771: {
772: struct branchc bc;
773:
774: bc.nbranch = 0;
775: if (end1 == OUT && end2 == OUT)
776: bc.outer = true;
777: else
778: bc.outer = false;
779: #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
780: for (;;) {
781: bc.start = HERE();
782: bc.nchain = 0;
783: bc.terminate = false;
784: if (p->pre_parse != NULL)
785: p->pre_parse(p, &bc);
786: while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
787: bc.terminate = p->parse_expr(p, &bc);
788: ++bc.nchain;
789: }
790: if (p->post_parse != NULL)
791: p->post_parse(p, &bc);
792: (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
793: #ifdef REGEX_GNU_EXTENSIONS
794: if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc))
795: break;
796: #endif
797: if (!p->allowbranch)
798: break;
799: /*
800: * p_branch_do's return value indicates whether we should
801: * continue parsing or not. This is both for correctness and
802: * a slight optimization, because it will check if we've
803: * encountered an empty branch or the end of the string
804: * immediately following a branch delimiter.
805: */
806: if (!p_branch_do(p, &bc))
807: break;
808: }
809: #undef SEE_END
810: if (p->allowbranch)
811: p_branch_fix_tail(p, &bc);
812: assert(!MORE() || SEE(end1));
1.1 jtc 813: }
814:
815: /*
816: - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
1.39 christos 817: == static bool p_simp_re(struct parse *p, struct branchc *bc);
1.1 jtc 818: */
1.39 christos 819: static bool /* was the simple RE an unbackslashed $? */
820: p_simp_re(struct parse *p, struct branchc *bc)
1.1 jtc 821: {
1.9 perry 822: int c;
1.39 christos 823: int cc; /* convenient/control character */
1.9 perry 824: int count;
825: int count2;
1.39 christos 826: sopno pos;
827: bool handled;
1.40 christos 828: size_t i;
1.39 christos 829: wint_t wc;
1.9 perry 830: sopno subno;
1.1 jtc 831: # define BACKSL (1<<CHAR_BIT)
832:
1.39 christos 833: pos = HERE(); /* repetition op, if any, covers from here */
834: handled = false;
1.1 jtc 835:
836: assert(MORE()); /* caller should have ensured this */
837: c = GETNEXT();
838: if (c == '\\') {
1.39 christos 839: (void)REQUIRE(MORE(), REG_EESCAPE);
840: cc = GETNEXT();
841: c = BACKSL | cc;
842: #ifdef REGEX_GNU_EXTENSIONS
843: if (p->gnuext) {
844: handled = true;
845: switch (c) {
846: case BACKSL|'`':
847: EMIT(OBOS, 0);
848: break;
849: case BACKSL|'\'':
850: EMIT(OEOS, 0);
851: break;
852: case BACKSL|'B':
853: EMIT(ONWBND, 0);
854: break;
855: case BACKSL|'b':
856: EMIT(OWBND, 0);
857: break;
858: case BACKSL|'W':
859: case BACKSL|'w':
860: case BACKSL|'S':
861: case BACKSL|'s':
862: p_b_pseudoclass(p, cc);
863: break;
864: default:
865: handled = false;
866: }
867: }
868: #endif
1.1 jtc 869: }
1.39 christos 870: if (!handled) {
871: switch (c) {
872: case '.':
873: if (p->g->cflags®_NEWLINE)
874: nonnewline(p);
875: else
876: EMIT(OANY, 0);
877: break;
878: case '[':
879: p_bracket(p);
880: break;
881: case BACKSL|'<':
882: EMIT(OBOW, 0);
883: break;
884: case BACKSL|'>':
885: EMIT(OEOW, 0);
886: break;
887: case BACKSL|'{':
888: SETERROR(REG_BADRPT);
889: break;
890: case BACKSL|'(':
891: p->g->nsub++;
1.40 christos 892: subno = (sopno)p->g->nsub;
1.39 christos 893: if (subno < NPAREN)
894: p->pbegin[subno] = HERE();
895: EMIT(OLPAREN, subno);
896: /* the MORE here is an error heuristic */
897: if (MORE() && !SEETWO('\\', ')'))
898: p_re(p, '\\', ')');
899: if (subno < NPAREN) {
900: p->pend[subno] = HERE();
901: assert(p->pend[subno] != 0);
902: }
903: EMIT(ORPAREN, subno);
904: (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
905: break;
906: case BACKSL|')': /* should not get here -- must be user */
907: SETERROR(REG_EPAREN);
908: break;
909: case BACKSL|'1':
910: case BACKSL|'2':
911: case BACKSL|'3':
912: case BACKSL|'4':
913: case BACKSL|'5':
914: case BACKSL|'6':
915: case BACKSL|'7':
916: case BACKSL|'8':
917: case BACKSL|'9':
918: i = (c&~BACKSL) - '0';
919: assert(i < NPAREN);
920: if (p->pend[i] != 0) {
921: assert(i <= p->g->nsub);
922: EMIT(OBACK_, i);
923: assert(p->pbegin[i] != 0);
924: assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
925: assert(OP(p->strip[p->pend[i]]) == ORPAREN);
926: (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
927: EMIT(O_BACK, i);
928: } else
929: SETERROR(REG_ESUBREG);
930: p->g->backrefs = 1;
931: break;
932: case '*':
933: /*
934: * Ordinary if used as the first character beyond BOL anchor of
935: * a (sub-)expression, counts as a bad repetition operator if it
936: * appears otherwise.
937: */
938: (void)REQUIRE(bc->nchain == 0, REG_BADRPT);
939: /* FALLTHROUGH */
940: default:
941: if (p->error != 0)
942: return (false); /* Definitely not $... */
943: p->next--;
944: wc = WGETNEXT();
945: if ((c & BACKSL) == 0 || may_escape(p, wc))
946: ordinary(p, wc);
947: else
948: SETERROR(REG_EESCAPE);
949: break;
1.1 jtc 950: }
951: }
952:
953: if (EAT('*')) { /* implemented as +? */
1.4 jtc 954: /* this case does not require the (y|) trick, noKLUDGE */
1.1 jtc 955: INSERT(OPLUS_, pos);
956: ASTERN(O_PLUS, pos);
957: INSERT(OQUEST_, pos);
958: ASTERN(O_QUEST, pos);
1.39 christos 959: #ifdef REGEX_GNU_EXTENSIONS
960: } else if (p->gnuext && EATTWO('\\', '?')) {
961: INSERT(OQUEST_, pos);
962: ASTERN(O_QUEST, pos);
963: } else if (p->gnuext && EATTWO('\\', '+')) {
964: INSERT(OPLUS_, pos);
965: ASTERN(O_PLUS, pos);
966: #endif
1.1 jtc 967: } else if (EATTWO('\\', '{')) {
968: count = p_count(p);
969: if (EAT(',')) {
1.39 christos 970: if (MORE() && isdigit((uch)PEEK())) {
1.1 jtc 971: count2 = p_count(p);
1.39 christos 972: (void)REQUIRE(count <= count2, REG_BADBR);
1.1 jtc 973: } else /* single number with comma */
974: count2 = INFINITY;
975: } else /* just a single number */
976: count2 = count;
1.39 christos 977: repeat(p, pos, count, count2);
1.1 jtc 978: if (!EATTWO('\\', '}')) { /* error heuristics */
979: while (MORE() && !SEETWO('\\', '}'))
980: NEXT();
1.39 christos 981: (void)REQUIRE(MORE(), REG_EBRACE);
1.1 jtc 982: SETERROR(REG_BADBR);
983: }
1.39 christos 984: } else if (c == '$') /* $ (but not \$) ends it */
985: return (true);
1.1 jtc 986:
1.39 christos 987: return (false);
1.1 jtc 988: }
989:
990: /*
991: - p_count - parse a repetition count
1.9 perry 992: == static int p_count(struct parse *p);
1.1 jtc 993: */
994: static int /* the value */
1.39 christos 995: p_count(struct parse *p)
1.1 jtc 996: {
1.9 perry 997: int count = 0;
998: int ndigits = 0;
1.1 jtc 999:
1.39 christos 1000: while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
1.1 jtc 1001: count = count*10 + (GETNEXT() - '0');
1002: ndigits++;
1003: }
1004:
1.39 christos 1005: (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
1.1 jtc 1006: return(count);
1007: }
1008:
1009: /*
1010: - p_bracket - parse a bracketed character list
1.9 perry 1011: == static void p_bracket(struct parse *p);
1.1 jtc 1012: */
1013: static void
1.39 christos 1014: p_bracket(struct parse *p)
1.1 jtc 1015: {
1.14 lukem 1016: cset *cs;
1.39 christos 1017: wint_t ch;
1.14 lukem 1018:
1.1 jtc 1019: /* Dept of Truly Sickening Special-Case Kludges */
1.39 christos 1020: if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
1.1 jtc 1021: EMIT(OBOW, 0);
1022: NEXTn(6);
1023: return;
1024: }
1.39 christos 1025: if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
1.1 jtc 1026: EMIT(OEOW, 0);
1027: NEXTn(6);
1028: return;
1029: }
1030:
1.39 christos 1031: if ((cs = allocset(p)) == NULL)
1032: return;
1033:
1034: if (p->g->cflags®_ICASE)
1035: cs->icase = 1;
1.1 jtc 1036: if (EAT('^'))
1.39 christos 1037: cs->invert = 1;
1.1 jtc 1038: if (EAT(']'))
1.39 christos 1039: CHadd(p, cs, ']');
1.1 jtc 1040: else if (EAT('-'))
1.39 christos 1041: CHadd(p, cs, '-');
1.1 jtc 1042: while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
1043: p_b_term(p, cs);
1044: if (EAT('-'))
1.39 christos 1045: CHadd(p, cs, '-');
1046: (void)MUSTEAT(']', REG_EBRACK);
1.1 jtc 1047:
1048: if (p->error != 0) /* don't mess things up further */
1049: return;
1050:
1.39 christos 1051: if (cs->invert && p->g->cflags®_NEWLINE)
1052: cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
1.1 jtc 1053:
1.39 christos 1054: if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
1055: ordinary(p, ch);
1056: freeset(p, cs);
1057: } else
1.40 christos 1058: EMIT(OANYOF, (size_t)(cs - p->g->sets));
1.39 christos 1059: }
1.1 jtc 1060:
1.39 christos 1061: static int
1062: p_range_cmp(wchar_t c1, wchar_t c2)
1063: {
1064: #ifdef REGEX_LIBC_COLLATE
1065: return __wcollate_range_cmp(c1, c2);
1066: #else
1067: /* Copied from libc/collate __wcollate_range_cmp */
1068: wchar_t s1[2], s2[2];
1.1 jtc 1069:
1.39 christos 1070: s1[0] = c1;
1071: s1[1] = L'\0';
1072: s2[0] = c2;
1073: s2[1] = L'\0';
1074: return (wcscoll(s1, s2));
1075: #endif
1.1 jtc 1076: }
1077:
1078: /*
1079: - p_b_term - parse one term of a bracketed character list
1.9 perry 1080: == static void p_b_term(struct parse *p, cset *cs);
1.1 jtc 1081: */
1082: static void
1.39 christos 1083: p_b_term(struct parse *p, cset *cs)
1.1 jtc 1084: {
1.9 perry 1085: char c;
1.39 christos 1086: wint_t start, finish;
1087: wint_t i;
1088: #ifdef REGEX_LIBC_COLLATE
1089: struct xlocale_collate *table =
1090: (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
1091: #endif
1.1 jtc 1092:
1.14 lukem 1093: _DIAGASSERT(p != NULL);
1094: _DIAGASSERT(cs != NULL);
1095:
1.1 jtc 1096: /* classify what we've got */
1097: switch ((MORE()) ? PEEK() : '\0') {
1098: case '[':
1099: c = (MORE2()) ? PEEK2() : '\0';
1100: break;
1101: case '-':
1102: SETERROR(REG_ERANGE);
1103: return; /* NOTE RETURN */
1104: default:
1105: c = '\0';
1106: break;
1107: }
1108:
1109: switch (c) {
1110: case ':': /* character class */
1111: NEXT2();
1.39 christos 1112: (void)REQUIRE(MORE(), REG_EBRACK);
1.1 jtc 1113: c = PEEK();
1.39 christos 1114: (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
1.1 jtc 1115: p_b_cclass(p, cs);
1.39 christos 1116: (void)REQUIRE(MORE(), REG_EBRACK);
1117: (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
1.1 jtc 1118: break;
1119: case '=': /* equivalence class */
1120: NEXT2();
1.39 christos 1121: (void)REQUIRE(MORE(), REG_EBRACK);
1.1 jtc 1122: c = PEEK();
1.39 christos 1123: (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
1.1 jtc 1124: p_b_eclass(p, cs);
1.39 christos 1125: (void)REQUIRE(MORE(), REG_EBRACK);
1126: (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
1.1 jtc 1127: break;
1128: default: /* symbol, ordinary character, or range */
1129: start = p_b_symbol(p);
1130: if (SEE('-') && MORE2() && PEEK2() != ']') {
1131: /* range */
1132: NEXT();
1133: if (EAT('-'))
1134: finish = '-';
1135: else
1136: finish = p_b_symbol(p);
1137: } else
1138: finish = start;
1.39 christos 1139: if (start == finish)
1140: CHadd(p, cs, start);
1141: else {
1142: #ifdef REGEX_LIBC_COLLATE
1143: if (table->__collate_load_error || MB_CUR_MAX > 1) {
1144: #else
1145: if (MB_CUR_MAX > 1) {
1146: #endif
1147: (void)REQUIRE(start <= finish, REG_ERANGE);
1148: CHaddrange(p, cs, start, finish);
1149: } else {
1150: (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
1151: for (i = 0; i <= UCHAR_MAX; i++) {
1152: if (p_range_cmp(start, i) <= 0 &&
1153: p_range_cmp(i, finish) <= 0 )
1154: CHadd(p, cs, i);
1155: }
1156: }
1157: }
1158: break;
1159: }
1160: }
1161:
1162: #ifdef REGEX_GNU_EXTENSIONS
1163: /*
1164: - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
1165: == static int p_b_pseudoclass(struct parse *p, char c)
1166: */
1167: static int
1168: p_b_pseudoclass(struct parse *p, char c) {
1169: cset *cs;
1170:
1171: if ((cs = allocset(p)) == NULL)
1172: return(0);
1173:
1174: if (p->g->cflags®_ICASE)
1175: cs->icase = 1;
1176:
1177: switch (c) {
1178: case 'W':
1179: cs->invert = 1;
1180: /* FALLTHROUGH */
1181: case 'w':
1182: p_b_cclass_named(p, cs, "alnum");
1.1 jtc 1183: break;
1.39 christos 1184: case 'S':
1185: cs->invert = 1;
1186: /* FALLTHROUGH */
1187: case 's':
1188: p_b_cclass_named(p, cs, "space");
1189: break;
1190: default:
1191: return(0);
1.1 jtc 1192: }
1.39 christos 1193:
1.40 christos 1194: EMIT(OANYOF, (size_t)(cs - p->g->sets));
1.39 christos 1195: return(1);
1.1 jtc 1196: }
1.39 christos 1197: #endif
1.1 jtc 1198:
1199: /*
1200: - p_b_cclass - parse a character-class name and deal with it
1.9 perry 1201: == static void p_b_cclass(struct parse *p, cset *cs);
1.1 jtc 1202: */
1203: static void
1.39 christos 1204: p_b_cclass(struct parse *p, cset *cs)
1.1 jtc 1205: {
1.39 christos 1206: const char *sp = p->next;
1.9 perry 1207: size_t len;
1.39 christos 1208: char clname[16];
1.1 jtc 1209:
1.39 christos 1210: while (MORE() && isalpha((uch)PEEK()))
1.1 jtc 1211: NEXT();
1212: len = p->next - sp;
1.39 christos 1213: if (len >= sizeof(clname) - 1) {
1.1 jtc 1214: SETERROR(REG_ECTYPE);
1215: return;
1216: }
1.39 christos 1217: memcpy(clname, sp, len);
1218: clname[len] = '\0';
1219:
1220: p_b_cclass_named(p, cs, clname);
1221: }
1222: /*
1223: - p_b_cclass_named - deal with a named character class
1224: == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
1225: */
1226: static void
1227: p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
1228: wctype_t wct;
1.1 jtc 1229:
1.39 christos 1230: if ((wct = wctype(clname)) == 0) {
1231: SETERROR(REG_ECTYPE);
1232: return;
1233: }
1234: CHaddtype(p, cs, wct);
1.1 jtc 1235: }
1236:
1237: /*
1238: - p_b_eclass - parse an equivalence-class name and deal with it
1.9 perry 1239: == static void p_b_eclass(struct parse *p, cset *cs);
1.1 jtc 1240: *
1241: * This implementation is incomplete. xxx
1242: */
1243: static void
1.39 christos 1244: p_b_eclass(struct parse *p, cset *cs)
1.1 jtc 1245: {
1.39 christos 1246: wint_t c;
1.1 jtc 1247:
1.14 lukem 1248: _DIAGASSERT(p != NULL);
1249: _DIAGASSERT(cs != NULL);
1250:
1.1 jtc 1251: c = p_b_coll_elem(p, '=');
1.39 christos 1252: CHadd(p, cs, c);
1.1 jtc 1253: }
1254:
1255: /*
1256: - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1.39 christos 1257: == static wint_t p_b_symbol(struct parse *p);
1.1 jtc 1258: */
1.39 christos 1259: static wint_t /* value of symbol */
1260: p_b_symbol(struct parse *p)
1.1 jtc 1261: {
1.39 christos 1262: wint_t value;
1.1 jtc 1263:
1.14 lukem 1264: _DIAGASSERT(p != NULL);
1265:
1.39 christos 1266: (void)REQUIRE(MORE(), REG_EBRACK);
1.1 jtc 1267: if (!EATTWO('[', '.'))
1.39 christos 1268: return(WGETNEXT());
1.1 jtc 1269:
1270: /* collating symbol */
1271: value = p_b_coll_elem(p, '.');
1.39 christos 1272: (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1.1 jtc 1273: return(value);
1274: }
1275:
1276: /*
1277: - p_b_coll_elem - parse a collating-element name and look it up
1.39 christos 1278: == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1.1 jtc 1279: */
1.39 christos 1280: static wint_t /* value of collating element */
1281: p_b_coll_elem(struct parse *p,
1282: wint_t endc) /* name ended by endc,']' */
1283: {
1284: const char *sp = p->next;
1285: struct cname *cp;
1286: mbstate_t mbs;
1287: wchar_t wc;
1288: size_t clen, len;
1.1 jtc 1289:
1.14 lukem 1290: _DIAGASSERT(p != NULL);
1291:
1.1 jtc 1292: while (MORE() && !SEETWO(endc, ']'))
1293: NEXT();
1294: if (!MORE()) {
1295: SETERROR(REG_EBRACK);
1296: return(0);
1297: }
1298: len = p->next - sp;
1299: for (cp = cnames; cp->name != NULL; cp++)
1.37 christos 1300: if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1.1 jtc 1301: return(cp->code); /* known name */
1.39 christos 1302: memset(&mbs, 0, sizeof(mbs));
1303: if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1304: return (wc); /* single character */
1305: else if (clen == (size_t)-1 || clen == (size_t)-2)
1306: SETERROR(REG_ILLSEQ);
1307: else
1308: SETERROR(REG_ECOLLATE); /* neither */
1.1 jtc 1309: return(0);
1310: }
1311:
1312: /*
1.39 christos 1313: - may_escape - determine whether 'ch' is escape-able in the current context
1314: == static int may_escape(struct parse *p, const wint_t ch)
1315: */
1316: static bool
1317: may_escape(struct parse *p, const wint_t ch)
1318: {
1319:
1320: if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
1321: return (true);
1322: if (isalpha(ch) || ch == '\'' || ch == '`')
1323: return (false);
1324: return (true);
1325: #ifdef NOTYET
1326: /*
1327: * Build a whitelist of characters that may be escaped to produce an
1328: * ordinary in the current context. This assumes that these have not
1329: * been otherwise interpreted as a special character. Escaping an
1330: * ordinary character yields undefined results according to
1331: * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
1332: * advantage of this and use escaped ordinary characters to provide
1333: * special meaning, e.g. \b, \B, \w, \W, \s, \S.
1334: */
1335: switch(ch) {
1336: case '|':
1337: case '+':
1338: case '?':
1339: /* The above characters may not be escaped in BREs */
1340: if (!(p->g->cflags®_EXTENDED))
1341: return (false);
1342: /* Fallthrough */
1343: case '(':
1344: case ')':
1345: case '{':
1346: case '}':
1347: case '.':
1348: case '[':
1349: case ']':
1350: case '\\':
1351: case '*':
1352: case '^':
1353: case '$':
1354: return (true);
1355: default:
1356: return (false);
1357: }
1358: #endif
1359: }
1360:
1361: /*
1.1 jtc 1362: - othercase - return the case counterpart of an alphabetic
1.39 christos 1363: == static wint_t othercase(wint_t ch);
1.1 jtc 1364: */
1.39 christos 1365: static wint_t /* if no counterpart, return ch */
1366: othercase(wint_t ch)
1367: {
1368: assert(iswalpha(ch));
1369: if (iswupper(ch))
1370: return(towlower(ch));
1371: else if (iswlower(ch))
1372: return(towupper(ch));
1.1 jtc 1373: else /* peculiar, but could happen */
1374: return(ch);
1375: }
1376:
1377: /*
1378: - bothcases - emit a dualcase version of a two-case character
1.39 christos 1379: == static void bothcases(struct parse *p, wint_t ch);
1.1 jtc 1380: *
1381: * Boy, is this implementation ever a kludge...
1382: */
1383: static void
1.39 christos 1384: bothcases(struct parse *p, wint_t ch)
1.1 jtc 1385: {
1.39 christos 1386: const char *oldnext = p->next;
1387: const char *oldend = p->end;
1388: char bracket[3 + MB_LEN_MAX];
1389: size_t n;
1390: mbstate_t mbs;
1.1 jtc 1391:
1.14 lukem 1392: _DIAGASSERT(p != NULL);
1393:
1.1 jtc 1394: assert(othercase(ch) != ch); /* p_bracket() would recurse */
1395: p->next = bracket;
1.39 christos 1396: memset(&mbs, 0, sizeof(mbs));
1397: n = wcrtomb(bracket, ch, &mbs);
1398: assert(n != (size_t)-1);
1399: bracket[n] = ']';
1400: bracket[n + 1] = '\0';
1401: p->end = bracket+n+1;
1.1 jtc 1402: p_bracket(p);
1.39 christos 1403: assert(p->next == p->end);
1.1 jtc 1404: p->next = oldnext;
1405: p->end = oldend;
1406: }
1407:
1408: /*
1409: - ordinary - emit an ordinary character
1.39 christos 1410: == static void ordinary(struct parse *p, wint_t ch);
1.1 jtc 1411: */
1412: static void
1.39 christos 1413: ordinary(struct parse *p, wint_t ch)
1.1 jtc 1414: {
1.39 christos 1415: cset *cs;
1.1 jtc 1416:
1.14 lukem 1417: _DIAGASSERT(p != NULL);
1418:
1.39 christos 1419: if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1420: bothcases(p, ch);
1.40 christos 1421: else if ((wint_t)(ch & OPDMASK) == ch)
1422: EMIT(OCHAR, (size_t)ch);
1.1 jtc 1423: else {
1.39 christos 1424: /*
1425: * Kludge: character is too big to fit into an OCHAR operand.
1426: * Emit a singleton set.
1427: */
1428: if ((cs = allocset(p)) == NULL)
1429: return;
1430: CHadd(p, cs, ch);
1.40 christos 1431: EMIT(OANYOF, (size_t)(cs - p->g->sets));
1.1 jtc 1432: }
1433: }
1434:
1435: /*
1436: - nonnewline - emit REG_NEWLINE version of OANY
1.9 perry 1437: == static void nonnewline(struct parse *p);
1.1 jtc 1438: *
1439: * Boy, is this implementation ever a kludge...
1440: */
1441: static void
1.39 christos 1442: nonnewline(struct parse *p)
1.1 jtc 1443: {
1.39 christos 1444: const char *oldnext = p->next;
1445: const char *oldend = p->end;
1.1 jtc 1446: char bracket[4];
1447:
1.14 lukem 1448: _DIAGASSERT(p != NULL);
1449:
1.1 jtc 1450: p->next = bracket;
1451: p->end = bracket+3;
1452: bracket[0] = '^';
1453: bracket[1] = '\n';
1454: bracket[2] = ']';
1455: bracket[3] = '\0';
1456: p_bracket(p);
1457: assert(p->next == bracket+3);
1458: p->next = oldnext;
1459: p->end = oldend;
1460: }
1461:
1462: /*
1463: - repeat - generate code for a bounded repetition, recursively if needed
1.39 christos 1464: == static void repeat(struct parse *p, sopno start, int from, int to);
1.1 jtc 1465: */
1466: static void
1.39 christos 1467: repeat(struct parse *p,
1468: sopno start, /* operand from here to end of strip */
1469: int from, /* repeated from this number */
1470: int to) /* to this number of times (maybe INFINITY) */
1.1 jtc 1471: {
1.39 christos 1472: sopno finish = HERE();
1.1 jtc 1473: # define N 2
1474: # define INF 3
1475: # define REP(f, t) ((f)*8 + (t))
1476: # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1.9 perry 1477: sopno copy;
1.1 jtc 1478:
1.14 lukem 1479: _DIAGASSERT(p != NULL);
1480:
1.39 christos 1481: if (p->error != 0) /* head off possible runaway recursion */
1.30 christos 1482: return;
1483:
1.1 jtc 1484: assert(from <= to);
1485:
1486: switch (REP(MAP(from), MAP(to))) {
1487: case REP(0, 0): /* must be user doing this */
1488: DROP(finish-start); /* drop the operand */
1489: break;
1490: case REP(0, 1): /* as x{1,1}? */
1491: case REP(0, N): /* as x{1,n}? */
1492: case REP(0, INF): /* as x{1,}? */
1.4 jtc 1493: /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1494: INSERT(OCH_, start); /* offset is wrong... */
1.39 christos 1495: repeat(p, start+1, 1, to);
1.4 jtc 1496: ASTERN(OOR1, start);
1.1 jtc 1497: AHEAD(start); /* ... fix it */
1.4 jtc 1498: EMIT(OOR2, 0);
1499: AHEAD(THERE());
1500: ASTERN(O_CH, THERETHERE());
1.1 jtc 1501: break;
1502: case REP(1, 1): /* trivial case */
1503: /* done */
1504: break;
1505: case REP(1, N): /* as x?x{1,n-1} */
1.4 jtc 1506: /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1507: INSERT(OCH_, start);
1508: ASTERN(OOR1, start);
1509: AHEAD(start);
1510: EMIT(OOR2, 0); /* offset very wrong... */
1511: AHEAD(THERE()); /* ...so fix it */
1512: ASTERN(O_CH, THERETHERE());
1.1 jtc 1513: copy = dupl(p, start+1, finish+1);
1.4 jtc 1514: assert(copy == finish+4);
1.39 christos 1515: repeat(p, copy, 1, to-1);
1.1 jtc 1516: break;
1517: case REP(1, INF): /* as x+ */
1518: INSERT(OPLUS_, start);
1519: ASTERN(O_PLUS, start);
1520: break;
1521: case REP(N, N): /* as xx{m-1,n-1} */
1522: copy = dupl(p, start, finish);
1.39 christos 1523: repeat(p, copy, from-1, to-1);
1.1 jtc 1524: break;
1525: case REP(N, INF): /* as xx{n-1,INF} */
1526: copy = dupl(p, start, finish);
1.39 christos 1527: repeat(p, copy, from-1, to);
1.1 jtc 1528: break;
1529: default: /* "can't happen" */
1530: SETERROR(REG_ASSERT); /* just in case */
1531: break;
1532: }
1533: }
1534:
1535: /*
1.39 christos 1536: - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1537: - character from the parse struct, signals a REG_ILLSEQ error if the
1538: - character can't be converted. Returns the number of bytes consumed.
1539: */
1540: static wint_t
1541: wgetnext(struct parse *p)
1542: {
1543: mbstate_t mbs;
1544: wchar_t wc;
1545: size_t n;
1546:
1547: memset(&mbs, 0, sizeof(mbs));
1.40 christos 1548: n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs);
1.39 christos 1549: if (n == (size_t)-1 || n == (size_t)-2) {
1550: SETERROR(REG_ILLSEQ);
1551: return (0);
1552: }
1553: if (n == 0)
1554: n = 1;
1555: p->next += n;
1556: return (wc);
1557: }
1558:
1559: /*
1.1 jtc 1560: - seterr - set an error condition
1.9 perry 1561: == static int seterr(struct parse *p, int e);
1.1 jtc 1562: */
1563: static int /* useless but makes type checking happy */
1.39 christos 1564: seterr(struct parse *p, int e)
1.1 jtc 1565: {
1.14 lukem 1566:
1567: _DIAGASSERT(p != NULL);
1568:
1.1 jtc 1569: if (p->error == 0) /* keep earliest error condition */
1570: p->error = e;
1571: p->next = nuls; /* try to bring things to a halt */
1572: p->end = nuls;
1573: return(0); /* make the return value well-defined */
1574: }
1575:
1576: /*
1577: - allocset - allocate a set of characters for []
1.9 perry 1578: == static cset *allocset(struct parse *p);
1.1 jtc 1579: */
1580: static cset *
1.39 christos 1581: allocset(struct parse *p)
1.1 jtc 1582: {
1.39 christos 1583: cset *cs, *ncs;
1.1 jtc 1584:
1.14 lukem 1585: _DIAGASSERT(p != NULL);
1586:
1.39 christos 1587: ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1588: if (ncs == NULL) {
1589: SETERROR(REG_ESPACE);
1590: return (NULL);
1.1 jtc 1591: }
1.39 christos 1592: p->g->sets = ncs;
1593: cs = &p->g->sets[p->g->ncsets++];
1594: memset(cs, 0, sizeof(*cs));
1.1 jtc 1595:
1596: return(cs);
1597: }
1598:
1599: /*
1600: - freeset - free a now-unused set
1.9 perry 1601: == static void freeset(struct parse *p, cset *cs);
1.1 jtc 1602: */
1603: static void
1.39 christos 1604: freeset(struct parse *p, cset *cs)
1.1 jtc 1605: {
1.14 lukem 1606: cset *top;
1607:
1608: _DIAGASSERT(p != NULL);
1609: _DIAGASSERT(cs != NULL);
1610:
1611: top = &p->g->sets[p->g->ncsets];
1.1 jtc 1612:
1.39 christos 1613: free(cs->wides);
1614: free(cs->ranges);
1615: free(cs->types);
1616: memset(cs, 0, sizeof(*cs));
1.1 jtc 1617: if (cs == top-1) /* recover only the easy case */
1618: p->g->ncsets--;
1619: }
1620:
1621: /*
1.39 christos 1622: - singleton - Determine whether a set contains only one character,
1623: - returning it if so, otherwise returning OUT.
1.1 jtc 1624: */
1.39 christos 1625: static wint_t
1626: singleton(cset *cs)
1.1 jtc 1627: {
1.39 christos 1628: wint_t i, s, n;
1.1 jtc 1629:
1.39 christos 1630: for (i = n = 0; i < NC; i++)
1631: if (CHIN(cs, i)) {
1.1 jtc 1632: n++;
1.39 christos 1633: s = i;
1634: }
1635: if (n == 1)
1636: return (s);
1637: if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1638: cs->icase == 0)
1639: return (cs->wides[0]);
1640: /* Don't bother handling the other cases. */
1641: return (OUT);
1.1 jtc 1642: }
1643:
1644: /*
1.39 christos 1645: - CHadd - add character to character set.
1.1 jtc 1646: */
1647: static void
1.39 christos 1648: CHadd(struct parse *p, cset *cs, wint_t ch)
1.1 jtc 1649: {
1.39 christos 1650: wint_t nch, *newwides;
1.14 lukem 1651:
1652: _DIAGASSERT(p != NULL);
1653: _DIAGASSERT(cs != NULL);
1654:
1.39 christos 1655: assert(ch >= 0);
1656: if (ch < NC)
1.40 christos 1657: cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7);
1.39 christos 1658: else {
1659: newwides = reallocarray(cs->wides, cs->nwides + 1,
1660: sizeof(*cs->wides));
1661: if (newwides == NULL) {
1662: SETERROR(REG_ESPACE);
1663: return;
1664: }
1665: cs->wides = newwides;
1666: cs->wides[cs->nwides++] = ch;
1667: }
1668: if (cs->icase) {
1669: if ((nch = towlower(ch)) < NC)
1.40 christos 1670: cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1.39 christos 1671: if ((nch = towupper(ch)) < NC)
1.40 christos 1672: cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1.1 jtc 1673: }
1674: }
1675:
1676: /*
1.39 christos 1677: - CHaddrange - add all characters in the range [min,max] to a character set.
1.1 jtc 1678: */
1679: static void
1.39 christos 1680: CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1.1 jtc 1681: {
1.39 christos 1682: crange *newranges;
1.14 lukem 1683:
1.39 christos 1684: _DIAGASSERT(p != NULL);
1.14 lukem 1685: _DIAGASSERT(cs != NULL);
1686:
1.39 christos 1687: for (; min < NC && min <= max; min++)
1688: CHadd(p, cs, min);
1689: if (min >= max)
1690: return;
1691: newranges = reallocarray(cs->ranges, cs->nranges + 1,
1692: sizeof(*cs->ranges));
1693: if (newranges == NULL) {
1694: SETERROR(REG_ESPACE);
1.1 jtc 1695: return;
1696: }
1.39 christos 1697: cs->ranges = newranges;
1698: cs->ranges[cs->nranges].min = min;
1699: cs->ranges[cs->nranges].max = max;
1700: cs->nranges++;
1.1 jtc 1701: }
1702:
1703: /*
1.39 christos 1704: - CHaddtype - add all characters of a certain type to a character set.
1.1 jtc 1705: */
1706: static void
1.39 christos 1707: CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1.1 jtc 1708: {
1.39 christos 1709: wint_t i;
1710: wctype_t *newtypes;
1.14 lukem 1711:
1712: _DIAGASSERT(p != NULL);
1713: _DIAGASSERT(cs != NULL);
1714:
1.39 christos 1715: for (i = 0; i < NC; i++)
1716: if (iswctype(i, wct))
1717: CHadd(p, cs, i);
1718: newtypes = reallocarray(cs->types, cs->ntypes + 1,
1719: sizeof(*cs->types));
1720: if (newtypes == NULL) {
1721: SETERROR(REG_ESPACE);
1.1 jtc 1722: return;
1.39 christos 1723: }
1724: cs->types = newtypes;
1725: cs->types[cs->ntypes++] = wct;
1.1 jtc 1726: }
1727:
1728: /*
1729: - dupl - emit a duplicate of a bunch of sops
1.9 perry 1730: == static sopno dupl(struct parse *p, sopno start, sopno finish);
1.1 jtc 1731: */
1732: static sopno /* start of duplicate */
1.39 christos 1733: dupl(struct parse *p,
1734: sopno start, /* from here */
1735: sopno finish) /* to this less one */
1.1 jtc 1736: {
1.39 christos 1737: sopno ret = HERE();
1.9 perry 1738: sopno len = finish - start;
1.1 jtc 1739:
1.14 lukem 1740: _DIAGASSERT(p != NULL);
1741:
1.1 jtc 1742: assert(finish >= start);
1743: if (len == 0)
1744: return(ret);
1.39 christos 1745: if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1746: return(ret);
1.40 christos 1747: (void) memcpy(p->strip + p->slen,
1748: p->strip + start, len * sizeof(*p->strip));
1.1 jtc 1749: p->slen += len;
1750: return(ret);
1751: }
1752:
1753: /*
1754: - doemit - emit a strip operator
1.9 perry 1755: == static void doemit(struct parse *p, sop op, size_t opnd);
1.1 jtc 1756: *
1757: * It might seem better to implement this as a macro with a function as
1758: * hard-case backup, but it's just too big and messy unless there are
1759: * some changes to the data structures. Maybe later.
1760: */
1761: static void
1.39 christos 1762: doemit(struct parse *p, sop op, size_t opnd)
1.1 jtc 1763: {
1764: /* avoid making error situations worse */
1765: if (p->error != 0)
1766: return;
1767:
1.39 christos 1768: _DIAGASSERT(p != NULL);
1769:
1.1 jtc 1770: /* deal with oversize operands ("can't happen", more or less) */
1771: assert(opnd < 1<<OPSHIFT);
1772:
1773: /* deal with undersized strip */
1774: if (p->slen >= p->ssize)
1.30 christos 1775: if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1776: return;
1.1 jtc 1777:
1778: /* finally, it's all reduced to the easy case */
1.40 christos 1779: p->strip[p->slen++] = (sopno)SOP(op, opnd);
1.1 jtc 1780: }
1781:
1782: /*
1783: - doinsert - insert a sop into the strip
1.9 perry 1784: == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1.1 jtc 1785: */
1786: static void
1.39 christos 1787: doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1.1 jtc 1788: {
1.9 perry 1789: sopno sn;
1790: sop s;
1791: int i;
1.1 jtc 1792:
1.14 lukem 1793: _DIAGASSERT(p != NULL);
1794:
1.1 jtc 1795: /* avoid making error situations worse */
1796: if (p->error != 0)
1797: return;
1798:
1799: sn = HERE();
1800: EMIT(op, opnd); /* do checks, ensure space */
1801: assert(HERE() == sn+1);
1802: s = p->strip[sn];
1803:
1804: /* adjust paren pointers */
1805: assert(pos > 0);
1806: for (i = 1; i < NPAREN; i++) {
1807: if (p->pbegin[i] >= pos) {
1808: p->pbegin[i]++;
1809: }
1810: if (p->pend[i] >= pos) {
1811: p->pend[i]++;
1812: }
1813: }
1814:
1.40 christos 1815: memmove(&p->strip[pos+1], &p->strip[pos],
1816: (HERE()-pos-1)*sizeof(*p->strip));
1.1 jtc 1817: p->strip[pos] = s;
1818: }
1819:
1820: /*
1821: - dofwd - complete a forward reference
1.9 perry 1822: == static void dofwd(struct parse *p, sopno pos, sop value);
1.1 jtc 1823: */
1824: static void
1.39 christos 1825: dofwd(struct parse *p, sopno pos, sop value)
1.1 jtc 1826: {
1.14 lukem 1827:
1828: _DIAGASSERT(p != NULL);
1829:
1.1 jtc 1830: /* avoid making error situations worse */
1831: if (p->error != 0)
1832: return;
1833:
1834: assert(value < 1<<OPSHIFT);
1.39 christos 1835: p->strip[pos] = OP(p->strip[pos]) | value;
1.1 jtc 1836: }
1837:
1838: /*
1839: - enlarge - enlarge the strip
1.39 christos 1840: == static int enlarge(struct parse *p, sopno size);
1.1 jtc 1841: */
1.30 christos 1842: static int
1.35 joerg 1843: enlarge(struct parse *p, sopno size)
1.1 jtc 1844: {
1.39 christos 1845: sop *sp;
1846:
1.14 lukem 1847: _DIAGASSERT(p != NULL);
1848:
1.1 jtc 1849: if (p->ssize >= size)
1.30 christos 1850: return 1;
1.1 jtc 1851:
1.40 christos 1852: sp = reallocarray(p->strip, size, sizeof(*p->strip));
1.39 christos 1853: if (sp == NULL) {
1.1 jtc 1854: SETERROR(REG_ESPACE);
1.30 christos 1855: return 0;
1.1 jtc 1856: }
1.39 christos 1857: p->strip = sp;
1.35 joerg 1858: p->ssize = size;
1.30 christos 1859: return 1;
1.1 jtc 1860: }
1861:
1862: /*
1863: - stripsnug - compact the strip
1.9 perry 1864: == static void stripsnug(struct parse *p, struct re_guts *g);
1.1 jtc 1865: */
1866: static void
1.39 christos 1867: stripsnug(struct parse *p, struct re_guts *g)
1.1 jtc 1868: {
1.14 lukem 1869:
1870: _DIAGASSERT(p != NULL);
1871: _DIAGASSERT(g != NULL);
1872:
1.1 jtc 1873: g->nstates = p->slen;
1.40 christos 1874: g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip));
1.39 christos 1875: if (g->strip == NULL) {
1876: SETERROR(REG_ESPACE);
1877: g->strip = p->strip;
1878: }
1.1 jtc 1879: }
1880:
1881: /*
1882: - findmust - fill in must and mlen with longest mandatory literal string
1.9 perry 1883: == static void findmust(struct parse *p, struct re_guts *g);
1.1 jtc 1884: *
1885: * This algorithm could do fancy things like analyzing the operands of |
1886: * for common subsequences. Someday. This code is simple and finds most
1887: * of the interesting cases.
1888: *
1889: * Note that must and mlen got initialized during setup.
1890: */
1891: static void
1.39 christos 1892: findmust(struct parse *p, struct re_guts *g)
1.1 jtc 1893: {
1.9 perry 1894: sop *scan;
1.7 christos 1895: sop *start = NULL;
1.9 perry 1896: sop *newstart = NULL;
1897: sopno newlen;
1898: sop s;
1899: char *cp;
1.39 christos 1900: int offset;
1901: char buf[MB_LEN_MAX];
1902: size_t clen;
1903: mbstate_t mbs;
1.1 jtc 1904:
1.14 lukem 1905: _DIAGASSERT(p != NULL);
1906: _DIAGASSERT(g != NULL);
1907:
1.1 jtc 1908: /* avoid making error situations worse */
1909: if (p->error != 0)
1910: return;
1911:
1.39 christos 1912: #ifdef notyet
1913: /*
1914: * It's not generally safe to do a ``char'' substring search on
1915: * multibyte character strings, but it's safe for at least
1916: * UTF-8 (see RFC 3629).
1917: */
1918: if (MB_CUR_MAX > 1 &&
1919: strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1920: return;
1921: #endif
1922:
1.1 jtc 1923: /* find the longest OCHAR sequence in strip */
1924: newlen = 0;
1.39 christos 1925: offset = 0;
1926: g->moffset = 0;
1.1 jtc 1927: scan = g->strip + 1;
1928: do {
1929: s = *scan++;
1930: switch (OP(s)) {
1931: case OCHAR: /* sequence member */
1.39 christos 1932: if (newlen == 0) { /* new sequence */
1933: memset(&mbs, 0, sizeof(mbs));
1.1 jtc 1934: newstart = scan - 1;
1.39 christos 1935: }
1.40 christos 1936: clen = wcrtomb(buf, (int)OPND(s), &mbs);
1.39 christos 1937: if (clen == (size_t)-1)
1938: goto toohard;
1.40 christos 1939: newlen += (sopno)clen;
1.1 jtc 1940: break;
1941: case OPLUS_: /* things that don't break one */
1942: case OLPAREN:
1943: case ORPAREN:
1944: break;
1945: case OQUEST_: /* things that must be skipped */
1946: case OCH_:
1.39 christos 1947: offset = altoffset(scan, offset);
1.1 jtc 1948: scan--;
1949: do {
1950: scan += OPND(s);
1951: s = *scan;
1952: /* assert() interferes w debug printouts */
1.40 christos 1953: if (OP(s) != O_QUEST &&
1954: OP(s) != O_CH && OP(s) != OOR2) {
1.1 jtc 1955: g->iflags |= BAD;
1956: return;
1957: }
1.40 christos 1958: } while (OP(s) != O_QUEST && OP(s) != O_CH);
1.11 christos 1959: /* FALLTHROUGH */
1.39 christos 1960: case OBOW: /* things that break a sequence */
1961: case OEOW:
1962: case OBOL:
1963: case OEOL:
1964: case OBOS:
1965: case OEOS:
1966: case OWBND:
1967: case ONWBND:
1968: case O_QUEST:
1969: case O_CH:
1970: case OEND:
1971: if (newlen > (sopno)g->mlen) { /* ends one */
1.1 jtc 1972: start = newstart;
1973: g->mlen = newlen;
1.39 christos 1974: if (offset > -1) {
1975: g->moffset += offset;
1976: offset = newlen;
1977: } else
1978: g->moffset = offset;
1979: } else {
1980: if (offset > -1)
1981: offset += newlen;
1.1 jtc 1982: }
1983: newlen = 0;
1984: break;
1.39 christos 1985: case OANY:
1986: if (newlen > (sopno)g->mlen) { /* ends one */
1987: start = newstart;
1988: g->mlen = newlen;
1989: if (offset > -1) {
1990: g->moffset += offset;
1991: offset = newlen;
1992: } else
1993: g->moffset = offset;
1994: } else {
1995: if (offset > -1)
1996: offset += newlen;
1997: }
1998: if (offset > -1)
1999: offset++;
2000: newlen = 0;
2001: break;
2002: case OANYOF: /* may or may not invalidate offset */
2003: /* First, everything as OANY */
2004: if (newlen > (sopno)g->mlen) { /* ends one */
2005: start = newstart;
2006: g->mlen = newlen;
2007: if (offset > -1) {
2008: g->moffset += offset;
2009: offset = newlen;
2010: } else
2011: g->moffset = offset;
2012: } else {
2013: if (offset > -1)
2014: offset += newlen;
2015: }
2016: if (offset > -1)
2017: offset++;
2018: newlen = 0;
2019: break;
1.40 christos 2020: toohard:/*FALLTHROUGH*/
1.39 christos 2021: default:
2022: /* Anything here makes it impossible or too hard
2023: * to calculate the offset -- so we give up;
2024: * save the last known good offset, in case the
2025: * must sequence doesn't occur later.
2026: */
2027: if (newlen > (sopno)g->mlen) { /* ends one */
2028: start = newstart;
2029: g->mlen = newlen;
2030: if (offset > -1)
2031: g->moffset += offset;
2032: else
2033: g->moffset = offset;
2034: }
2035: offset = -1;
2036: newlen = 0;
2037: break;
1.1 jtc 2038: }
2039: } while (OP(s) != OEND);
2040:
1.39 christos 2041: if (g->mlen == 0) { /* there isn't one */
2042: g->moffset = -1;
1.1 jtc 2043: return;
1.39 christos 2044: }
1.1 jtc 2045:
2046: /* turn it into a character string */
2047: g->must = malloc((size_t)g->mlen + 1);
2048: if (g->must == NULL) { /* argh; just forget it */
2049: g->mlen = 0;
1.39 christos 2050: g->moffset = -1;
1.1 jtc 2051: return;
2052: }
2053: cp = g->must;
2054: scan = start;
1.39 christos 2055: memset(&mbs, 0, sizeof(mbs));
2056: while (cp < g->must + g->mlen) {
1.1 jtc 2057: while (OP(s = *scan++) != OCHAR)
2058: continue;
1.40 christos 2059: clen = wcrtomb(cp, (int)OPND(s), &mbs);
1.39 christos 2060: assert(clen != (size_t)-1);
2061: cp += clen;
1.1 jtc 2062: }
1.2 jtc 2063: assert(cp == g->must + g->mlen);
1.1 jtc 2064: *cp++ = '\0'; /* just on general principles */
2065: }
2066:
2067: /*
1.39 christos 2068: - altoffset - choose biggest offset among multiple choices
2069: == static int altoffset(sop *scan, int offset);
2070: *
2071: * Compute, recursively if necessary, the largest offset among multiple
2072: * re paths.
2073: */
2074: static int
2075: altoffset(sop *scan, int offset)
2076: {
2077: int largest;
2078: int try;
2079: sop s;
2080:
2081: _DIAGASSERT(scan != NULL);
2082:
2083: /* If we gave up already on offsets, return */
2084: if (offset == -1)
2085: return -1;
2086:
2087: largest = 0;
2088: try = 0;
2089: s = *scan++;
1.40 christos 2090: while (OP(s) != O_QUEST && OP(s) != O_CH) {
1.39 christos 2091: switch (OP(s)) {
2092: case OOR1:
2093: if (try > largest)
2094: largest = try;
2095: try = 0;
2096: break;
2097: case OQUEST_:
2098: case OCH_:
2099: try = altoffset(scan, try);
2100: if (try == -1)
2101: return -1;
2102: scan--;
2103: do {
2104: scan += OPND(s);
2105: s = *scan;
1.40 christos 2106: if (OP(s) != O_QUEST &&
2107: OP(s) != O_CH && OP(s) != OOR2)
1.39 christos 2108: return -1;
1.40 christos 2109: } while (OP(s) != O_QUEST && OP(s) != O_CH);
1.39 christos 2110: /* We must skip to the next position, or we'll
2111: * leave altoffset() too early.
2112: */
2113: scan++;
2114: break;
2115: case OANYOF:
2116: case OCHAR:
2117: case OANY:
2118: try++;
1.40 christos 2119: /*FALLTHROUGH*/
1.39 christos 2120: case OBOW:
2121: case OEOW:
2122: case OWBND:
2123: case ONWBND:
2124: case OLPAREN:
2125: case ORPAREN:
2126: case OOR2:
2127: break;
2128: default:
2129: try = -1;
2130: break;
2131: }
2132: if (try == -1)
2133: return -1;
2134: s = *scan++;
2135: }
2136:
2137: if (try > largest)
2138: largest = try;
2139:
2140: return largest+offset;
2141: }
2142:
2143: /*
2144: - computejumps - compute char jumps for BM scan
2145: == static void computejumps(struct parse *p, struct re_guts *g);
2146: *
2147: * This algorithm assumes g->must exists and is has size greater than
2148: * zero. It's based on the algorithm found on Computer Algorithms by
2149: * Sara Baase.
2150: *
2151: * A char jump is the number of characters one needs to jump based on
2152: * the value of the character from the text that was mismatched.
2153: */
2154: static void
2155: computejumps(struct parse *p, struct re_guts *g)
2156: {
2157: int ch;
2158: size_t mindex;
2159:
2160: _DIAGASSERT(p != NULL);
2161: _DIAGASSERT(g != NULL);
2162:
2163: /* Avoid making errors worse */
2164: if (p->error != 0)
2165: return;
2166:
2167: g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump));
2168: if (g->charjump == NULL) /* Not a fatal error */
2169: return;
2170: /* Adjust for signed chars, if necessary */
2171: g->charjump = &g->charjump[-(CHAR_MIN)];
2172:
2173: /* If the character does not exist in the pattern, the jump
2174: * is equal to the number of characters in the pattern.
2175: */
2176: for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
2177: g->charjump[ch] = g->mlen;
2178:
2179: /* If the character does exist, compute the jump that would
2180: * take us to the last character in the pattern equal to it
2181: * (notice that we match right to left, so that last character
2182: * is the first one that would be matched).
2183: */
2184: for (mindex = 0; mindex < g->mlen; mindex++)
2185: g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
2186: }
2187:
2188: /*
2189: - computematchjumps - compute match jumps for BM scan
2190: == static void computematchjumps(struct parse *p, struct re_guts *g);
2191: *
2192: * This algorithm assumes g->must exists and is has size greater than
2193: * zero. It's based on the algorithm found on Computer Algorithms by
2194: * Sara Baase.
2195: *
2196: * A match jump is the number of characters one needs to advance based
2197: * on the already-matched suffix.
2198: * Notice that all values here are minus (g->mlen-1), because of the way
2199: * the search algorithm works.
2200: */
2201: static void
2202: computematchjumps(struct parse *p, struct re_guts *g)
2203: {
2204: size_t mindex; /* General "must" iterator */
2205: size_t suffix; /* Keeps track of matching suffix */
2206: size_t ssuffix; /* Keeps track of suffixes' suffix */
2207: size_t* pmatches; /* pmatches[k] points to the next i
2208: * such that i+1...mlen is a substring
2209: * of k+1...k+mlen-i-1
2210: */
2211:
2212: _DIAGASSERT(p != NULL);
2213: _DIAGASSERT(g != NULL);
2214:
2215: /* Avoid making errors worse */
2216: if (p->error != 0)
2217: return;
2218:
2219: pmatches = calloc(g->mlen, sizeof(*pmatches));
2220: if (pmatches == NULL) {
2221: g->matchjump = NULL;
2222: return;
2223: }
2224:
2225: g->matchjump = calloc(g->mlen, sizeof(*g->matchjump));
2226: if (g->matchjump == NULL) { /* Not a fatal error */
2227: free(pmatches);
2228: return;
2229: }
2230:
2231: /* Set maximum possible jump for each character in the pattern */
2232: for (mindex = 0; mindex < g->mlen; mindex++)
2233: g->matchjump[mindex] = 2 * g->mlen - mindex - 1;
2234:
2235: /* Compute pmatches[] */
2236: for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) {
2237: pmatches[mindex] = suffix;
2238:
2239: /* If a mismatch is found, interrupting the substring,
2240: * compute the matchjump for that position. If no
2241: * mismatch is found, then a text substring mismatched
2242: * against the suffix will also mismatch against the
2243: * substring.
2244: */
2245: while (suffix < g->mlen
2246: && g->must[mindex] != g->must[suffix]) {
2247: g->matchjump[suffix] = MIN(g->matchjump[suffix],
2248: g->mlen - mindex - 1);
2249: suffix = pmatches[suffix];
2250: }
2251: }
2252:
2253: /* Compute the matchjump up to the last substring found to jump
2254: * to the beginning of the largest must pattern prefix matching
2255: * it's own suffix.
2256: */
2257: for (mindex = 0; mindex <= suffix; mindex++)
2258: g->matchjump[mindex] = MIN(g->matchjump[mindex],
2259: g->mlen + suffix - mindex);
2260:
2261: ssuffix = pmatches[suffix];
2262: while (suffix < g->mlen) {
2263: while (suffix <= ssuffix && suffix < g->mlen) {
2264: g->matchjump[suffix] = MIN(g->matchjump[suffix],
2265: g->mlen + ssuffix - suffix);
2266: suffix++;
2267: }
2268: if (suffix < g->mlen)
2269: ssuffix = pmatches[ssuffix];
2270: }
2271:
2272: free(pmatches);
2273: }
2274:
2275: /*
1.1 jtc 2276: - pluscount - count + nesting
1.9 perry 2277: == static sopno pluscount(struct parse *p, struct re_guts *g);
1.1 jtc 2278: */
2279: static sopno /* nesting depth */
1.39 christos 2280: pluscount(struct parse *p, struct re_guts *g)
1.1 jtc 2281: {
1.9 perry 2282: sop *scan;
2283: sop s;
2284: sopno plusnest = 0;
2285: sopno maxnest = 0;
1.14 lukem 2286:
2287: _DIAGASSERT(p != NULL);
2288: _DIAGASSERT(g != NULL);
1.1 jtc 2289:
2290: if (p->error != 0)
2291: return(0); /* there may not be an OEND */
2292:
2293: scan = g->strip + 1;
2294: do {
2295: s = *scan++;
2296: switch (OP(s)) {
2297: case OPLUS_:
2298: plusnest++;
2299: break;
2300: case O_PLUS:
2301: if (plusnest > maxnest)
2302: maxnest = plusnest;
2303: plusnest--;
2304: break;
2305: }
2306: } while (OP(s) != OEND);
2307: if (plusnest != 0)
2308: g->iflags |= BAD;
2309: return(maxnest);
2310: }
CVSweb <webmaster@jp.NetBSD.org>