Annotation of src/lib/libc/regex/regcomp.c, Revision 1.16
1.16 ! mycroft 1: /* $NetBSD: regcomp.c,v 1.15 1999/09/20 04:39:19 lukem Exp $ */
1.6 cgd 2:
1.5 cgd 3: /*-
4: * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5: * Copyright (c) 1992, 1993, 1994
6: * The Regents of the University of California. All rights reserved.
7: *
8: * This code is derived from software contributed to Berkeley by
9: * Henry Spencer.
10: *
11: * Redistribution and use in source and binary forms, with or without
12: * modification, are permitted provided that the following conditions
13: * are met:
14: * 1. Redistributions of source code must retain the above copyright
15: * notice, this list of conditions and the following disclaimer.
16: * 2. Redistributions in binary form must reproduce the above copyright
17: * notice, this list of conditions and the following disclaimer in the
18: * documentation and/or other materials provided with the distribution.
19: * 3. All advertising materials mentioning features or use of this software
20: * must display the following acknowledgement:
21: * This product includes software developed by the University of
22: * California, Berkeley and its contributors.
23: * 4. Neither the name of the University nor the names of its contributors
24: * may be used to endorse or promote products derived from this software
25: * without specific prior written permission.
26: *
27: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37: * SUCH DAMAGE.
38: *
39: * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
40: */
41:
1.7 christos 42: #include <sys/cdefs.h>
1.5 cgd 43: #if defined(LIBC_SCCS) && !defined(lint)
1.6 cgd 44: #if 0
1.5 cgd 45: static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
1.6 cgd 46: #else
1.16 ! mycroft 47: __RCSID("$NetBSD: regcomp.c,v 1.15 1999/09/20 04:39:19 lukem Exp $");
1.6 cgd 48: #endif
1.5 cgd 49: #endif /* LIBC_SCCS and not lint */
50:
1.8 jtc 51: #include "namespace.h"
1.1 jtc 52: #include <sys/types.h>
1.14 lukem 53:
54: #include <assert.h>
1.1 jtc 55: #include <ctype.h>
56: #include <limits.h>
1.14 lukem 57: #include <regex.h>
58: #include <stdio.h>
1.1 jtc 59: #include <stdlib.h>
1.14 lukem 60: #include <string.h>
1.8 jtc 61:
62: #ifdef __weak_alias
1.16 ! mycroft 63: __weak_alias(regcomp,_regcomp)
1.8 jtc 64: #endif
1.1 jtc 65:
66: #include "utils.h"
67: #include "regex2.h"
68:
69: #include "cclass.h"
70: #include "cname.h"
71:
72: /*
73: * parse structure, passed up and down to avoid global variables and
74: * other clumsinesses
75: */
76: struct parse {
77: char *next; /* next character in RE */
78: char *end; /* end of string (-> NUL normally) */
79: int error; /* has an error been seen? */
80: sop *strip; /* malloced strip */
81: sopno ssize; /* malloced strip size (allocated) */
82: sopno slen; /* malloced strip length (used) */
83: int ncsalloc; /* number of csets allocated */
84: struct re_guts *g;
85: # define NPAREN 10 /* we need to remember () 1-9 for back refs */
86: sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
87: sopno pend[NPAREN]; /* -> ) ([0] unused) */
88: };
89:
1.5 cgd 90: /* ========= begin header generated by ./mkh ========= */
91: #ifdef __cplusplus
92: extern "C" {
93: #endif
94:
95: /* === regcomp.c === */
96: static void p_ere __P((struct parse *p, int stop));
97: static void p_ere_exp __P((struct parse *p));
98: static void p_str __P((struct parse *p));
99: static void p_bre __P((struct parse *p, int end1, int end2));
100: static int p_simp_re __P((struct parse *p, int starordinary));
101: static int p_count __P((struct parse *p));
102: static void p_bracket __P((struct parse *p));
103: static void p_b_term __P((struct parse *p, cset *cs));
104: static void p_b_cclass __P((struct parse *p, cset *cs));
105: static void p_b_eclass __P((struct parse *p, cset *cs));
106: static char p_b_symbol __P((struct parse *p));
107: static char p_b_coll_elem __P((struct parse *p, int endc));
108: static char othercase __P((int ch));
109: static void bothcases __P((struct parse *p, int ch));
110: static void ordinary __P((struct parse *p, int ch));
111: static void nonnewline __P((struct parse *p));
112: static void repeat __P((struct parse *p, sopno start, int from, int to));
113: static int seterr __P((struct parse *p, int e));
114: static cset *allocset __P((struct parse *p));
115: static void freeset __P((struct parse *p, cset *cs));
116: static int freezeset __P((struct parse *p, cset *cs));
117: static int firstch __P((struct parse *p, cset *cs));
118: static int nch __P((struct parse *p, cset *cs));
1.10 mycroft 119: static void mcadd __P((struct parse *p, cset *cs, const char *cp));
1.7 christos 120: #if 0
1.5 cgd 121: static void mcsub __P((cset *cs, char *cp));
122: static int mcin __P((cset *cs, char *cp));
123: static char *mcfind __P((cset *cs, char *cp));
1.7 christos 124: #endif
1.5 cgd 125: static void mcinvert __P((struct parse *p, cset *cs));
126: static void mccase __P((struct parse *p, cset *cs));
127: static int isinsets __P((struct re_guts *g, int c));
128: static int samesets __P((struct re_guts *g, int c1, int c2));
129: static void categorize __P((struct parse *p, struct re_guts *g));
130: static sopno dupl __P((struct parse *p, sopno start, sopno finish));
1.12 drochner 131: static void doemit __P((struct parse *p, sop op, sopno opnd));
132: static void doinsert __P((struct parse *p, sop op, sopno opnd, sopno pos));
133: static void dofwd __P((struct parse *p, sopno pos, sopno value));
1.5 cgd 134: static void enlarge __P((struct parse *p, sopno size));
135: static void stripsnug __P((struct parse *p, struct re_guts *g));
136: static void findmust __P((struct parse *p, struct re_guts *g));
137: static sopno pluscount __P((struct parse *p, struct re_guts *g));
138:
139: #ifdef __cplusplus
140: }
141: #endif
142: /* ========= end header generated by ./mkh ========= */
1.1 jtc 143:
144: static char nuls[10]; /* place to point scanner in event of error */
145:
146: /*
147: * macros for use with parse structure
148: * BEWARE: these know that the parse structure is named `p' !!!
149: */
150: #define PEEK() (*p->next)
151: #define PEEK2() (*(p->next+1))
152: #define MORE() (p->next < p->end)
153: #define MORE2() (p->next+1 < p->end)
154: #define SEE(c) (MORE() && PEEK() == (c))
155: #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
156: #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
157: #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
158: #define NEXT() (p->next++)
159: #define NEXT2() (p->next += 2)
160: #define NEXTn(n) (p->next += (n))
161: #define GETNEXT() (*p->next++)
162: #define SETERROR(e) seterr(p, (e))
1.7 christos 163: #define REQUIRE(co, e) (void) ((co) || SETERROR(e))
1.1 jtc 164: #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
1.7 christos 165: #define MUSTEAT(c, e) (void) (REQUIRE(MORE() && GETNEXT() == (c), e))
1.1 jtc 166: #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
1.12 drochner 167: #define EMIT(op, sopnd) doemit(p, (sop)(op), sopnd)
168: #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
169: #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
1.1 jtc 170: #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
171: #define HERE() (p->slen)
172: #define THERE() (p->slen - 1)
1.4 jtc 173: #define THERETHERE() (p->slen - 2)
1.1 jtc 174: #define DROP(n) (p->slen -= (n))
175:
176: #ifndef NDEBUG
177: static int never = 0; /* for use in asserts; shuts lint up */
1.3 jtc 178: #else
179: #define never 0 /* some <assert.h>s have bugs too */
1.1 jtc 180: #endif
181:
182: /*
183: - regcomp - interface for parser and compilation
1.2 jtc 184: = extern int regcomp(regex_t *, const char *, int);
1.1 jtc 185: = #define REG_BASIC 0000
186: = #define REG_EXTENDED 0001
187: = #define REG_ICASE 0002
188: = #define REG_NOSUB 0004
189: = #define REG_NEWLINE 0010
190: = #define REG_NOSPEC 0020
191: = #define REG_PEND 0040
192: = #define REG_DUMP 0200
193: */
194: int /* 0 success, otherwise REG_something */
195: regcomp(preg, pattern, cflags)
196: regex_t *preg;
197: const char *pattern;
198: int cflags;
199: {
200: struct parse pa;
1.9 perry 201: struct re_guts *g;
202: struct parse *p = &pa;
203: int i;
204: size_t len;
1.3 jtc 205: #ifdef REDEBUG
206: # define GOODFLAGS(f) (f)
207: #else
208: # define GOODFLAGS(f) ((f)&~REG_DUMP)
209: #endif
1.1 jtc 210:
1.14 lukem 211: _DIAGASSERT(preg != NULL);
212: _DIAGASSERT(pattern != NULL);
213:
1.3 jtc 214: cflags = GOODFLAGS(cflags);
1.1 jtc 215: if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
216: return(REG_INVARG);
217:
218: if (cflags®_PEND) {
219: if (preg->re_endp < pattern)
220: return(REG_INVARG);
221: len = preg->re_endp - pattern;
222: } else
1.11 christos 223: len = strlen(pattern);
1.1 jtc 224:
225: /* do the mallocs early so failure handling is easy */
226: g = (struct re_guts *)malloc(sizeof(struct re_guts) +
227: (NC-1)*sizeof(cat_t));
228: if (g == NULL)
229: return(REG_ESPACE);
230: p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
231: p->strip = (sop *)malloc(p->ssize * sizeof(sop));
232: p->slen = 0;
233: if (p->strip == NULL) {
1.11 christos 234: free(g);
1.1 jtc 235: return(REG_ESPACE);
236: }
237:
238: /* set things up */
239: p->g = g;
1.11 christos 240: /* LINTED convenience; we do not modify it */
241: p->next = (char *)pattern;
1.1 jtc 242: p->end = p->next + len;
243: p->error = 0;
244: p->ncsalloc = 0;
245: for (i = 0; i < NPAREN; i++) {
246: p->pbegin[i] = 0;
247: p->pend[i] = 0;
248: }
249: g->csetsize = NC;
250: g->sets = NULL;
251: g->setbits = NULL;
252: g->ncsets = 0;
253: g->cflags = cflags;
254: g->iflags = 0;
255: g->nbol = 0;
256: g->neol = 0;
257: g->must = NULL;
258: g->mlen = 0;
259: g->nsub = 0;
260: g->ncategories = 1; /* category 0 is "everything else" */
261: g->categories = &g->catspace[-(CHAR_MIN)];
262: (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
263: g->backrefs = 0;
264:
265: /* do it */
266: EMIT(OEND, 0);
267: g->firststate = THERE();
268: if (cflags®_EXTENDED)
269: p_ere(p, OUT);
270: else if (cflags®_NOSPEC)
271: p_str(p);
272: else
273: p_bre(p, OUT, OUT);
274: EMIT(OEND, 0);
275: g->laststate = THERE();
276:
277: /* tidy up loose ends and fill things in */
278: categorize(p, g);
279: stripsnug(p, g);
280: findmust(p, g);
281: g->nplus = pluscount(p, g);
282: g->magic = MAGIC2;
283: preg->re_nsub = g->nsub;
284: preg->re_g = g;
285: preg->re_magic = MAGIC1;
286: #ifndef REDEBUG
287: /* not debugging, so can't rely on the assert() in regexec() */
288: if (g->iflags&BAD)
289: SETERROR(REG_ASSERT);
290: #endif
291:
292: /* win or lose, we're done */
293: if (p->error != 0) /* lose */
294: regfree(preg);
295: return(p->error);
296: }
297:
298: /*
299: - p_ere - ERE parser top level, concatenation and alternation
1.9 perry 300: == static void p_ere(struct parse *p, int stop);
1.1 jtc 301: */
302: static void
303: p_ere(p, stop)
1.9 perry 304: struct parse *p;
1.1 jtc 305: int stop; /* character this ERE should end at */
306: {
1.9 perry 307: char c;
308: sopno prevback = 0; /* pacify gcc */
309: sopno prevfwd = 0; /* pacify gcc */
310: sopno conc;
311: int first = 1; /* is this the first alternative? */
1.1 jtc 312:
1.14 lukem 313: _DIAGASSERT(p != NULL);
314:
1.1 jtc 315: for (;;) {
316: /* do a bunch of concatenated expressions */
317: conc = HERE();
318: while (MORE() && (c = PEEK()) != '|' && c != stop)
319: p_ere_exp(p);
320: REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
321:
322: if (!EAT('|'))
323: break; /* NOTE BREAK OUT */
324:
325: if (first) {
326: INSERT(OCH_, conc); /* offset is wrong */
327: prevfwd = conc;
328: prevback = conc;
329: first = 0;
330: }
331: ASTERN(OOR1, prevback);
332: prevback = THERE();
333: AHEAD(prevfwd); /* fix previous offset */
334: prevfwd = HERE();
335: EMIT(OOR2, 0); /* offset is very wrong */
336: }
337:
338: if (!first) { /* tail-end fixups */
339: AHEAD(prevfwd);
340: ASTERN(O_CH, prevback);
341: }
342:
343: assert(!MORE() || SEE(stop));
344: }
345:
346: /*
347: - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
1.9 perry 348: == static void p_ere_exp(struct parse *p);
1.1 jtc 349: */
350: static void
351: p_ere_exp(p)
1.9 perry 352: struct parse *p;
1.1 jtc 353: {
1.9 perry 354: char c;
355: sopno pos;
356: int count;
357: int count2;
358: sopno subno;
1.1 jtc 359: int wascaret = 0;
360:
1.14 lukem 361: _DIAGASSERT(p != NULL);
362:
1.1 jtc 363: assert(MORE()); /* caller should have ensured this */
364: c = GETNEXT();
365:
366: pos = HERE();
367: switch (c) {
368: case '(':
369: REQUIRE(MORE(), REG_EPAREN);
370: p->g->nsub++;
371: subno = p->g->nsub;
372: if (subno < NPAREN)
373: p->pbegin[subno] = HERE();
374: EMIT(OLPAREN, subno);
375: if (!SEE(')'))
376: p_ere(p, ')');
377: if (subno < NPAREN) {
378: p->pend[subno] = HERE();
379: assert(p->pend[subno] != 0);
380: }
381: EMIT(ORPAREN, subno);
382: MUSTEAT(')', REG_EPAREN);
383: break;
384: #ifndef POSIX_MISTAKE
385: case ')': /* happens only if no current unmatched ( */
386: /*
387: * You may ask, why the ifndef? Because I didn't notice
388: * this until slightly too late for 1003.2, and none of the
389: * other 1003.2 regular-expression reviewers noticed it at
390: * all. So an unmatched ) is legal POSIX, at least until
391: * we can get it fixed.
392: */
393: SETERROR(REG_EPAREN);
394: break;
395: #endif
396: case '^':
397: EMIT(OBOL, 0);
398: p->g->iflags |= USEBOL;
399: p->g->nbol++;
400: wascaret = 1;
401: break;
402: case '$':
403: EMIT(OEOL, 0);
404: p->g->iflags |= USEEOL;
405: p->g->neol++;
406: break;
407: case '|':
408: SETERROR(REG_EMPTY);
409: break;
410: case '*':
411: case '+':
412: case '?':
413: SETERROR(REG_BADRPT);
414: break;
415: case '.':
416: if (p->g->cflags®_NEWLINE)
417: nonnewline(p);
418: else
419: EMIT(OANY, 0);
420: break;
421: case '[':
422: p_bracket(p);
423: break;
424: case '\\':
425: REQUIRE(MORE(), REG_EESCAPE);
426: c = GETNEXT();
427: ordinary(p, c);
428: break;
429: case '{': /* okay as ordinary except if digit follows */
430: REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
431: /* FALLTHROUGH */
432: default:
433: ordinary(p, c);
434: break;
435: }
436:
437: if (!MORE())
438: return;
439: c = PEEK();
440: /* we call { a repetition if followed by a digit */
441: if (!( c == '*' || c == '+' || c == '?' ||
442: (c == '{' && MORE2() && isdigit(PEEK2())) ))
443: return; /* no repetition, we're done */
444: NEXT();
445:
446: REQUIRE(!wascaret, REG_BADRPT);
447: switch (c) {
448: case '*': /* implemented as +? */
1.4 jtc 449: /* this case does not require the (y|) trick, noKLUDGE */
1.1 jtc 450: INSERT(OPLUS_, pos);
451: ASTERN(O_PLUS, pos);
452: INSERT(OQUEST_, pos);
453: ASTERN(O_QUEST, pos);
454: break;
455: case '+':
456: INSERT(OPLUS_, pos);
457: ASTERN(O_PLUS, pos);
458: break;
459: case '?':
1.4 jtc 460: /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
461: INSERT(OCH_, pos); /* offset slightly wrong */
462: ASTERN(OOR1, pos); /* this one's right */
463: AHEAD(pos); /* fix the OCH_ */
464: EMIT(OOR2, 0); /* offset very wrong... */
465: AHEAD(THERE()); /* ...so fix it */
466: ASTERN(O_CH, THERETHERE());
1.1 jtc 467: break;
468: case '{':
469: count = p_count(p);
470: if (EAT(',')) {
471: if (isdigit(PEEK())) {
472: count2 = p_count(p);
473: REQUIRE(count <= count2, REG_BADBR);
474: } else /* single number with comma */
475: count2 = INFINITY;
476: } else /* just a single number */
477: count2 = count;
478: repeat(p, pos, count, count2);
479: if (!EAT('}')) { /* error heuristics */
480: while (MORE() && PEEK() != '}')
481: NEXT();
482: REQUIRE(MORE(), REG_EBRACE);
483: SETERROR(REG_BADBR);
484: }
485: break;
486: }
487:
488: if (!MORE())
489: return;
490: c = PEEK();
491: if (!( c == '*' || c == '+' || c == '?' ||
492: (c == '{' && MORE2() && isdigit(PEEK2())) ) )
493: return;
494: SETERROR(REG_BADRPT);
495: }
496:
497: /*
498: - p_str - string (no metacharacters) "parser"
1.9 perry 499: == static void p_str(struct parse *p);
1.1 jtc 500: */
501: static void
502: p_str(p)
1.9 perry 503: struct parse *p;
1.1 jtc 504: {
1.14 lukem 505:
506: _DIAGASSERT(p != NULL);
507:
1.1 jtc 508: REQUIRE(MORE(), REG_EMPTY);
509: while (MORE())
510: ordinary(p, GETNEXT());
511: }
512:
513: /*
514: - p_bre - BRE parser top level, anchoring and concatenation
1.9 perry 515: == static void p_bre(struct parse *p, int end1, \
516: == int end2);
1.1 jtc 517: * Giving end1 as OUT essentially eliminates the end1/end2 check.
518: *
519: * This implementation is a bit of a kludge, in that a trailing $ is first
520: * taken as an ordinary character and then revised to be an anchor. The
521: * only undesirable side effect is that '$' gets included as a character
522: * category in such cases. This is fairly harmless; not worth fixing.
523: * The amount of lookahead needed to avoid this kludge is excessive.
524: */
525: static void
526: p_bre(p, end1, end2)
1.9 perry 527: struct parse *p;
528: int end1; /* first terminating character */
529: int end2; /* second terminating character */
530: {
1.14 lukem 531: sopno start;
1.9 perry 532: int first = 1; /* first subexpression? */
533: int wasdollar = 0;
1.1 jtc 534:
1.14 lukem 535: _DIAGASSERT(p != NULL);
536:
537: start = HERE();
538:
1.1 jtc 539: if (EAT('^')) {
540: EMIT(OBOL, 0);
541: p->g->iflags |= USEBOL;
542: p->g->nbol++;
543: }
544: while (MORE() && !SEETWO(end1, end2)) {
545: wasdollar = p_simp_re(p, first);
546: first = 0;
547: }
548: if (wasdollar) { /* oops, that was a trailing anchor */
549: DROP(1);
550: EMIT(OEOL, 0);
551: p->g->iflags |= USEEOL;
552: p->g->neol++;
553: }
554:
555: REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
556: }
557:
558: /*
559: - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
1.9 perry 560: == static int p_simp_re(struct parse *p, int starordinary);
1.1 jtc 561: */
562: static int /* was the simple RE an unbackslashed $? */
563: p_simp_re(p, starordinary)
1.9 perry 564: struct parse *p;
1.1 jtc 565: int starordinary; /* is a leading * an ordinary character? */
566: {
1.9 perry 567: int c;
568: int count;
569: int count2;
570: sopno pos;
571: int i;
572: sopno subno;
1.1 jtc 573: # define BACKSL (1<<CHAR_BIT)
574:
1.14 lukem 575: _DIAGASSERT(p != NULL);
576:
1.1 jtc 577: pos = HERE(); /* repetion op, if any, covers from here */
578:
579: assert(MORE()); /* caller should have ensured this */
580: c = GETNEXT();
581: if (c == '\\') {
582: REQUIRE(MORE(), REG_EESCAPE);
583: c = BACKSL | (unsigned char)GETNEXT();
584: }
585: switch (c) {
586: case '.':
587: if (p->g->cflags®_NEWLINE)
588: nonnewline(p);
589: else
590: EMIT(OANY, 0);
591: break;
592: case '[':
593: p_bracket(p);
594: break;
595: case BACKSL|'{':
596: SETERROR(REG_BADRPT);
597: break;
598: case BACKSL|'(':
599: p->g->nsub++;
600: subno = p->g->nsub;
601: if (subno < NPAREN)
602: p->pbegin[subno] = HERE();
603: EMIT(OLPAREN, subno);
604: /* the MORE here is an error heuristic */
605: if (MORE() && !SEETWO('\\', ')'))
606: p_bre(p, '\\', ')');
607: if (subno < NPAREN) {
608: p->pend[subno] = HERE();
609: assert(p->pend[subno] != 0);
610: }
611: EMIT(ORPAREN, subno);
612: REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
613: break;
614: case BACKSL|')': /* should not get here -- must be user */
615: case BACKSL|'}':
616: SETERROR(REG_EPAREN);
617: break;
618: case BACKSL|'1':
619: case BACKSL|'2':
620: case BACKSL|'3':
621: case BACKSL|'4':
622: case BACKSL|'5':
623: case BACKSL|'6':
624: case BACKSL|'7':
625: case BACKSL|'8':
626: case BACKSL|'9':
627: i = (c&~BACKSL) - '0';
628: assert(i < NPAREN);
629: if (p->pend[i] != 0) {
630: assert(i <= p->g->nsub);
631: EMIT(OBACK_, i);
632: assert(p->pbegin[i] != 0);
633: assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
634: assert(OP(p->strip[p->pend[i]]) == ORPAREN);
635: (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
636: EMIT(O_BACK, i);
637: } else
638: SETERROR(REG_ESUBREG);
639: p->g->backrefs = 1;
640: break;
641: case '*':
642: REQUIRE(starordinary, REG_BADRPT);
643: /* FALLTHROUGH */
644: default:
645: ordinary(p, c &~ BACKSL);
646: break;
647: }
648:
649: if (EAT('*')) { /* implemented as +? */
1.4 jtc 650: /* this case does not require the (y|) trick, noKLUDGE */
1.1 jtc 651: INSERT(OPLUS_, pos);
652: ASTERN(O_PLUS, pos);
653: INSERT(OQUEST_, pos);
654: ASTERN(O_QUEST, pos);
655: } else if (EATTWO('\\', '{')) {
656: count = p_count(p);
657: if (EAT(',')) {
658: if (MORE() && isdigit(PEEK())) {
659: count2 = p_count(p);
660: REQUIRE(count <= count2, REG_BADBR);
661: } else /* single number with comma */
662: count2 = INFINITY;
663: } else /* just a single number */
664: count2 = count;
665: repeat(p, pos, count, count2);
666: if (!EATTWO('\\', '}')) { /* error heuristics */
667: while (MORE() && !SEETWO('\\', '}'))
668: NEXT();
669: REQUIRE(MORE(), REG_EBRACE);
670: SETERROR(REG_BADBR);
671: }
672: } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */
673: return(1);
674:
675: return(0);
676: }
677:
678: /*
679: - p_count - parse a repetition count
1.9 perry 680: == static int p_count(struct parse *p);
1.1 jtc 681: */
682: static int /* the value */
683: p_count(p)
1.9 perry 684: struct parse *p;
1.1 jtc 685: {
1.9 perry 686: int count = 0;
687: int ndigits = 0;
1.1 jtc 688:
1.14 lukem 689: _DIAGASSERT(p != NULL);
690:
1.1 jtc 691: while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
692: count = count*10 + (GETNEXT() - '0');
693: ndigits++;
694: }
695:
696: REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
697: return(count);
698: }
699:
700: /*
701: - p_bracket - parse a bracketed character list
1.9 perry 702: == static void p_bracket(struct parse *p);
1.1 jtc 703: *
704: * Note a significant property of this code: if the allocset() did SETERROR,
705: * no set operations are done.
706: */
707: static void
708: p_bracket(p)
1.9 perry 709: struct parse *p;
1.1 jtc 710: {
1.14 lukem 711: cset *cs;
1.9 perry 712: int invert = 0;
1.1 jtc 713:
1.14 lukem 714: _DIAGASSERT(p != NULL);
715:
716: cs = allocset(p);
717:
1.1 jtc 718: /* Dept of Truly Sickening Special-Case Kludges */
1.13 drochner 719: if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]",
720: (size_t)6) == 0) {
1.1 jtc 721: EMIT(OBOW, 0);
722: NEXTn(6);
723: return;
724: }
1.13 drochner 725: if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]",
726: (size_t)6) == 0) {
1.1 jtc 727: EMIT(OEOW, 0);
728: NEXTn(6);
729: return;
730: }
731:
732: if (EAT('^'))
733: invert++; /* make note to invert set at end */
734: if (EAT(']'))
735: CHadd(cs, ']');
736: else if (EAT('-'))
737: CHadd(cs, '-');
738: while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
739: p_b_term(p, cs);
740: if (EAT('-'))
741: CHadd(cs, '-');
742: MUSTEAT(']', REG_EBRACK);
743:
744: if (p->error != 0) /* don't mess things up further */
745: return;
746:
747: if (p->g->cflags®_ICASE) {
1.9 perry 748: int i;
749: int ci;
1.1 jtc 750:
751: for (i = p->g->csetsize - 1; i >= 0; i--)
752: if (CHIN(cs, i) && isalpha(i)) {
753: ci = othercase(i);
754: if (ci != i)
755: CHadd(cs, ci);
756: }
757: if (cs->multis != NULL)
1.2 jtc 758: mccase(p, cs);
1.1 jtc 759: }
760: if (invert) {
1.9 perry 761: int i;
1.1 jtc 762:
763: for (i = p->g->csetsize - 1; i >= 0; i--)
764: if (CHIN(cs, i))
765: CHsub(cs, i);
766: else
767: CHadd(cs, i);
768: if (p->g->cflags®_NEWLINE)
769: CHsub(cs, '\n');
770: if (cs->multis != NULL)
1.2 jtc 771: mcinvert(p, cs);
1.1 jtc 772: }
773:
774: assert(cs->multis == NULL); /* xxx */
775:
776: if (nch(p, cs) == 1) { /* optimize singleton sets */
777: ordinary(p, firstch(p, cs));
778: freeset(p, cs);
779: } else
780: EMIT(OANYOF, freezeset(p, cs));
781: }
782:
783: /*
784: - p_b_term - parse one term of a bracketed character list
1.9 perry 785: == static void p_b_term(struct parse *p, cset *cs);
1.1 jtc 786: */
787: static void
788: p_b_term(p, cs)
1.9 perry 789: struct parse *p;
790: cset *cs;
1.1 jtc 791: {
1.9 perry 792: char c;
793: char start, finish;
794: int i;
1.1 jtc 795:
1.14 lukem 796: _DIAGASSERT(p != NULL);
797: _DIAGASSERT(cs != NULL);
798:
1.1 jtc 799: /* classify what we've got */
800: switch ((MORE()) ? PEEK() : '\0') {
801: case '[':
802: c = (MORE2()) ? PEEK2() : '\0';
803: break;
1.11 christos 804:
1.1 jtc 805: case '-':
806: SETERROR(REG_ERANGE);
807: return; /* NOTE RETURN */
1.11 christos 808:
1.1 jtc 809: default:
810: c = '\0';
811: break;
812: }
813:
814: switch (c) {
815: case ':': /* character class */
816: NEXT2();
817: REQUIRE(MORE(), REG_EBRACK);
818: c = PEEK();
819: REQUIRE(c != '-' && c != ']', REG_ECTYPE);
820: p_b_cclass(p, cs);
821: REQUIRE(MORE(), REG_EBRACK);
822: REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
823: break;
824: case '=': /* equivalence class */
825: NEXT2();
826: REQUIRE(MORE(), REG_EBRACK);
827: c = PEEK();
828: REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
829: p_b_eclass(p, cs);
830: REQUIRE(MORE(), REG_EBRACK);
831: REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
832: break;
833: default: /* symbol, ordinary character, or range */
834: /* xxx revision needed for multichar stuff */
835: start = p_b_symbol(p);
836: if (SEE('-') && MORE2() && PEEK2() != ']') {
837: /* range */
838: NEXT();
839: if (EAT('-'))
840: finish = '-';
841: else
842: finish = p_b_symbol(p);
843: } else
844: finish = start;
845: /* xxx what about signed chars here... */
846: REQUIRE(start <= finish, REG_ERANGE);
847: for (i = start; i <= finish; i++)
848: CHadd(cs, i);
849: break;
850: }
851: }
852:
853: /*
854: - p_b_cclass - parse a character-class name and deal with it
1.9 perry 855: == static void p_b_cclass(struct parse *p, cset *cs);
1.1 jtc 856: */
857: static void
858: p_b_cclass(p, cs)
1.9 perry 859: struct parse *p;
860: cset *cs;
1.1 jtc 861: {
1.14 lukem 862: char *sp;
1.10 mycroft 863: const struct cclass *cp;
1.9 perry 864: size_t len;
1.10 mycroft 865: const char *u;
1.9 perry 866: char c;
1.1 jtc 867:
1.14 lukem 868: _DIAGASSERT(p != NULL);
869: _DIAGASSERT(cs != NULL);
870:
871: sp = p->next;
872:
1.1 jtc 873: while (MORE() && isalpha(PEEK()))
874: NEXT();
875: len = p->next - sp;
876: for (cp = cclasses; cp->name != NULL; cp++)
877: if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
878: break;
879: if (cp->name == NULL) {
880: /* oops, didn't find it */
881: SETERROR(REG_ECTYPE);
882: return;
883: }
884:
885: u = cp->chars;
886: while ((c = *u++) != '\0')
887: CHadd(cs, c);
888: for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
1.2 jtc 889: MCadd(p, cs, u);
1.1 jtc 890: }
891:
892: /*
893: - p_b_eclass - parse an equivalence-class name and deal with it
1.9 perry 894: == static void p_b_eclass(struct parse *p, cset *cs);
1.1 jtc 895: *
896: * This implementation is incomplete. xxx
897: */
898: static void
899: p_b_eclass(p, cs)
1.9 perry 900: struct parse *p;
901: cset *cs;
1.1 jtc 902: {
1.9 perry 903: char c;
1.1 jtc 904:
1.14 lukem 905: _DIAGASSERT(p != NULL);
906: _DIAGASSERT(cs != NULL);
907:
1.1 jtc 908: c = p_b_coll_elem(p, '=');
909: CHadd(cs, c);
910: }
911:
912: /*
913: - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1.9 perry 914: == static char p_b_symbol(struct parse *p);
1.1 jtc 915: */
916: static char /* value of symbol */
917: p_b_symbol(p)
1.9 perry 918: struct parse *p;
1.1 jtc 919: {
1.9 perry 920: char value;
1.1 jtc 921:
1.14 lukem 922: _DIAGASSERT(p != NULL);
923:
1.1 jtc 924: REQUIRE(MORE(), REG_EBRACK);
925: if (!EATTWO('[', '.'))
926: return(GETNEXT());
927:
928: /* collating symbol */
929: value = p_b_coll_elem(p, '.');
930: REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
931: return(value);
932: }
933:
934: /*
935: - p_b_coll_elem - parse a collating-element name and look it up
1.9 perry 936: == static char p_b_coll_elem(struct parse *p, int endc);
1.1 jtc 937: */
938: static char /* value of collating element */
939: p_b_coll_elem(p, endc)
1.9 perry 940: struct parse *p;
1.1 jtc 941: int endc; /* name ended by endc,']' */
942: {
1.14 lukem 943: char *sp;
1.10 mycroft 944: const struct cname *cp;
1.11 christos 945: size_t len;
1.1 jtc 946:
1.14 lukem 947: _DIAGASSERT(p != NULL);
948:
949: sp = p->next;
950:
1.1 jtc 951: while (MORE() && !SEETWO(endc, ']'))
952: NEXT();
953: if (!MORE()) {
954: SETERROR(REG_EBRACK);
955: return(0);
956: }
957: len = p->next - sp;
958: for (cp = cnames; cp->name != NULL; cp++)
959: if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
960: return(cp->code); /* known name */
961: if (len == 1)
962: return(*sp); /* single character */
963: SETERROR(REG_ECOLLATE); /* neither */
964: return(0);
965: }
966:
967: /*
968: - othercase - return the case counterpart of an alphabetic
969: == static char othercase(int ch);
970: */
971: static char /* if no counterpart, return ch */
972: othercase(ch)
973: int ch;
974: {
975: assert(isalpha(ch));
976: if (isupper(ch))
977: return(tolower(ch));
978: else if (islower(ch))
979: return(toupper(ch));
980: else /* peculiar, but could happen */
981: return(ch);
982: }
983:
984: /*
985: - bothcases - emit a dualcase version of a two-case character
1.9 perry 986: == static void bothcases(struct parse *p, int ch);
1.1 jtc 987: *
988: * Boy, is this implementation ever a kludge...
989: */
990: static void
991: bothcases(p, ch)
1.9 perry 992: struct parse *p;
1.1 jtc 993: int ch;
994: {
1.14 lukem 995: char *oldnext;
996: char *oldend;
1.1 jtc 997: char bracket[3];
998:
1.14 lukem 999: _DIAGASSERT(p != NULL);
1000:
1001: oldnext = p->next;
1002: oldend = p->end;
1003:
1.1 jtc 1004: assert(othercase(ch) != ch); /* p_bracket() would recurse */
1005: p->next = bracket;
1006: p->end = bracket+2;
1007: bracket[0] = ch;
1008: bracket[1] = ']';
1009: bracket[2] = '\0';
1010: p_bracket(p);
1011: assert(p->next == bracket+2);
1012: p->next = oldnext;
1013: p->end = oldend;
1014: }
1015:
1016: /*
1017: - ordinary - emit an ordinary character
1.9 perry 1018: == static void ordinary(struct parse *p, int ch);
1.1 jtc 1019: */
1020: static void
1021: ordinary(p, ch)
1.9 perry 1022: struct parse *p;
1023: int ch;
1.1 jtc 1024: {
1.14 lukem 1025: cat_t *cap;
1.1 jtc 1026:
1.14 lukem 1027: _DIAGASSERT(p != NULL);
1028:
1029: cap = p->g->categories;
1.1 jtc 1030: if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch)
1031: bothcases(p, ch);
1032: else {
1033: EMIT(OCHAR, (unsigned char)ch);
1034: if (cap[ch] == 0)
1035: cap[ch] = p->g->ncategories++;
1036: }
1037: }
1038:
1039: /*
1040: - nonnewline - emit REG_NEWLINE version of OANY
1.9 perry 1041: == static void nonnewline(struct parse *p);
1.1 jtc 1042: *
1043: * Boy, is this implementation ever a kludge...
1044: */
1045: static void
1046: nonnewline(p)
1.9 perry 1047: struct parse *p;
1.1 jtc 1048: {
1.14 lukem 1049: char *oldnext;
1050: char *oldend;
1.1 jtc 1051: char bracket[4];
1052:
1.14 lukem 1053: _DIAGASSERT(p != NULL);
1054:
1055: oldnext = p->next;
1056: oldend = p->end;
1057:
1.1 jtc 1058: p->next = bracket;
1059: p->end = bracket+3;
1060: bracket[0] = '^';
1061: bracket[1] = '\n';
1062: bracket[2] = ']';
1063: bracket[3] = '\0';
1064: p_bracket(p);
1065: assert(p->next == bracket+3);
1066: p->next = oldnext;
1067: p->end = oldend;
1068: }
1069:
1070: /*
1071: - repeat - generate code for a bounded repetition, recursively if needed
1.9 perry 1072: == static void repeat(struct parse *p, sopno start, int from, int to);
1.1 jtc 1073: */
1074: static void
1075: repeat(p, start, from, to)
1.9 perry 1076: struct parse *p;
1.1 jtc 1077: sopno start; /* operand from here to end of strip */
1078: int from; /* repeated from this number */
1079: int to; /* to this number of times (maybe INFINITY) */
1080: {
1.14 lukem 1081: sopno finish;
1.1 jtc 1082: # define N 2
1083: # define INF 3
1084: # define REP(f, t) ((f)*8 + (t))
1085: # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1.9 perry 1086: sopno copy;
1.1 jtc 1087:
1.14 lukem 1088: _DIAGASSERT(p != NULL);
1089:
1090: finish = HERE();
1091:
1.1 jtc 1092: if (p->error != 0) /* head off possible runaway recursion */
1093: return;
1094:
1095: assert(from <= to);
1096:
1097: switch (REP(MAP(from), MAP(to))) {
1098: case REP(0, 0): /* must be user doing this */
1099: DROP(finish-start); /* drop the operand */
1100: break;
1101: case REP(0, 1): /* as x{1,1}? */
1102: case REP(0, N): /* as x{1,n}? */
1103: case REP(0, INF): /* as x{1,}? */
1.4 jtc 1104: /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1105: INSERT(OCH_, start); /* offset is wrong... */
1.1 jtc 1106: repeat(p, start+1, 1, to);
1.4 jtc 1107: ASTERN(OOR1, start);
1.1 jtc 1108: AHEAD(start); /* ... fix it */
1.4 jtc 1109: EMIT(OOR2, 0);
1110: AHEAD(THERE());
1111: ASTERN(O_CH, THERETHERE());
1.1 jtc 1112: break;
1113: case REP(1, 1): /* trivial case */
1114: /* done */
1115: break;
1116: case REP(1, N): /* as x?x{1,n-1} */
1.4 jtc 1117: /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1118: INSERT(OCH_, start);
1119: ASTERN(OOR1, start);
1120: AHEAD(start);
1121: EMIT(OOR2, 0); /* offset very wrong... */
1122: AHEAD(THERE()); /* ...so fix it */
1123: ASTERN(O_CH, THERETHERE());
1.1 jtc 1124: copy = dupl(p, start+1, finish+1);
1.4 jtc 1125: assert(copy == finish+4);
1.1 jtc 1126: repeat(p, copy, 1, to-1);
1127: break;
1128: case REP(1, INF): /* as x+ */
1129: INSERT(OPLUS_, start);
1130: ASTERN(O_PLUS, start);
1131: break;
1132: case REP(N, N): /* as xx{m-1,n-1} */
1133: copy = dupl(p, start, finish);
1134: repeat(p, copy, from-1, to-1);
1135: break;
1136: case REP(N, INF): /* as xx{n-1,INF} */
1137: copy = dupl(p, start, finish);
1138: repeat(p, copy, from-1, to);
1139: break;
1140: default: /* "can't happen" */
1141: SETERROR(REG_ASSERT); /* just in case */
1142: break;
1143: }
1144: }
1145:
1146: /*
1147: - seterr - set an error condition
1.9 perry 1148: == static int seterr(struct parse *p, int e);
1.1 jtc 1149: */
1150: static int /* useless but makes type checking happy */
1151: seterr(p, e)
1.9 perry 1152: struct parse *p;
1.1 jtc 1153: int e;
1154: {
1.14 lukem 1155:
1156: _DIAGASSERT(p != NULL);
1157:
1.1 jtc 1158: if (p->error == 0) /* keep earliest error condition */
1159: p->error = e;
1160: p->next = nuls; /* try to bring things to a halt */
1161: p->end = nuls;
1162: return(0); /* make the return value well-defined */
1163: }
1164:
1165: /*
1166: - allocset - allocate a set of characters for []
1.9 perry 1167: == static cset *allocset(struct parse *p);
1.1 jtc 1168: */
1169: static cset *
1170: allocset(p)
1.9 perry 1171: struct parse *p;
1.1 jtc 1172: {
1.14 lukem 1173: int no;
1.9 perry 1174: size_t nc;
1175: size_t nbytes;
1176: cset *cs;
1.14 lukem 1177: size_t css;
1.9 perry 1178: int i;
1.1 jtc 1179:
1.14 lukem 1180: _DIAGASSERT(p != NULL);
1181:
1182: no = p->g->ncsets++;
1183: css = (size_t)p->g->csetsize;
1.1 jtc 1184: if (no >= p->ncsalloc) { /* need another column of space */
1185: p->ncsalloc += CHAR_BIT;
1186: nc = p->ncsalloc;
1187: assert(nc % CHAR_BIT == 0);
1188: nbytes = nc / CHAR_BIT * css;
1189: if (p->g->sets == NULL)
1.11 christos 1190: p->g->sets = malloc(nc * sizeof(cset));
1.1 jtc 1191: else
1.11 christos 1192: p->g->sets = realloc(p->g->sets, nc * sizeof(cset));
1.1 jtc 1193: if (p->g->setbits == NULL)
1.11 christos 1194: p->g->setbits = malloc(nbytes);
1.2 jtc 1195: else {
1.11 christos 1196: p->g->setbits = realloc(p->g->setbits, nbytes);
1.2 jtc 1197: /* xxx this isn't right if setbits is now NULL */
1198: for (i = 0; i < no; i++)
1199: p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1200: }
1.1 jtc 1201: if (p->g->sets != NULL && p->g->setbits != NULL)
1202: (void) memset((char *)p->g->setbits + (nbytes - css),
1203: 0, css);
1204: else {
1205: no = 0;
1206: SETERROR(REG_ESPACE);
1207: /* caller's responsibility not to do set ops */
1208: }
1209: }
1210:
1211: assert(p->g->sets != NULL); /* xxx */
1212: cs = &p->g->sets[no];
1213: cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1214: cs->mask = 1 << ((no) % CHAR_BIT);
1215: cs->hash = 0;
1216: cs->smultis = 0;
1217: cs->multis = NULL;
1218:
1219: return(cs);
1220: }
1221:
1222: /*
1223: - freeset - free a now-unused set
1.9 perry 1224: == static void freeset(struct parse *p, cset *cs);
1.1 jtc 1225: */
1226: static void
1227: freeset(p, cs)
1.9 perry 1228: struct parse *p;
1229: cset *cs;
1.1 jtc 1230: {
1.9 perry 1231: int i;
1.14 lukem 1232: cset *top;
1233: size_t css;
1234:
1235: _DIAGASSERT(p != NULL);
1236: _DIAGASSERT(cs != NULL);
1237:
1238: top = &p->g->sets[p->g->ncsets];
1239: css = (size_t)p->g->csetsize;
1.1 jtc 1240:
1241: for (i = 0; i < css; i++)
1242: CHsub(cs, i);
1243: if (cs == top-1) /* recover only the easy case */
1244: p->g->ncsets--;
1245: }
1246:
1247: /*
1248: - freezeset - final processing on a set of characters
1.9 perry 1249: == static int freezeset(struct parse *p, cset *cs);
1.1 jtc 1250: *
1251: * The main task here is merging identical sets. This is usually a waste
1252: * of time (although the hash code minimizes the overhead), but can win
1253: * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1254: * is done using addition rather than xor -- all ASCII [aA] sets xor to
1255: * the same value!
1256: */
1257: static int /* set number */
1258: freezeset(p, cs)
1.9 perry 1259: struct parse *p;
1260: cset *cs;
1.1 jtc 1261: {
1.14 lukem 1262: uch h;
1.9 perry 1263: int i;
1.14 lukem 1264: cset *top;
1.9 perry 1265: cset *cs2;
1.14 lukem 1266: size_t css;
1267:
1268: _DIAGASSERT(p != NULL);
1269: _DIAGASSERT(cs != NULL);
1270:
1271: h = cs->hash;
1272: top = &p->g->sets[p->g->ncsets];
1273: css = (size_t)p->g->csetsize;
1.1 jtc 1274:
1275: /* look for an earlier one which is the same */
1276: for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1277: if (cs2->hash == h && cs2 != cs) {
1278: /* maybe */
1279: for (i = 0; i < css; i++)
1280: if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1281: break; /* no */
1282: if (i == css)
1283: break; /* yes */
1284: }
1285:
1286: if (cs2 < top) { /* found one */
1287: freeset(p, cs);
1288: cs = cs2;
1289: }
1290:
1291: return((int)(cs - p->g->sets));
1292: }
1293:
1294: /*
1295: - firstch - return first character in a set (which must have at least one)
1.9 perry 1296: == static int firstch(struct parse *p, cset *cs);
1.1 jtc 1297: */
1298: static int /* character; there is no "none" value */
1299: firstch(p, cs)
1.9 perry 1300: struct parse *p;
1301: cset *cs;
1.1 jtc 1302: {
1.9 perry 1303: int i;
1.14 lukem 1304: size_t css;
1305:
1306: _DIAGASSERT(p != NULL);
1307: _DIAGASSERT(cs != NULL);
1308:
1309: css = (size_t)p->g->csetsize;
1.1 jtc 1310:
1311: for (i = 0; i < css; i++)
1312: if (CHIN(cs, i))
1313: return((char)i);
1314: assert(never);
1315: return(0); /* arbitrary */
1316: }
1317:
1318: /*
1319: - nch - number of characters in a set
1.9 perry 1320: == static int nch(struct parse *p, cset *cs);
1.1 jtc 1321: */
1322: static int
1323: nch(p, cs)
1.9 perry 1324: struct parse *p;
1325: cset *cs;
1.1 jtc 1326: {
1.9 perry 1327: int i;
1.14 lukem 1328: size_t css;
1.9 perry 1329: int n = 0;
1.1 jtc 1330:
1.14 lukem 1331: _DIAGASSERT(p != NULL);
1332: _DIAGASSERT(cs != NULL);
1333:
1334: css = (size_t)p->g->csetsize;
1335:
1.1 jtc 1336: for (i = 0; i < css; i++)
1337: if (CHIN(cs, i))
1338: n++;
1339: return(n);
1340: }
1341:
1342: /*
1343: - mcadd - add a collating element to a cset
1.9 perry 1344: == static void mcadd(struct parse *p, cset *cs, \
1345: == char *cp);
1.1 jtc 1346: */
1347: static void
1348: mcadd(p, cs, cp)
1.9 perry 1349: struct parse *p;
1350: cset *cs;
1.10 mycroft 1351: const char *cp;
1.1 jtc 1352: {
1.14 lukem 1353: size_t oldend;
1354:
1355: _DIAGASSERT(p != NULL);
1356: _DIAGASSERT(cs != NULL);
1357: _DIAGASSERT(cp != NULL);
1358:
1359: oldend = cs->smultis;
1.1 jtc 1360:
1361: cs->smultis += strlen(cp) + 1;
1362: if (cs->multis == NULL)
1363: cs->multis = malloc(cs->smultis);
1364: else
1365: cs->multis = realloc(cs->multis, cs->smultis);
1366: if (cs->multis == NULL) {
1367: SETERROR(REG_ESPACE);
1368: return;
1369: }
1370:
1371: (void) strcpy(cs->multis + oldend - 1, cp);
1372: cs->multis[cs->smultis - 1] = '\0';
1373: }
1374:
1.7 christos 1375: #if 0
1.1 jtc 1376: /*
1377: - mcsub - subtract a collating element from a cset
1.9 perry 1378: == static void mcsub(cset *cs, char *cp);
1.1 jtc 1379: */
1380: static void
1381: mcsub(cs, cp)
1.9 perry 1382: cset *cs;
1383: char *cp;
1.1 jtc 1384: {
1.14 lukem 1385: char *fp;
1386: size_t len;
1387:
1388: _DIAGASSERT(cs != NULL);
1389: _DIAGASSERT(cp != NULL);
1390:
1391: fp = mcfind(cs, cp);
1392: len = strlen(fp);
1.1 jtc 1393:
1394: assert(fp != NULL);
1395: (void) memmove(fp, fp + len + 1,
1396: cs->smultis - (fp + len + 1 - cs->multis));
1397: cs->smultis -= len;
1398:
1399: if (cs->smultis == 0) {
1400: free(cs->multis);
1401: cs->multis = NULL;
1402: return;
1403: }
1404:
1405: cs->multis = realloc(cs->multis, cs->smultis);
1406: assert(cs->multis != NULL);
1407: }
1408:
1409: /*
1410: - mcin - is a collating element in a cset?
1.9 perry 1411: == static int mcin(cset *cs, char *cp);
1.1 jtc 1412: */
1413: static int
1414: mcin(cs, cp)
1.9 perry 1415: cset *cs;
1416: char *cp;
1.1 jtc 1417: {
1.14 lukem 1418:
1419: _DIAGASSERT(cs != NULL);
1420: _DIAGASSERT(cp != NULL);
1421:
1.1 jtc 1422: return(mcfind(cs, cp) != NULL);
1423: }
1424:
1425: /*
1426: - mcfind - find a collating element in a cset
1.9 perry 1427: == static char *mcfind(cset *cs, char *cp);
1.1 jtc 1428: */
1429: static char *
1430: mcfind(cs, cp)
1.9 perry 1431: cset *cs;
1432: char *cp;
1.1 jtc 1433: {
1.9 perry 1434: char *p;
1.1 jtc 1435:
1.14 lukem 1436: _DIAGASSERT(cs != NULL);
1437: _DIAGASSERT(cp != NULL);
1438:
1.1 jtc 1439: if (cs->multis == NULL)
1440: return(NULL);
1441: for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1442: if (strcmp(cp, p) == 0)
1443: return(p);
1444: return(NULL);
1445: }
1.7 christos 1446: #endif
1.1 jtc 1447:
1448: /*
1449: - mcinvert - invert the list of collating elements in a cset
1.9 perry 1450: == static void mcinvert(struct parse *p, cset *cs);
1.1 jtc 1451: *
1452: * This would have to know the set of possibilities. Implementation
1453: * is deferred.
1454: */
1.11 christos 1455: /* ARGSUSED */
1.1 jtc 1456: static void
1.2 jtc 1457: mcinvert(p, cs)
1.9 perry 1458: struct parse *p;
1459: cset *cs;
1.1 jtc 1460: {
1.14 lukem 1461:
1462: _DIAGASSERT(p != NULL);
1463: _DIAGASSERT(cs != NULL);
1464:
1.1 jtc 1465: assert(cs->multis == NULL); /* xxx */
1466: }
1467:
1468: /*
1469: - mccase - add case counterparts of the list of collating elements in a cset
1.9 perry 1470: == static void mccase(struct parse *p, cset *cs);
1.1 jtc 1471: *
1472: * This would have to know the set of possibilities. Implementation
1473: * is deferred.
1474: */
1.11 christos 1475: /* ARGSUSED */
1.1 jtc 1476: static void
1.2 jtc 1477: mccase(p, cs)
1.9 perry 1478: struct parse *p;
1479: cset *cs;
1.1 jtc 1480: {
1.14 lukem 1481:
1482: _DIAGASSERT(p != NULL);
1483: _DIAGASSERT(cs != NULL);
1484:
1.1 jtc 1485: assert(cs->multis == NULL); /* xxx */
1486: }
1487:
1488: /*
1489: - isinsets - is this character in any sets?
1.9 perry 1490: == static int isinsets(struct re_guts *g, int c);
1.1 jtc 1491: */
1492: static int /* predicate */
1493: isinsets(g, c)
1.9 perry 1494: struct re_guts *g;
1.1 jtc 1495: int c;
1496: {
1.9 perry 1497: uch *col;
1498: int i;
1.14 lukem 1499: int ncols;
1.9 perry 1500: unsigned uc = (unsigned char)c;
1.1 jtc 1501:
1.14 lukem 1502: _DIAGASSERT(g != NULL);
1503:
1504: ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1505:
1.1 jtc 1506: for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1507: if (col[uc] != 0)
1508: return(1);
1509: return(0);
1510: }
1511:
1512: /*
1513: - samesets - are these two characters in exactly the same sets?
1.9 perry 1514: == static int samesets(struct re_guts *g, int c1, int c2);
1.1 jtc 1515: */
1516: static int /* predicate */
1517: samesets(g, c1, c2)
1.9 perry 1518: struct re_guts *g;
1.1 jtc 1519: int c1;
1520: int c2;
1521: {
1.9 perry 1522: uch *col;
1523: int i;
1.14 lukem 1524: int ncols;
1.9 perry 1525: unsigned uc1 = (unsigned char)c1;
1526: unsigned uc2 = (unsigned char)c2;
1.1 jtc 1527:
1.14 lukem 1528: _DIAGASSERT(g != NULL);
1529:
1530: ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1531:
1.1 jtc 1532: for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1533: if (col[uc1] != col[uc2])
1534: return(0);
1535: return(1);
1536: }
1537:
1538: /*
1539: - categorize - sort out character categories
1.9 perry 1540: == static void categorize(struct parse *p, struct re_guts *g);
1.1 jtc 1541: */
1542: static void
1543: categorize(p, g)
1544: struct parse *p;
1.9 perry 1545: struct re_guts *g;
1.1 jtc 1546: {
1.14 lukem 1547: cat_t *cats;
1.9 perry 1548: int c;
1549: int c2;
1550: cat_t cat;
1.1 jtc 1551:
1.14 lukem 1552: _DIAGASSERT(p != NULL);
1553: _DIAGASSERT(g != NULL);
1554:
1555: cats = g->categories;
1556:
1.1 jtc 1557: /* avoid making error situations worse */
1558: if (p->error != 0)
1559: return;
1560:
1561: for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1562: if (cats[c] == 0 && isinsets(g, c)) {
1563: cat = g->ncategories++;
1564: cats[c] = cat;
1565: for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1566: if (cats[c2] == 0 && samesets(g, c, c2))
1567: cats[c2] = cat;
1568: }
1569: }
1570:
1571: /*
1572: - dupl - emit a duplicate of a bunch of sops
1.9 perry 1573: == static sopno dupl(struct parse *p, sopno start, sopno finish);
1.1 jtc 1574: */
1575: static sopno /* start of duplicate */
1576: dupl(p, start, finish)
1.9 perry 1577: struct parse *p;
1.1 jtc 1578: sopno start; /* from here */
1579: sopno finish; /* to this less one */
1580: {
1.14 lukem 1581: sopno ret;
1.9 perry 1582: sopno len = finish - start;
1.1 jtc 1583:
1.14 lukem 1584: _DIAGASSERT(p != NULL);
1585:
1586: ret = HERE();
1587:
1.1 jtc 1588: assert(finish >= start);
1589: if (len == 0)
1590: return(ret);
1591: enlarge(p, p->ssize + len); /* this many unexpected additions */
1592: assert(p->ssize >= p->slen + len);
1.12 drochner 1593: (void)memcpy(p->strip + p->slen, p->strip + start,
1.11 christos 1594: (size_t)len * sizeof(sop));
1.1 jtc 1595: p->slen += len;
1596: return(ret);
1597: }
1598:
1599: /*
1600: - doemit - emit a strip operator
1.9 perry 1601: == static void doemit(struct parse *p, sop op, size_t opnd);
1.1 jtc 1602: *
1603: * It might seem better to implement this as a macro with a function as
1604: * hard-case backup, but it's just too big and messy unless there are
1605: * some changes to the data structures. Maybe later.
1606: */
1607: static void
1608: doemit(p, op, opnd)
1.9 perry 1609: struct parse *p;
1.1 jtc 1610: sop op;
1.12 drochner 1611: sopno opnd;
1.1 jtc 1612: {
1.14 lukem 1613:
1614: _DIAGASSERT(p != NULL);
1615:
1.1 jtc 1616: /* avoid making error situations worse */
1617: if (p->error != 0)
1618: return;
1619:
1620: /* deal with oversize operands ("can't happen", more or less) */
1621: assert(opnd < 1<<OPSHIFT);
1622:
1623: /* deal with undersized strip */
1624: if (p->slen >= p->ssize)
1625: enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1626: assert(p->slen < p->ssize);
1627:
1628: /* finally, it's all reduced to the easy case */
1629: p->strip[p->slen++] = SOP(op, opnd);
1630: }
1631:
1632: /*
1633: - doinsert - insert a sop into the strip
1.9 perry 1634: == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1.1 jtc 1635: */
1636: static void
1637: doinsert(p, op, opnd, pos)
1.9 perry 1638: struct parse *p;
1.1 jtc 1639: sop op;
1.12 drochner 1640: sopno opnd;
1.1 jtc 1641: sopno pos;
1642: {
1.9 perry 1643: sopno sn;
1644: sop s;
1645: int i;
1.1 jtc 1646:
1.14 lukem 1647: _DIAGASSERT(p != NULL);
1648:
1.1 jtc 1649: /* avoid making error situations worse */
1650: if (p->error != 0)
1651: return;
1652:
1653: sn = HERE();
1654: EMIT(op, opnd); /* do checks, ensure space */
1655: assert(HERE() == sn+1);
1656: s = p->strip[sn];
1657:
1658: /* adjust paren pointers */
1659: assert(pos > 0);
1660: for (i = 1; i < NPAREN; i++) {
1661: if (p->pbegin[i] >= pos) {
1662: p->pbegin[i]++;
1663: }
1664: if (p->pend[i] >= pos) {
1665: p->pend[i]++;
1666: }
1667: }
1668:
1.11 christos 1669: memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop));
1.1 jtc 1670: p->strip[pos] = s;
1671: }
1672:
1673: /*
1674: - dofwd - complete a forward reference
1.9 perry 1675: == static void dofwd(struct parse *p, sopno pos, sop value);
1.1 jtc 1676: */
1677: static void
1678: dofwd(p, pos, value)
1.9 perry 1679: struct parse *p;
1680: sopno pos;
1.12 drochner 1681: sopno value;
1.1 jtc 1682: {
1.14 lukem 1683:
1684: _DIAGASSERT(p != NULL);
1685:
1.1 jtc 1686: /* avoid making error situations worse */
1687: if (p->error != 0)
1688: return;
1689:
1690: assert(value < 1<<OPSHIFT);
1691: p->strip[pos] = OP(p->strip[pos]) | value;
1692: }
1693:
1694: /*
1695: - enlarge - enlarge the strip
1.9 perry 1696: == static void enlarge(struct parse *p, sopno size);
1.1 jtc 1697: */
1698: static void
1699: enlarge(p, size)
1.9 perry 1700: struct parse *p;
1701: sopno size;
1.1 jtc 1702: {
1.9 perry 1703: sop *sp;
1.1 jtc 1704:
1.14 lukem 1705: _DIAGASSERT(p != NULL);
1706:
1.1 jtc 1707: if (p->ssize >= size)
1708: return;
1709:
1710: sp = (sop *)realloc(p->strip, size*sizeof(sop));
1711: if (sp == NULL) {
1712: SETERROR(REG_ESPACE);
1713: return;
1714: }
1715: p->strip = sp;
1716: p->ssize = size;
1717: }
1718:
1719: /*
1720: - stripsnug - compact the strip
1.9 perry 1721: == static void stripsnug(struct parse *p, struct re_guts *g);
1.1 jtc 1722: */
1723: static void
1724: stripsnug(p, g)
1.9 perry 1725: struct parse *p;
1726: struct re_guts *g;
1.1 jtc 1727: {
1.14 lukem 1728:
1729: _DIAGASSERT(p != NULL);
1730: _DIAGASSERT(g != NULL);
1731:
1.1 jtc 1732: g->nstates = p->slen;
1.11 christos 1733: g->strip = realloc(p->strip, p->slen * sizeof(sop));
1.1 jtc 1734: if (g->strip == NULL) {
1735: SETERROR(REG_ESPACE);
1736: g->strip = p->strip;
1737: }
1738: }
1739:
1740: /*
1741: - findmust - fill in must and mlen with longest mandatory literal string
1.9 perry 1742: == static void findmust(struct parse *p, struct re_guts *g);
1.1 jtc 1743: *
1744: * This algorithm could do fancy things like analyzing the operands of |
1745: * for common subsequences. Someday. This code is simple and finds most
1746: * of the interesting cases.
1747: *
1748: * Note that must and mlen got initialized during setup.
1749: */
1750: static void
1751: findmust(p, g)
1752: struct parse *p;
1.9 perry 1753: struct re_guts *g;
1.1 jtc 1754: {
1.9 perry 1755: sop *scan;
1.7 christos 1756: sop *start = NULL;
1.9 perry 1757: sop *newstart = NULL;
1758: sopno newlen;
1759: sop s;
1760: char *cp;
1761: sopno i;
1.1 jtc 1762:
1.14 lukem 1763: _DIAGASSERT(p != NULL);
1764: _DIAGASSERT(g != NULL);
1765:
1.1 jtc 1766: /* avoid making error situations worse */
1767: if (p->error != 0)
1768: return;
1769:
1770: /* find the longest OCHAR sequence in strip */
1771: newlen = 0;
1772: scan = g->strip + 1;
1773: do {
1774: s = *scan++;
1775: switch (OP(s)) {
1776: case OCHAR: /* sequence member */
1777: if (newlen == 0) /* new sequence */
1778: newstart = scan - 1;
1779: newlen++;
1780: break;
1781: case OPLUS_: /* things that don't break one */
1782: case OLPAREN:
1783: case ORPAREN:
1784: break;
1785: case OQUEST_: /* things that must be skipped */
1786: case OCH_:
1787: scan--;
1788: do {
1789: scan += OPND(s);
1790: s = *scan;
1791: /* assert() interferes w debug printouts */
1792: if (OP(s) != O_QUEST && OP(s) != O_CH &&
1793: OP(s) != OOR2) {
1794: g->iflags |= BAD;
1795: return;
1796: }
1797: } while (OP(s) != O_QUEST && OP(s) != O_CH);
1.11 christos 1798: /* FALLTHROUGH */
1.1 jtc 1799: default: /* things that break a sequence */
1800: if (newlen > g->mlen) { /* ends one */
1801: start = newstart;
1802: g->mlen = newlen;
1803: }
1804: newlen = 0;
1805: break;
1806: }
1807: } while (OP(s) != OEND);
1808:
1809: if (g->mlen == 0) /* there isn't one */
1810: return;
1811:
1812: /* turn it into a character string */
1813: g->must = malloc((size_t)g->mlen + 1);
1814: if (g->must == NULL) { /* argh; just forget it */
1815: g->mlen = 0;
1816: return;
1817: }
1818: cp = g->must;
1819: scan = start;
1820: for (i = g->mlen; i > 0; i--) {
1821: while (OP(s = *scan++) != OCHAR)
1822: continue;
1.2 jtc 1823: assert(cp < g->must + g->mlen);
1.1 jtc 1824: *cp++ = (char)OPND(s);
1825: }
1.2 jtc 1826: assert(cp == g->must + g->mlen);
1.1 jtc 1827: *cp++ = '\0'; /* just on general principles */
1828: }
1829:
1830: /*
1831: - pluscount - count + nesting
1.9 perry 1832: == static sopno pluscount(struct parse *p, struct re_guts *g);
1.1 jtc 1833: */
1834: static sopno /* nesting depth */
1835: pluscount(p, g)
1836: struct parse *p;
1.9 perry 1837: struct re_guts *g;
1.1 jtc 1838: {
1.9 perry 1839: sop *scan;
1840: sop s;
1841: sopno plusnest = 0;
1842: sopno maxnest = 0;
1.14 lukem 1843:
1844: _DIAGASSERT(p != NULL);
1845: _DIAGASSERT(g != NULL);
1.1 jtc 1846:
1847: if (p->error != 0)
1848: return(0); /* there may not be an OEND */
1849:
1850: scan = g->strip + 1;
1851: do {
1852: s = *scan++;
1853: switch (OP(s)) {
1854: case OPLUS_:
1855: plusnest++;
1856: break;
1857: case O_PLUS:
1858: if (plusnest > maxnest)
1859: maxnest = plusnest;
1860: plusnest--;
1861: break;
1862: }
1863: } while (OP(s) != OEND);
1864: if (plusnest != 0)
1865: g->iflags |= BAD;
1866: return(maxnest);
1867: }
CVSweb <webmaster@jp.NetBSD.org>