Annotation of src/lib/libc/regex/engine.c, Revision 1.11
1.11 ! christos 1: /* $NetBSD: engine.c,v 1.10 1998/12/08 14:00:24 drochner Exp $ */
1.5 cgd 2:
1.4 cgd 3: /*-
4: * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5: * Copyright (c) 1992, 1993, 1994
6: * The Regents of the University of California. All rights reserved.
7: *
8: * This code is derived from software contributed to Berkeley by
9: * Henry Spencer.
10: *
11: * Redistribution and use in source and binary forms, with or without
12: * modification, are permitted provided that the following conditions
13: * are met:
14: * 1. Redistributions of source code must retain the above copyright
15: * notice, this list of conditions and the following disclaimer.
16: * 2. Redistributions in binary form must reproduce the above copyright
17: * notice, this list of conditions and the following disclaimer in the
18: * documentation and/or other materials provided with the distribution.
19: * 3. All advertising materials mentioning features or use of this software
20: * must display the following acknowledgement:
21: * This product includes software developed by the University of
22: * California, Berkeley and its contributors.
23: * 4. Neither the name of the University nor the names of its contributors
24: * may be used to endorse or promote products derived from this software
25: * without specific prior written permission.
26: *
27: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37: * SUCH DAMAGE.
38: *
39: * @(#)engine.c 8.5 (Berkeley) 3/20/94
40: */
41:
1.1 jtc 42: /*
43: * The matching engine and friends. This file is #included by regexec.c
44: * after suitable #defines of a variety of macros used herein, so that
45: * different state representations can be used without duplicating masses
46: * of code.
47: */
48:
49: #ifdef SNAMES
50: #define matcher smatcher
51: #define fast sfast
52: #define slow sslow
53: #define dissect sdissect
54: #define backref sbackref
55: #define step sstep
56: #define print sprint
57: #define at sat
58: #define match smat
59: #endif
60: #ifdef LNAMES
61: #define matcher lmatcher
62: #define fast lfast
63: #define slow lslow
64: #define dissect ldissect
65: #define backref lbackref
66: #define step lstep
67: #define print lprint
68: #define at lat
69: #define match lmat
70: #endif
71:
72: /* another structure passed up and down to avoid zillions of parameters */
73: struct match {
74: struct re_guts *g;
75: int eflags;
76: regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
77: char *offp; /* offsets work from here */
78: char *beginp; /* start of string -- virtual NUL precedes */
79: char *endp; /* end of string -- virtual NUL here */
80: char *coldp; /* can be no match starting before here */
81: char **lastpos; /* [nplus+1] */
82: STATEVARS;
83: states st; /* current states */
84: states fresh; /* states for a fresh start */
85: states tmp; /* temporary */
86: states empty; /* empty set of states */
87: };
88:
1.4 cgd 89: /* ========= begin header generated by ./mkh ========= */
90: #ifdef __cplusplus
91: extern "C" {
92: #endif
93:
94: /* === engine.c === */
95: static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags));
96: static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
97: static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev));
98: static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
99: static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
100: static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft));
101: #define BOL (OUT+1)
102: #define EOL (BOL+1)
103: #define BOLEOL (BOL+2)
104: #define NOTHING (BOL+3)
105: #define BOW (BOL+4)
106: #define EOW (BOL+5)
107: #define CODEMAX (BOL+5) /* highest code used */
108: #define NONCHAR(c) ((c) > CHAR_MAX)
109: #define NNONCHAR (CODEMAX-CHAR_MAX)
110: #ifdef REDEBUG
111: static void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
112: #endif
113: #ifdef REDEBUG
114: static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
115: #endif
116: #ifdef REDEBUG
117: static char *pchar __P((int ch));
118: #endif
119:
120: #ifdef __cplusplus
121: }
122: #endif
123: /* ========= end header generated by ./mkh ========= */
1.1 jtc 124:
125: #ifdef REDEBUG
126: #define SP(t, s, c) print(m, t, s, c, stdout)
127: #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
128: #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); }
129: #else
130: #define SP(t, s, c) /* nothing */
131: #define AT(t, p1, p2, s1, s2) /* nothing */
132: #define NOTE(s) /* nothing */
133: #endif
134:
135: /*
136: - matcher - the actual matching engine
1.8 perry 137: == static int matcher(struct re_guts *g, char *string, \
1.1 jtc 138: == size_t nmatch, regmatch_t pmatch[], int eflags);
139: */
140: static int /* 0 success, REG_NOMATCH failure */
141: matcher(g, string, nmatch, pmatch, eflags)
1.8 perry 142: struct re_guts *g;
1.1 jtc 143: char *string;
144: size_t nmatch;
145: regmatch_t pmatch[];
146: int eflags;
147: {
1.8 perry 148: char *endp;
149: int i;
1.1 jtc 150: struct match mv;
1.8 perry 151: struct match *m = &mv;
152: char *dp;
153: const sopno gf = g->firststate+1; /* +1 for OEND */
154: const sopno gl = g->laststate;
1.1 jtc 155: char *start;
156: char *stop;
157:
158: /* simplify the situation where possible */
159: if (g->cflags®_NOSUB)
160: nmatch = 0;
161: if (eflags®_STARTEND) {
1.11 ! christos 162: start = string + (size_t)pmatch[0].rm_so;
! 163: stop = string + (size_t)pmatch[0].rm_eo;
1.1 jtc 164: } else {
165: start = string;
166: stop = start + strlen(start);
167: }
168: if (stop < start)
169: return(REG_INVARG);
170:
171: /* prescreening; this does wonders for this rather slow code */
172: if (g->must != NULL) {
173: for (dp = start; dp < stop; dp++)
174: if (*dp == g->must[0] && stop - dp >= g->mlen &&
175: memcmp(dp, g->must, (size_t)g->mlen) == 0)
176: break;
177: if (dp == stop) /* we didn't find g->must */
178: return(REG_NOMATCH);
179: }
180:
181: /* match struct setup */
182: m->g = g;
183: m->eflags = eflags;
184: m->pmatch = NULL;
185: m->lastpos = NULL;
186: m->offp = string;
187: m->beginp = start;
188: m->endp = stop;
189: STATESETUP(m, 4);
190: SETUP(m->st);
191: SETUP(m->fresh);
192: SETUP(m->tmp);
193: SETUP(m->empty);
194: CLEAR(m->empty);
195:
196: /* this loop does only one repetition except for backrefs */
197: for (;;) {
198: endp = fast(m, start, stop, gf, gl);
199: if (endp == NULL) { /* a miss */
200: STATETEARDOWN(m);
201: return(REG_NOMATCH);
202: }
203: if (nmatch == 0 && !g->backrefs)
204: break; /* no further info needed */
205:
206: /* where? */
207: assert(m->coldp != NULL);
208: for (;;) {
209: NOTE("finding start");
210: endp = slow(m, m->coldp, stop, gf, gl);
211: if (endp != NULL)
212: break;
213: assert(m->coldp < m->endp);
214: m->coldp++;
215: }
216: if (nmatch == 1 && !g->backrefs)
217: break; /* no further info needed */
218:
219: /* oh my, he wants the subexpressions... */
220: if (m->pmatch == NULL)
221: m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
222: sizeof(regmatch_t));
223: if (m->pmatch == NULL) {
224: STATETEARDOWN(m);
225: return(REG_ESPACE);
226: }
227: for (i = 1; i <= m->g->nsub; i++)
1.10 drochner 228: m->pmatch[i].rm_so = m->pmatch[i].rm_eo = (regoff_t)-1;
1.1 jtc 229: if (!g->backrefs && !(m->eflags®_BACKR)) {
230: NOTE("dissecting");
231: dp = dissect(m, m->coldp, endp, gf, gl);
232: } else {
233: if (g->nplus > 0 && m->lastpos == NULL)
234: m->lastpos = (char **)malloc((g->nplus+1) *
235: sizeof(char *));
236: if (g->nplus > 0 && m->lastpos == NULL) {
237: free(m->pmatch);
238: STATETEARDOWN(m);
239: return(REG_ESPACE);
240: }
241: NOTE("backref dissect");
242: dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
243: }
244: if (dp != NULL)
245: break;
246:
247: /* uh-oh... we couldn't find a subexpression-level match */
248: assert(g->backrefs); /* must be back references doing it */
249: assert(g->nplus == 0 || m->lastpos != NULL);
250: for (;;) {
251: if (dp != NULL || endp <= m->coldp)
252: break; /* defeat */
253: NOTE("backoff");
254: endp = slow(m, m->coldp, endp-1, gf, gl);
255: if (endp == NULL)
256: break; /* defeat */
257: /* try it on a shorter possibility */
258: #ifndef NDEBUG
259: for (i = 1; i <= m->g->nsub; i++) {
1.10 drochner 260: assert(m->pmatch[i].rm_so == (regoff_t)-1);
261: assert(m->pmatch[i].rm_eo == (regoff_t)-1);
1.1 jtc 262: }
263: #endif
264: NOTE("backoff dissect");
265: dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
266: }
267: assert(dp == NULL || dp == endp);
268: if (dp != NULL) /* found a shorter one */
269: break;
270:
271: /* despite initial appearances, there is no match here */
272: NOTE("false alarm");
273: start = m->coldp + 1; /* recycle starting later */
274: assert(start <= stop);
275: }
276:
277: /* fill in the details if requested */
278: if (nmatch > 0) {
279: pmatch[0].rm_so = m->coldp - m->offp;
280: pmatch[0].rm_eo = endp - m->offp;
281: }
282: if (nmatch > 1) {
283: assert(m->pmatch != NULL);
284: for (i = 1; i < nmatch; i++)
285: if (i <= m->g->nsub)
286: pmatch[i] = m->pmatch[i];
287: else {
1.10 drochner 288: pmatch[i].rm_so = (regoff_t)-1;
289: pmatch[i].rm_eo = (regoff_t)-1;
1.1 jtc 290: }
291: }
292:
293: if (m->pmatch != NULL)
1.9 christos 294: free(m->pmatch);
1.1 jtc 295: if (m->lastpos != NULL)
1.9 christos 296: free(m->lastpos);
1.1 jtc 297: STATETEARDOWN(m);
298: return(0);
299: }
300:
301: /*
302: - dissect - figure out what matched what, no back references
1.8 perry 303: == static char *dissect(struct match *m, char *start, \
1.1 jtc 304: == char *stop, sopno startst, sopno stopst);
305: */
306: static char * /* == stop (success) always */
307: dissect(m, start, stop, startst, stopst)
1.8 perry 308: struct match *m;
1.1 jtc 309: char *start;
310: char *stop;
311: sopno startst;
312: sopno stopst;
313: {
1.8 perry 314: int i;
315: sopno ss; /* start sop of current subRE */
316: sopno es; /* end sop of current subRE */
317: char *sp; /* start of string matched by it */
318: char *stp; /* string matched by it cannot pass here */
319: char *rest; /* start of rest of string */
320: char *tail; /* string unmatched by rest of RE */
321: sopno ssub; /* start sop of subsubRE */
322: sopno esub; /* end sop of subsubRE */
323: char *ssp; /* start of string matched by subsubRE */
324: char *sep; /* end of string matched by subsubRE */
325: char *oldssp; /* previous ssp */
1.9 christos 326: #ifndef NDEBUG
1.8 perry 327: char *dp;
1.9 christos 328: #endif
1.1 jtc 329:
330: AT("diss", start, stop, startst, stopst);
331: sp = start;
332: for (ss = startst; ss < stopst; ss = es) {
333: /* identify end of subRE */
334: es = ss;
335: switch (OP(m->g->strip[es])) {
336: case OPLUS_:
337: case OQUEST_:
338: es += OPND(m->g->strip[es]);
339: break;
340: case OCH_:
341: while (OP(m->g->strip[es]) != O_CH)
342: es += OPND(m->g->strip[es]);
343: break;
344: }
345: es++;
346:
347: /* figure out what it matched */
348: switch (OP(m->g->strip[ss])) {
349: case OEND:
350: assert(nope);
351: break;
352: case OCHAR:
353: sp++;
354: break;
355: case OBOL:
356: case OEOL:
357: case OBOW:
358: case OEOW:
359: break;
360: case OANY:
361: case OANYOF:
362: sp++;
363: break;
364: case OBACK_:
365: case O_BACK:
366: assert(nope);
367: break;
368: /* cases where length of match is hard to find */
369: case OQUEST_:
370: stp = stop;
371: for (;;) {
372: /* how long could this one be? */
373: rest = slow(m, sp, stp, ss, es);
374: assert(rest != NULL); /* it did match */
375: /* could the rest match the rest? */
376: tail = slow(m, rest, stop, es, stopst);
377: if (tail == stop)
378: break; /* yes! */
379: /* no -- try a shorter match for this one */
380: stp = rest - 1;
381: assert(stp >= sp); /* it did work */
382: }
383: ssub = ss + 1;
384: esub = es - 1;
385: /* did innards match? */
386: if (slow(m, sp, rest, ssub, esub) != NULL) {
1.9 christos 387: #ifdef NDEBUG
388: (void)
389: #else
390: dp =
391: #endif
392: dissect(m, sp, rest, ssub, esub);
1.1 jtc 393: assert(dp == rest);
394: } else /* no */
395: assert(sp == rest);
396: sp = rest;
397: break;
398: case OPLUS_:
399: stp = stop;
400: for (;;) {
401: /* how long could this one be? */
402: rest = slow(m, sp, stp, ss, es);
403: assert(rest != NULL); /* it did match */
404: /* could the rest match the rest? */
405: tail = slow(m, rest, stop, es, stopst);
406: if (tail == stop)
407: break; /* yes! */
408: /* no -- try a shorter match for this one */
409: stp = rest - 1;
410: assert(stp >= sp); /* it did work */
411: }
412: ssub = ss + 1;
413: esub = es - 1;
414: ssp = sp;
415: oldssp = ssp;
416: for (;;) { /* find last match of innards */
417: sep = slow(m, ssp, rest, ssub, esub);
418: if (sep == NULL || sep == ssp)
419: break; /* failed or matched null */
420: oldssp = ssp; /* on to next try */
421: ssp = sep;
422: }
423: if (sep == NULL) {
424: /* last successful match */
425: sep = ssp;
426: ssp = oldssp;
427: }
428: assert(sep == rest); /* must exhaust substring */
429: assert(slow(m, ssp, sep, ssub, esub) == rest);
1.9 christos 430: #ifdef NDEBUG
431: (void)
432: #else
433: dp =
434: #endif
435: dissect(m, ssp, sep, ssub, esub);
1.1 jtc 436: assert(dp == sep);
437: sp = rest;
438: break;
439: case OCH_:
440: stp = stop;
441: for (;;) {
442: /* how long could this one be? */
443: rest = slow(m, sp, stp, ss, es);
444: assert(rest != NULL); /* it did match */
445: /* could the rest match the rest? */
446: tail = slow(m, rest, stop, es, stopst);
447: if (tail == stop)
448: break; /* yes! */
449: /* no -- try a shorter match for this one */
450: stp = rest - 1;
451: assert(stp >= sp); /* it did work */
452: }
453: ssub = ss + 1;
454: esub = ss + OPND(m->g->strip[ss]) - 1;
455: assert(OP(m->g->strip[esub]) == OOR1);
456: for (;;) { /* find first matching branch */
457: if (slow(m, sp, rest, ssub, esub) == rest)
458: break; /* it matched all of it */
459: /* that one missed, try next one */
460: assert(OP(m->g->strip[esub]) == OOR1);
461: esub++;
462: assert(OP(m->g->strip[esub]) == OOR2);
463: ssub = esub + 1;
464: esub += OPND(m->g->strip[esub]);
465: if (OP(m->g->strip[esub]) == OOR2)
466: esub--;
467: else
468: assert(OP(m->g->strip[esub]) == O_CH);
469: }
1.9 christos 470: #ifdef NDEBUG
471: (void)
472: #else
473: dp =
474: #endif
475: dissect(m, sp, rest, ssub, esub);
1.1 jtc 476: assert(dp == rest);
477: sp = rest;
478: break;
479: case O_PLUS:
480: case O_QUEST:
481: case OOR1:
482: case OOR2:
483: case O_CH:
484: assert(nope);
485: break;
486: case OLPAREN:
487: i = OPND(m->g->strip[ss]);
1.2 jtc 488: assert(0 < i && i <= m->g->nsub);
1.1 jtc 489: m->pmatch[i].rm_so = sp - m->offp;
490: break;
491: case ORPAREN:
492: i = OPND(m->g->strip[ss]);
1.2 jtc 493: assert(0 < i && i <= m->g->nsub);
1.1 jtc 494: m->pmatch[i].rm_eo = sp - m->offp;
495: break;
496: default: /* uh oh */
497: assert(nope);
498: break;
499: }
500: }
501:
502: assert(sp == stop);
503: return(sp);
504: }
505:
506: /*
507: - backref - figure out what matched what, figuring in back references
1.8 perry 508: == static char *backref(struct match *m, char *start, \
1.1 jtc 509: == char *stop, sopno startst, sopno stopst, sopno lev);
510: */
511: static char * /* == stop (success) or NULL (failure) */
512: backref(m, start, stop, startst, stopst, lev)
1.8 perry 513: struct match *m;
1.1 jtc 514: char *start;
515: char *stop;
516: sopno startst;
517: sopno stopst;
518: sopno lev; /* PLUS nesting level */
519: {
1.8 perry 520: int i;
521: sopno ss; /* start sop of current subRE */
522: char *sp; /* start of string matched by it */
523: sopno ssub; /* start sop of subsubRE */
524: sopno esub; /* end sop of subsubRE */
525: char *ssp; /* start of string matched by subsubRE */
526: char *dp;
527: size_t len;
528: int hard;
529: sop s;
530: regoff_t offsave;
531: cset *cs;
1.1 jtc 532:
533: AT("back", start, stop, startst, stopst);
534: sp = start;
535:
536: /* get as far as we can with easy stuff */
537: hard = 0;
538: for (ss = startst; !hard && ss < stopst; ss++)
539: switch (OP(s = m->g->strip[ss])) {
540: case OCHAR:
541: if (sp == stop || *sp++ != (char)OPND(s))
542: return(NULL);
543: break;
544: case OANY:
545: if (sp == stop)
546: return(NULL);
547: sp++;
548: break;
549: case OANYOF:
550: cs = &m->g->sets[OPND(s)];
551: if (sp == stop || !CHIN(cs, *sp++))
552: return(NULL);
553: break;
554: case OBOL:
555: if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
556: (sp < m->endp && *(sp-1) == '\n' &&
557: (m->g->cflags®_NEWLINE)) )
558: { /* yes */ }
559: else
560: return(NULL);
561: break;
562: case OEOL:
563: if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
564: (sp < m->endp && *sp == '\n' &&
565: (m->g->cflags®_NEWLINE)) )
566: { /* yes */ }
567: else
568: return(NULL);
569: break;
570: case OBOW:
571: if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
572: (sp < m->endp && *(sp-1) == '\n' &&
573: (m->g->cflags®_NEWLINE)) ||
574: (sp > m->beginp &&
1.3 jtc 575: !ISWORD(*(sp-1))) ) &&
576: (sp < m->endp && ISWORD(*sp)) )
1.1 jtc 577: { /* yes */ }
578: else
579: return(NULL);
580: break;
581: case OEOW:
582: if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
583: (sp < m->endp && *sp == '\n' &&
584: (m->g->cflags®_NEWLINE)) ||
1.3 jtc 585: (sp < m->endp && !ISWORD(*sp)) ) &&
586: (sp > m->beginp && ISWORD(*(sp-1))) )
1.1 jtc 587: { /* yes */ }
588: else
589: return(NULL);
590: break;
591: case O_QUEST:
592: break;
593: case OOR1: /* matches null but needs to skip */
594: ss++;
595: s = m->g->strip[ss];
596: do {
597: assert(OP(s) == OOR2);
598: ss += OPND(s);
599: } while (OP(s = m->g->strip[ss]) != O_CH);
600: /* note that the ss++ gets us past the O_CH */
601: break;
602: default: /* have to make a choice */
603: hard = 1;
604: break;
605: }
606: if (!hard) { /* that was it! */
607: if (sp != stop)
608: return(NULL);
609: return(sp);
610: }
611: ss--; /* adjust for the for's final increment */
612:
613: /* the hard stuff */
614: AT("hard", sp, stop, ss, stopst);
615: s = m->g->strip[ss];
616: switch (OP(s)) {
617: case OBACK_: /* the vilest depths */
618: i = OPND(s);
1.2 jtc 619: assert(0 < i && i <= m->g->nsub);
1.10 drochner 620: if (m->pmatch[i].rm_eo == (regoff_t)-1)
1.1 jtc 621: return(NULL);
1.10 drochner 622: assert(m->pmatch[i].rm_so != (regoff_t)-1);
1.11 ! christos 623: len = (size_t)(m->pmatch[i].rm_eo - m->pmatch[i].rm_so);
1.1 jtc 624: assert(stop - m->beginp >= len);
625: if (sp > stop - len)
626: return(NULL); /* not enough left to match */
1.11 ! christos 627: ssp = m->offp + (size_t)m->pmatch[i].rm_so;
1.1 jtc 628: if (memcmp(sp, ssp, len) != 0)
629: return(NULL);
630: while (m->g->strip[ss] != SOP(O_BACK, i))
631: ss++;
632: return(backref(m, sp+len, stop, ss+1, stopst, lev));
1.9 christos 633:
1.1 jtc 634: case OQUEST_: /* to null or not */
635: dp = backref(m, sp, stop, ss+1, stopst, lev);
636: if (dp != NULL)
637: return(dp); /* not */
638: return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
1.9 christos 639:
1.1 jtc 640: case OPLUS_:
641: assert(m->lastpos != NULL);
642: assert(lev+1 <= m->g->nplus);
643: m->lastpos[lev+1] = sp;
644: return(backref(m, sp, stop, ss+1, stopst, lev+1));
1.9 christos 645:
1.1 jtc 646: case O_PLUS:
647: if (sp == m->lastpos[lev]) /* last pass matched null */
648: return(backref(m, sp, stop, ss+1, stopst, lev-1));
649: /* try another pass */
650: m->lastpos[lev] = sp;
651: dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
652: if (dp == NULL)
1.9 christos 653: dp = backref(m, sp, stop, ss+1, stopst, lev-1);
654: return(dp);
655:
1.1 jtc 656: case OCH_: /* find the right one, if any */
657: ssub = ss + 1;
658: esub = ss + OPND(s) - 1;
659: assert(OP(m->g->strip[esub]) == OOR1);
660: for (;;) { /* find first matching branch */
661: dp = backref(m, sp, stop, ssub, esub, lev);
662: if (dp != NULL)
663: return(dp);
664: /* that one missed, try next one */
665: if (OP(m->g->strip[esub]) == O_CH)
666: return(NULL); /* there is none */
667: esub++;
668: assert(OP(m->g->strip[esub]) == OOR2);
669: ssub = esub + 1;
670: esub += OPND(m->g->strip[esub]);
671: if (OP(m->g->strip[esub]) == OOR2)
672: esub--;
673: else
674: assert(OP(m->g->strip[esub]) == O_CH);
675: }
1.9 christos 676:
1.1 jtc 677: case OLPAREN: /* must undo assignment if rest fails */
678: i = OPND(s);
1.2 jtc 679: assert(0 < i && i <= m->g->nsub);
1.1 jtc 680: offsave = m->pmatch[i].rm_so;
681: m->pmatch[i].rm_so = sp - m->offp;
682: dp = backref(m, sp, stop, ss+1, stopst, lev);
683: if (dp != NULL)
684: return(dp);
685: m->pmatch[i].rm_so = offsave;
686: return(NULL);
1.9 christos 687:
1.1 jtc 688: case ORPAREN: /* must undo assignment if rest fails */
689: i = OPND(s);
1.2 jtc 690: assert(0 < i && i <= m->g->nsub);
1.1 jtc 691: offsave = m->pmatch[i].rm_eo;
692: m->pmatch[i].rm_eo = sp - m->offp;
693: dp = backref(m, sp, stop, ss+1, stopst, lev);
694: if (dp != NULL)
695: return(dp);
696: m->pmatch[i].rm_eo = offsave;
697: return(NULL);
1.9 christos 698:
1.1 jtc 699: default: /* uh oh */
700: assert(nope);
701: break;
702: }
703:
704: /* "can't happen" */
705: assert(nope);
706: /* NOTREACHED */
1.7 christos 707: return NULL;
1.1 jtc 708: }
709:
710: /*
711: - fast - step through the string at top speed
1.8 perry 712: == static char *fast(struct match *m, char *start, \
1.1 jtc 713: == char *stop, sopno startst, sopno stopst);
714: */
715: static char * /* where tentative match ended, or NULL */
716: fast(m, start, stop, startst, stopst)
1.8 perry 717: struct match *m;
1.1 jtc 718: char *start;
719: char *stop;
720: sopno startst;
721: sopno stopst;
722: {
1.8 perry 723: states st = m->st;
724: states fresh = m->fresh;
725: states tmp = m->tmp;
726: char *p = start;
727: int c = (start == m->beginp) ? OUT : *(start-1);
728: int lastc; /* previous c */
729: int flagch;
730: int i;
731: char *coldp; /* last p after which no match was underway */
1.1 jtc 732:
733: CLEAR(st);
734: SET1(st, startst);
735: st = step(m->g, startst, stopst, st, NOTHING, st);
736: ASSIGN(fresh, st);
737: SP("start", st, *p);
738: coldp = NULL;
739: for (;;) {
740: /* next character */
741: lastc = c;
742: c = (p == m->endp) ? OUT : *p;
743: if (EQ(st, fresh))
744: coldp = p;
745:
746: /* is there an EOL and/or BOL between lastc and c? */
747: flagch = '\0';
748: i = 0;
749: if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
750: (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
751: flagch = BOL;
752: i = m->g->nbol;
753: }
754: if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
755: (c == OUT && !(m->eflags®_NOTEOL)) ) {
756: flagch = (flagch == BOL) ? BOLEOL : EOL;
757: i += m->g->neol;
758: }
759: if (i != 0) {
760: for (; i > 0; i--)
761: st = step(m->g, startst, stopst, st, flagch, st);
762: SP("boleol", st, c);
763: }
764:
765: /* how about a word boundary? */
1.3 jtc 766: if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
767: (c != OUT && ISWORD(c)) ) {
1.1 jtc 768: flagch = BOW;
769: }
1.3 jtc 770: if ( (lastc != OUT && ISWORD(lastc)) &&
771: (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
1.1 jtc 772: flagch = EOW;
773: }
774: if (flagch == BOW || flagch == EOW) {
775: st = step(m->g, startst, stopst, st, flagch, st);
776: SP("boweow", st, c);
777: }
778:
779: /* are we done? */
780: if (ISSET(st, stopst) || p == stop)
781: break; /* NOTE BREAK OUT */
782:
783: /* no, we must deal with this character */
784: ASSIGN(tmp, st);
785: ASSIGN(st, fresh);
786: assert(c != OUT);
787: st = step(m->g, startst, stopst, tmp, c, st);
788: SP("aft", st, c);
789: assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
790: p++;
791: }
792:
793: assert(coldp != NULL);
794: m->coldp = coldp;
795: if (ISSET(st, stopst))
796: return(p+1);
797: else
798: return(NULL);
799: }
800:
801: /*
802: - slow - step through the string more deliberately
1.8 perry 803: == static char *slow(struct match *m, char *start, \
1.1 jtc 804: == char *stop, sopno startst, sopno stopst);
805: */
806: static char * /* where it ended */
807: slow(m, start, stop, startst, stopst)
1.8 perry 808: struct match *m;
1.1 jtc 809: char *start;
810: char *stop;
811: sopno startst;
812: sopno stopst;
813: {
1.8 perry 814: states st = m->st;
815: states empty = m->empty;
816: states tmp = m->tmp;
817: char *p = start;
818: int c = (start == m->beginp) ? OUT : *(start-1);
819: int lastc; /* previous c */
820: int flagch;
821: int i;
822: char *matchp; /* last p at which a match ended */
1.1 jtc 823:
824: AT("slow", start, stop, startst, stopst);
825: CLEAR(st);
826: SET1(st, startst);
827: SP("sstart", st, *p);
828: st = step(m->g, startst, stopst, st, NOTHING, st);
829: matchp = NULL;
830: for (;;) {
831: /* next character */
832: lastc = c;
833: c = (p == m->endp) ? OUT : *p;
834:
835: /* is there an EOL and/or BOL between lastc and c? */
836: flagch = '\0';
837: i = 0;
838: if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
839: (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
840: flagch = BOL;
841: i = m->g->nbol;
842: }
843: if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
844: (c == OUT && !(m->eflags®_NOTEOL)) ) {
845: flagch = (flagch == BOL) ? BOLEOL : EOL;
846: i += m->g->neol;
847: }
848: if (i != 0) {
849: for (; i > 0; i--)
850: st = step(m->g, startst, stopst, st, flagch, st);
851: SP("sboleol", st, c);
852: }
853:
854: /* how about a word boundary? */
1.3 jtc 855: if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
856: (c != OUT && ISWORD(c)) ) {
1.1 jtc 857: flagch = BOW;
858: }
1.3 jtc 859: if ( (lastc != OUT && ISWORD(lastc)) &&
860: (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
1.1 jtc 861: flagch = EOW;
862: }
863: if (flagch == BOW || flagch == EOW) {
864: st = step(m->g, startst, stopst, st, flagch, st);
865: SP("sboweow", st, c);
866: }
867:
868: /* are we done? */
869: if (ISSET(st, stopst))
870: matchp = p;
871: if (EQ(st, empty) || p == stop)
872: break; /* NOTE BREAK OUT */
873:
874: /* no, we must deal with this character */
875: ASSIGN(tmp, st);
876: ASSIGN(st, empty);
877: assert(c != OUT);
878: st = step(m->g, startst, stopst, tmp, c, st);
879: SP("saft", st, c);
880: assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
881: p++;
882: }
883:
884: return(matchp);
885: }
886:
887:
888: /*
889: - step - map set of states reachable before char to set reachable after
1.8 perry 890: == static states step(struct re_guts *g, sopno start, sopno stop, \
891: == states bef, int ch, states aft);
1.1 jtc 892: == #define BOL (OUT+1)
893: == #define EOL (BOL+1)
894: == #define BOLEOL (BOL+2)
895: == #define NOTHING (BOL+3)
896: == #define BOW (BOL+4)
897: == #define EOW (BOL+5)
898: == #define CODEMAX (BOL+5) // highest code used
899: == #define NONCHAR(c) ((c) > CHAR_MAX)
900: == #define NNONCHAR (CODEMAX-CHAR_MAX)
901: */
902: static states
903: step(g, start, stop, bef, ch, aft)
1.8 perry 904: struct re_guts *g;
1.3 jtc 905: sopno start; /* start state within strip */
906: sopno stop; /* state after stop state within strip */
1.8 perry 907: states bef; /* states reachable before */
1.1 jtc 908: int ch; /* character or NONCHAR code */
1.8 perry 909: states aft; /* states already known reachable after */
1.1 jtc 910: {
1.8 perry 911: cset *cs;
912: sop s;
913: sopno pc;
914: onestate here; /* note, macros know this name */
915: sopno look;
916: int i;
1.1 jtc 917:
918: for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
919: s = g->strip[pc];
920: switch (OP(s)) {
921: case OEND:
922: assert(pc == stop-1);
923: break;
924: case OCHAR:
925: /* only characters can match */
926: assert(!NONCHAR(ch) || ch != (char)OPND(s));
927: if (ch == (char)OPND(s))
928: FWD(aft, bef, 1);
929: break;
930: case OBOL:
931: if (ch == BOL || ch == BOLEOL)
932: FWD(aft, bef, 1);
933: break;
934: case OEOL:
935: if (ch == EOL || ch == BOLEOL)
936: FWD(aft, bef, 1);
937: break;
938: case OBOW:
939: if (ch == BOW)
940: FWD(aft, bef, 1);
941: break;
942: case OEOW:
943: if (ch == EOW)
944: FWD(aft, bef, 1);
945: break;
946: case OANY:
947: if (!NONCHAR(ch))
948: FWD(aft, bef, 1);
949: break;
950: case OANYOF:
951: cs = &g->sets[OPND(s)];
952: if (!NONCHAR(ch) && CHIN(cs, ch))
953: FWD(aft, bef, 1);
954: break;
955: case OBACK_: /* ignored here */
956: case O_BACK:
957: FWD(aft, aft, 1);
958: break;
959: case OPLUS_: /* forward, this is just an empty */
960: FWD(aft, aft, 1);
961: break;
962: case O_PLUS: /* both forward and back */
963: FWD(aft, aft, 1);
964: i = ISSETBACK(aft, OPND(s));
965: BACK(aft, aft, OPND(s));
966: if (!i && ISSETBACK(aft, OPND(s))) {
967: /* oho, must reconsider loop body */
968: pc -= OPND(s) + 1;
969: INIT(here, pc);
970: }
971: break;
972: case OQUEST_: /* two branches, both forward */
973: FWD(aft, aft, 1);
974: FWD(aft, aft, OPND(s));
975: break;
976: case O_QUEST: /* just an empty */
977: FWD(aft, aft, 1);
978: break;
979: case OLPAREN: /* not significant here */
980: case ORPAREN:
981: FWD(aft, aft, 1);
982: break;
983: case OCH_: /* mark the first two branches */
984: FWD(aft, aft, 1);
985: assert(OP(g->strip[pc+OPND(s)]) == OOR2);
986: FWD(aft, aft, OPND(s));
987: break;
988: case OOR1: /* done a branch, find the O_CH */
989: if (ISSTATEIN(aft, here)) {
990: for (look = 1;
991: OP(s = g->strip[pc+look]) != O_CH;
992: look += OPND(s))
993: assert(OP(s) == OOR2);
994: FWD(aft, aft, look);
995: }
996: break;
997: case OOR2: /* propagate OCH_'s marking */
998: FWD(aft, aft, 1);
999: if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1000: assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1001: FWD(aft, aft, OPND(s));
1002: }
1003: break;
1004: case O_CH: /* just empty */
1005: FWD(aft, aft, 1);
1006: break;
1007: default: /* ooooops... */
1008: assert(nope);
1009: break;
1010: }
1011: }
1012:
1013: return(aft);
1014: }
1015:
1016: #ifdef REDEBUG
1017: /*
1018: - print - print a set of states
1019: == #ifdef REDEBUG
1020: == static void print(struct match *m, char *caption, states st, \
1021: == int ch, FILE *d);
1022: == #endif
1023: */
1024: static void
1025: print(m, caption, st, ch, d)
1026: struct match *m;
1027: char *caption;
1028: states st;
1029: int ch;
1030: FILE *d;
1031: {
1.8 perry 1032: struct re_guts *g = m->g;
1033: int i;
1034: int first = 1;
1.1 jtc 1035:
1036: if (!(m->eflags®_TRACE))
1037: return;
1038:
1039: fprintf(d, "%s", caption);
1040: if (ch != '\0')
1041: fprintf(d, " %s", pchar(ch));
1042: for (i = 0; i < g->nstates; i++)
1043: if (ISSET(st, i)) {
1044: fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1045: first = 0;
1046: }
1047: fprintf(d, "\n");
1048: }
1049:
1050: /*
1051: - at - print current situation
1052: == #ifdef REDEBUG
1053: == static void at(struct match *m, char *title, char *start, char *stop, \
1054: == sopno startst, sopno stopst);
1055: == #endif
1056: */
1057: static void
1058: at(m, title, start, stop, startst, stopst)
1059: struct match *m;
1060: char *title;
1061: char *start;
1062: char *stop;
1063: sopno startst;
1064: sopno stopst;
1065: {
1066: if (!(m->eflags®_TRACE))
1067: return;
1068:
1069: printf("%s %s-", title, pchar(*start));
1070: printf("%s ", pchar(*stop));
1071: printf("%ld-%ld\n", (long)startst, (long)stopst);
1072: }
1073:
1074: #ifndef PCHARDONE
1075: #define PCHARDONE /* never again */
1076: /*
1077: - pchar - make a character printable
1078: == #ifdef REDEBUG
1079: == static char *pchar(int ch);
1080: == #endif
1081: *
1082: * Is this identical to regchar() over in debug.c? Well, yes. But a
1083: * duplicate here avoids having a debugging-capable regexec.o tied to
1084: * a matching debug.o, and this is convenient. It all disappears in
1085: * the non-debug compilation anyway, so it doesn't matter much.
1086: */
1087: static char * /* -> representation */
1088: pchar(ch)
1089: int ch;
1090: {
1091: static char pbuf[10];
1092:
1093: if (isprint(ch) || ch == ' ')
1.6 mrg 1094: (void)snprintf(pbuf, sizeof pbuf, "%c", ch);
1.1 jtc 1095: else
1.6 mrg 1096: (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
1.1 jtc 1097: return(pbuf);
1098: }
1099: #endif
1100: #endif
1101:
1102: #undef matcher
1103: #undef fast
1104: #undef slow
1105: #undef dissect
1106: #undef backref
1107: #undef step
1108: #undef print
1109: #undef at
1110: #undef match
CVSweb <webmaster@jp.NetBSD.org>