src/lib/libedit/tokenizer.c - annotate

Return to tokenizer.c CVS log
Up to [cvs.NetBSD.org] / src / lib / libedit
Annotation of src/lib/libedit/tokenizer.c, Revision 1.28

1.28    ! christos    1: /*     $NetBSD: tokenizer.c,v 1.27 2016/04/11 16:06:52 christos Exp $  */
1.2       lukem       2:
1.1       cgd         3: /*-
                      4:  * Copyright (c) 1992, 1993
                      5:  *     The Regents of the University of California.  All rights reserved.
                      6:  *
                      7:  * This code is derived from software contributed to Berkeley by
                      8:  * Christos Zoulas of Cornell University.
                      9:  *
                     10:  * Redistribution and use in source and binary forms, with or without
                     11:  * modification, are permitted provided that the following conditions
                     12:  * are met:
                     13:  * 1. Redistributions of source code must retain the above copyright
                     14:  *    notice, this list of conditions and the following disclaimer.
                     15:  * 2. Redistributions in binary form must reproduce the above copyright
                     16:  *    notice, this list of conditions and the following disclaimer in the
                     17:  *    documentation and/or other materials provided with the distribution.
1.12      agc        18:  * 3. Neither the name of the University nor the names of its contributors
1.1       cgd        19:  *    may be used to endorse or promote products derived from this software
                     20:  *    without specific prior written permission.
                     21:  *
                     22:  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
                     23:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     24:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     25:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
                     26:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     27:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     28:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     29:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     30:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     31:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     32:  * SUCH DAMAGE.
                     33:  */
                     34:
1.10      christos   35: #include "config.h"
1.1       cgd        36: #if !defined(lint) && !defined(SCCSID)
1.2       lukem      37: #if 0
1.1       cgd        38: static char sccsid[] = "@(#)tokenizer.c        8.1 (Berkeley) 6/4/93";
1.2       lukem      39: #else
1.28    ! christos   40: __RCSID("$NetBSD: tokenizer.c,v 1.27 2016/04/11 16:06:52 christos Exp $");
1.2       lukem      41: #endif
1.1       cgd        42: #endif /* not lint && not SCCSID */
                     43:
1.16      christos   44: /* We build this file twice, once as NARROW, once as WIDE. */
1.1       cgd        45: /*
                     46:  * tokenizer.c: Bourne shell like tokenizer
                     47:  */
1.24      christos   48: #include <stdlib.h>
1.1       cgd        49: #include <string.h>
1.24      christos   50:
1.14      lukem      51: #include "histedit.h"
1.1       cgd        52:
/*
 * Quoting state carried by the tokenizer from one character to the next.
 */
typedef enum {
	Q_none,		/* not inside any quoting construct */
	Q_single,	/* between a pair of single quotes */
	Q_double,	/* between a pair of double quotes */
	Q_one,		/* backslash seen while unquoted */
	Q_doubleone	/* backslash seen inside double quotes */
} quote_t;
1.1       cgd        56:
/* Bits stored in the tokenizer's flags member. */
#define	TOK_KEEP	0x1	/* emit the current word even when empty */
#define	TOK_EAT		0x2	/* a backslash-newline was just swallowed */

/* Growth steps used when the tokenizer's buffers fill up. */
#define	WINCR		20	/* word buffer, counted in characters */
#define	AINCR		10	/* argument vector, counted in entries */

/* Field separators used when the caller passes no ifs of its own. */
#define	IFS		STR("\t \n")

/* Allocation hooks; all three forward straight to the C library. */
#define	tok_malloc(sz)		malloc(sz)
#define	tok_free(p)		free(p)
#define	tok_realloc(p, sz)	realloc(p, sz)
1.1       cgd        68:
/*
 * Character-width abstraction.  With NARROWCHAR defined the file operates
 * on char strings and FUN(tok,x) names tok_x; otherwise it operates on
 * wchar_t strings, FUN(tok,x) names tok_wx and TYPE(T) names TW.
 */
#if !defined(NARROWCHAR)
#define	Char			wchar_t
#define	FUN(prefix, rest)	prefix ## _w ## rest
#define	TYPE(type)		type ## W
#define	STR(x)			L ## x
#define	Strchr(s, c)		wcschr(s, c)
#define	tok_strdup(s)		wcsdup(s)
#else
#define	Char			char
#define	FUN(prefix, rest)	prefix ## _ ## rest
#define	TYPE(type)		type
#define	STR(x)			x
#define	Strchr(s, c)		strchr(s, c)
#define	tok_strdup(s)		strdup(s)
#endif
1.1       cgd        84:
1.18      christos   85: struct TYPE(tokenizer) {
1.16      christos   86:        Char    *ifs;           /* In field separator                    */
1.21      christos   87:        size_t   argc, amax;    /* Current and maximum number of args    */
1.16      christos   88:        Char   **argv;          /* Argument list                         */
                     89:        Char    *wptr, *wmax;   /* Space and limit on the word buffer    */
                     90:        Char    *wstart;        /* Beginning of next word                */
                     91:        Char    *wspace;        /* Space of word buffer                  */
1.6       lukem      92:        quote_t  quote;         /* Quoting state                         */
                     93:        int      flags;         /* flags;                                */
1.1       cgd        94: };
                     95:
                     96:
1.28    ! christos   97: static void FUN(tok,finish)(TYPE(Tokenizer) *);
1.1       cgd        98:
                     99:
1.16      christos  100: /* FUN(tok,finish)():
1.1       cgd       101:  *     Finish a word in the tokenizer.
                    102:  */
1.28    ! christos  103: static void
1.16      christos  104: FUN(tok,finish)(TYPE(Tokenizer) *tok)
1.1       cgd       105: {
1.6       lukem     106:
                    107:        *tok->wptr = '\0';
                    108:        if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
                    109:                tok->argv[tok->argc++] = tok->wstart;
                    110:                tok->argv[tok->argc] = NULL;
                    111:                tok->wstart = ++tok->wptr;
                    112:        }
                    113:        tok->flags &= ~TOK_KEEP;
1.1       cgd       114: }
                    115:
                    116:
1.16      christos  117: /* FUN(tok,init)():
1.1       cgd       118:  *     Initialize the tokenizer
                    119:  */
1.28    ! christos  120: TYPE(Tokenizer) *
1.16      christos  121: FUN(tok,init)(const Char *ifs)
1.1       cgd       122: {
1.19      christos  123:        TYPE(Tokenizer) *tok = tok_malloc(sizeof(*tok));
1.1       cgd       124:
1.11      christos  125:        if (tok == NULL)
                    126:                return NULL;
1.13      christos  127:        tok->ifs = tok_strdup(ifs ? ifs : IFS);
1.11      christos  128:        if (tok->ifs == NULL) {
1.19      christos  129:                tok_free(tok);
1.11      christos  130:                return NULL;
                    131:        }
1.6       lukem     132:        tok->argc = 0;
                    133:        tok->amax = AINCR;
1.16      christos  134:        tok->argv = tok_malloc(sizeof(*tok->argv) * tok->amax);
1.11      christos  135:        if (tok->argv == NULL) {
1.19      christos  136:                tok_free(tok->ifs);
                    137:                tok_free(tok);
1.11      christos  138:                return NULL;
                    139:        }
1.6       lukem     140:        tok->argv[0] = NULL;
1.16      christos  141:        tok->wspace = tok_malloc(WINCR * sizeof(*tok->wspace));
1.11      christos  142:        if (tok->wspace == NULL) {
1.19      christos  143:                tok_free(tok->argv);
                    144:                tok_free(tok->ifs);
                    145:                tok_free(tok);
1.11      christos  146:                return NULL;
                    147:        }
1.6       lukem     148:        tok->wmax = tok->wspace + WINCR;
                    149:        tok->wstart = tok->wspace;
                    150:        tok->wptr = tok->wspace;
                    151:        tok->flags = 0;
                    152:        tok->quote = Q_none;
1.1       cgd       153:
1.20      christos  154:        return tok;
1.1       cgd       155: }
                    156:
                    157:
1.16      christos  158: /* FUN(tok,reset)():
1.1       cgd       159:  *     Reset the tokenizer
                    160:  */
1.28    ! christos  161: void
1.16      christos  162: FUN(tok,reset)(TYPE(Tokenizer) *tok)
1.1       cgd       163: {
1.6       lukem     164:
                    165:        tok->argc = 0;
                    166:        tok->wstart = tok->wspace;
                    167:        tok->wptr = tok->wspace;
                    168:        tok->flags = 0;
                    169:        tok->quote = Q_none;
1.1       cgd       170: }
                    171:
                    172:
1.16      christos  173: /* FUN(tok,end)():
1.1       cgd       174:  *     Clean up
                    175:  */
1.28    ! christos  176: void
1.16      christos  177: FUN(tok,end)(TYPE(Tokenizer) *tok)
1.1       cgd       178: {
1.6       lukem     179:
1.19      christos  180:        tok_free(tok->ifs);
                    181:        tok_free(tok->wspace);
                    182:        tok_free(tok->argv);
                    183:        tok_free(tok);
1.1       cgd       184: }
                    185:
                    186:
                    187:
1.16      christos  188: /* FUN(tok,line)():
1.14      lukem     189:  *     Bourne shell (sh(1)) like tokenizing
                    190:  *     Arguments:
1.16      christos  191:  *             tok     current tokenizer state (setup with FUN(tok,init)())
1.14      lukem     192:  *             line    line to parse
                    193:  *     Returns:
                    194:  *             -1      Internal error
                    195:  *              3      Quoted return
                    196:  *              2      Unmatched double quote
                    197:  *              1      Unmatched single quote
                    198:  *              0      Ok
                    199:  *     Modifies (if return value is 0):
                    200:  *             argc    number of arguments
                    201:  *             argv    argument array
                    202:  *             cursorc if !NULL, argv element containing cursor
                    203:  *             cursorv if !NULL, offset in argv[cursorc] of cursor
1.1       cgd       204:  */
1.28    ! christos  205: int
1.17      christos  206: FUN(tok,line)(TYPE(Tokenizer) *tok, const TYPE(LineInfo) *line,
1.16      christos  207:     int *argc, const Char ***argv, int *cursorc, int *cursoro)
1.1       cgd       208: {
1.16      christos  209:        const Char *ptr;
1.14      lukem     210:        int cc, co;
1.1       cgd       211:
1.14      lukem     212:        cc = co = -1;
                    213:        ptr = line->buffer;
                    214:        for (ptr = line->buffer; ;ptr++) {
                    215:                if (ptr >= line->lastchar)
1.16      christos  216:                        ptr = STR("");
1.14      lukem     217:                if (ptr == line->cursor) {
1.21      christos  218:                        cc = (int)tok->argc;
1.15      christos  219:                        co = (int)(tok->wptr - tok->wstart);
1.14      lukem     220:                }
                    221:                switch (*ptr) {
1.6       lukem     222:                case '\'':
                    223:                        tok->flags |= TOK_KEEP;
                    224:                        tok->flags &= ~TOK_EAT;
                    225:                        switch (tok->quote) {
                    226:                        case Q_none:
                    227:                                tok->quote = Q_single;  /* Enter single quote
                    228:                                                         * mode */
                    229:                                break;
                    230:
                    231:                        case Q_single:  /* Exit single quote mode */
                    232:                                tok->quote = Q_none;
                    233:                                break;
                    234:
                    235:                        case Q_one:     /* Quote this ' */
                    236:                                tok->quote = Q_none;
                    237:                                *tok->wptr++ = *ptr;
                    238:                                break;
                    239:
                    240:                        case Q_double:  /* Stay in double quote mode */
                    241:                                *tok->wptr++ = *ptr;
                    242:                                break;
                    243:
                    244:                        case Q_doubleone:       /* Quote this ' */
                    245:                                tok->quote = Q_double;
                    246:                                *tok->wptr++ = *ptr;
                    247:                                break;
                    248:
                    249:                        default:
1.20      christos  250:                                return -1;
1.6       lukem     251:                        }
                    252:                        break;
                    253:
                    254:                case '"':
                    255:                        tok->flags &= ~TOK_EAT;
                    256:                        tok->flags |= TOK_KEEP;
                    257:                        switch (tok->quote) {
                    258:                        case Q_none:    /* Enter double quote mode */
                    259:                                tok->quote = Q_double;
                    260:                                break;
                    261:
                    262:                        case Q_double:  /* Exit double quote mode */
                    263:                                tok->quote = Q_none;
                    264:                                break;
                    265:
                    266:                        case Q_one:     /* Quote this " */
                    267:                                tok->quote = Q_none;
                    268:                                *tok->wptr++ = *ptr;
                    269:                                break;
                    270:
                    271:                        case Q_single:  /* Stay in single quote mode */
                    272:                                *tok->wptr++ = *ptr;
                    273:                                break;
                    274:
                    275:                        case Q_doubleone:       /* Quote this " */
                    276:                                tok->quote = Q_double;
                    277:                                *tok->wptr++ = *ptr;
                    278:                                break;
                    279:
                    280:                        default:
1.20      christos  281:                                return -1;
1.6       lukem     282:                        }
                    283:                        break;
                    284:
                    285:                case '\\':
                    286:                        tok->flags |= TOK_KEEP;
                    287:                        tok->flags &= ~TOK_EAT;
                    288:                        switch (tok->quote) {
                    289:                        case Q_none:    /* Quote next character */
                    290:                                tok->quote = Q_one;
                    291:                                break;
                    292:
                    293:                        case Q_double:  /* Quote next character */
                    294:                                tok->quote = Q_doubleone;
                    295:                                break;
                    296:
                    297:                        case Q_one:     /* Quote this, restore state */
                    298:                                *tok->wptr++ = *ptr;
                    299:                                tok->quote = Q_none;
                    300:                                break;
                    301:
                    302:                        case Q_single:  /* Stay in single quote mode */
                    303:                                *tok->wptr++ = *ptr;
                    304:                                break;
                    305:
                    306:                        case Q_doubleone:       /* Quote this \ */
                    307:                                tok->quote = Q_double;
                    308:                                *tok->wptr++ = *ptr;
                    309:                                break;
                    310:
                    311:                        default:
1.20      christos  312:                                return -1;
1.6       lukem     313:                        }
                    314:                        break;
                    315:
                    316:                case '\n':
                    317:                        tok->flags &= ~TOK_EAT;
                    318:                        switch (tok->quote) {
                    319:                        case Q_none:
1.14      lukem     320:                                goto tok_line_outok;
1.6       lukem     321:
                    322:                        case Q_single:
                    323:                        case Q_double:
                    324:                                *tok->wptr++ = *ptr;    /* Add the return */
                    325:                                break;
                    326:
                    327:                        case Q_doubleone:   /* Back to double, eat the '\n' */
                    328:                                tok->flags |= TOK_EAT;
                    329:                                tok->quote = Q_double;
                    330:                                break;
                    331:
                    332:                        case Q_one:     /* No quote, more eat the '\n' */
                    333:                                tok->flags |= TOK_EAT;
                    334:                                tok->quote = Q_none;
                    335:                                break;
                    336:
                    337:                        default:
1.20      christos  338:                                return 0;
1.6       lukem     339:                        }
                    340:                        break;
                    341:
                    342:                case '\0':
                    343:                        switch (tok->quote) {
                    344:                        case Q_none:
                    345:                                /* Finish word and return */
                    346:                                if (tok->flags & TOK_EAT) {
                    347:                                        tok->flags &= ~TOK_EAT;
1.20      christos  348:                                        return 3;
1.6       lukem     349:                                }
1.14      lukem     350:                                goto tok_line_outok;
1.6       lukem     351:
                    352:                        case Q_single:
1.20      christos  353:                                return 1;
1.6       lukem     354:
                    355:                        case Q_double:
1.20      christos  356:                                return 2;
1.6       lukem     357:
                    358:                        case Q_doubleone:
                    359:                                tok->quote = Q_double;
                    360:                                *tok->wptr++ = *ptr;
                    361:                                break;
                    362:
                    363:                        case Q_one:
                    364:                                tok->quote = Q_none;
                    365:                                *tok->wptr++ = *ptr;
                    366:                                break;
                    367:
                    368:                        default:
1.20      christos  369:                                return -1;
1.6       lukem     370:                        }
                    371:                        break;
                    372:
                    373:                default:
                    374:                        tok->flags &= ~TOK_EAT;
                    375:                        switch (tok->quote) {
                    376:                        case Q_none:
1.16      christos  377:                                if (Strchr(tok->ifs, *ptr) != NULL)
                    378:                                        FUN(tok,finish)(tok);
1.6       lukem     379:                                else
                    380:                                        *tok->wptr++ = *ptr;
                    381:                                break;
                    382:
                    383:                        case Q_single:
                    384:                        case Q_double:
                    385:                                *tok->wptr++ = *ptr;
                    386:                                break;
                    387:
                    388:
                    389:                        case Q_doubleone:
                    390:                                *tok->wptr++ = '\\';
                    391:                                tok->quote = Q_double;
                    392:                                *tok->wptr++ = *ptr;
                    393:                                break;
                    394:
                    395:                        case Q_one:
                    396:                                tok->quote = Q_none;
                    397:                                *tok->wptr++ = *ptr;
                    398:                                break;
1.1       cgd       399:
1.6       lukem     400:                        default:
1.20      christos  401:                                return -1;
1.1       cgd       402:
1.6       lukem     403:                        }
                    404:                        break;
                    405:                }
1.1       cgd       406:
1.6       lukem     407:                if (tok->wptr >= tok->wmax - 4) {
1.21      christos  408:                        size_t size = (size_t)(tok->wmax - tok->wspace + WINCR);
1.16      christos  409:                        Char *s = tok_realloc(tok->wspace,
                    410:                            size * sizeof(*s));
1.7       christos  411:                        if (s == NULL)
1.20      christos  412:                                return -1;
1.6       lukem     413:
1.8       christos  414:                        if (s != tok->wspace) {
1.21      christos  415:                                size_t i;
1.8       christos  416:                                for (i = 0; i < tok->argc; i++) {
                    417:                                    tok->argv[i] =
                    418:                                        (tok->argv[i] - tok->wspace) + s;
                    419:                                }
                    420:                                tok->wptr = (tok->wptr - tok->wspace) + s;
                    421:                                tok->wstart = (tok->wstart - tok->wspace) + s;
1.6       lukem     422:                                tok->wspace = s;
                    423:                        }
1.9       christos  424:                        tok->wmax = s + size;
1.6       lukem     425:                }
                    426:                if (tok->argc >= tok->amax - 4) {
1.16      christos  427:                        Char **p;
1.6       lukem     428:                        tok->amax += AINCR;
1.16      christos  429:                        p = tok_realloc(tok->argv, tok->amax * sizeof(*p));
1.23      christos  430:                        if (p == NULL) {
                    431:                                tok->amax -= AINCR;
1.20      christos  432:                                return -1;
1.23      christos  433:                        }
1.7       christos  434:                        tok->argv = p;
1.6       lukem     435:                }
1.1       cgd       436:        }
1.14      lukem     437:  tok_line_outok:
                    438:        if (cc == -1 && co == -1) {
1.21      christos  439:                cc = (int)tok->argc;
1.15      christos  440:                co = (int)(tok->wptr - tok->wstart);
1.14      lukem     441:        }
                    442:        if (cursorc != NULL)
                    443:                *cursorc = cc;
                    444:        if (cursoro != NULL)
                    445:                *cursoro = co;
1.16      christos  446:        FUN(tok,finish)(tok);
                    447:        *argv = (const Char **)tok->argv;
1.21      christos  448:        *argc = (int)tok->argc;
1.20      christos  449:        return 0;
1.14      lukem     450: }
                    451:
1.16      christos  452: /* FUN(tok,str)():
1.14      lukem     453:  *     Simpler version of tok_line, taking a NUL terminated line
                    454:  *     and splitting into words, ignoring cursor state.
                    455:  */
1.28    ! christos  456: int
1.16      christos  457: FUN(tok,str)(TYPE(Tokenizer) *tok, const Char *line, int *argc,
                    458:     const Char ***argv)
1.14      lukem     459: {
1.17      christos  460:        TYPE(LineInfo) li;
1.14      lukem     461:
                    462:        memset(&li, 0, sizeof(li));
                    463:        li.buffer = line;
1.16      christos  464:        li.cursor = li.lastchar = Strchr(line, '\0');
1.22      christos  465:        return FUN(tok,line)(tok, &li, argc, argv, NULL, NULL);
1.1       cgd       466: }
CVSweb <webmaster@jp.NetBSD.org>