[BACK]Return to xmltok.c CVS log [TXT][DIR] Up to [cvs.NetBSD.org] / src / external / mit / expat / dist / lib

Annotation of src/external/mit/expat/dist/lib/xmltok.c, Revision 1.1.1.4

1.1       tron        1: /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
                      2:    See the file COPYING for copying permission.
                      3: */
                      4:
                      5: #include <stddef.h>
                      6:
1.1.1.4 ! christos    7: #ifdef _WIN32
1.1       tron        8: #include "winconfig.h"
                      9: #else
                     10: #ifdef HAVE_EXPAT_CONFIG_H
                     11: #include <expat_config.h>
                     12: #endif
1.1.1.4 ! christos   13: #endif /* ndef _WIN32 */
1.1       tron       14:
                     15: #include "expat_external.h"
                     16: #include "internal.h"
                     17: #include "xmltok.h"
                     18: #include "nametab.h"
                     19:
                         /* Optional extra entry for the first brace group of VTABLE1.
                            With XML_DTD defined it expands to ", PREFIX(ignoreSectionTok)"
                            (leading comma included); otherwise it expands to nothing. */
                     20: #ifdef XML_DTD
                     21: #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
                     22: #else
                     23: #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
                     24: #endif
                     25:
                         /* VTABLE1 is the common leading part of an ENCODING initializer:
                            two brace groups of tokenizer functions followed by nine helper
                            functions, every name qualified through PREFIX().  PREFIX must be
                            defined at the point of expansion.  The first brace group gains a
                            fourth entry when XML_DTD is defined (IGNORE_SECTION_TOK_VTABLE).
                            VTABLE is VTABLE1 followed by PREFIX(toUtf8) and PREFIX(toUtf16). */
                     26: #define VTABLE1 \
                     27:   { PREFIX(prologTok), PREFIX(contentTok), \
                     28:     PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
                     29:   { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
                     30:   PREFIX(sameName), \
                     31:   PREFIX(nameMatchesAscii), \
                     32:   PREFIX(nameLength), \
                     33:   PREFIX(skipS), \
                     34:   PREFIX(getAtts), \
                     35:   PREFIX(charRefNumber), \
                     36:   PREFIX(predefinedEntityName), \
                     37:   PREFIX(updatePosition), \
                     38:   PREFIX(isPublicId)
                     39:
                     40: #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
                     41:
                         /* Nonzero when the naming bitmap has the bit set for the 16-bit
                            character with high byte `hi` and low byte `lo`.  pages[hi] selects a
                            group of eight words in namingBitmap, the top three bits of `lo`
                            select the word and the low five bits select the bit within it.
                            Note: `pages` and `hi` are used unparenthesized in pages[hi]. */
                     42: #define UCS2_GET_NAMING(pages, hi, lo) \
1.1.1.3   spz        43:    (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
1.1       tron       44:
                     45: /* A 2 byte UTF-8 representation splits the characters 11 bits between
                     46:    the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
                     47:    pages, 3 bits to add to that index and 5 bits to generate the mask.
                     48: */
                         /* `byte` is read as byte[0] and byte[1] and is expanded several
                            times, so it must be free of side effects; the users in this file
                            pass a const unsigned char *.  The result is nonzero when the
                            selected bit of namingBitmap is set. */
                     49: #define UTF8_GET_NAMING2(pages, byte) \
                     50:     (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                     51:                       + ((((byte)[0]) & 3) << 1) \
                     52:                       + ((((byte)[1]) >> 5) & 1)] \
1.1.1.3   spz        53:          & (1u << (((byte)[1]) & 0x1F)))
1.1       tron       54:
                     55: /* A 3 byte UTF-8 representation splits the characters 16 bits between
                     56:    the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
                     57:    into pages, 3 bits to add to that index and 5 bits to generate the
                     58:    mask.
                     59: */
                         /* `byte` is read as byte[0], byte[1] and byte[2] and is expanded
                            several times, so it must be free of side effects; the users in this
                            file pass a const unsigned char *.  The result is nonzero when the
                            selected bit of namingBitmap is set. */
                     60: #define UTF8_GET_NAMING3(pages, byte) \
                     61:   (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                     62:                              + ((((byte)[1]) >> 2) & 0xF)] \
                     63:                        << 3) \
                     64:                       + ((((byte)[1]) & 3) << 1) \
                     65:                       + ((((byte)[2]) >> 5) & 1)] \
1.1.1.3   spz        66:          & (1u << (((byte)[2]) & 0x1F)))
1.1       tron       67:
                         /* Dispatch on the sequence length n: n == 2 uses UTF8_GET_NAMING2,
                            n == 3 uses UTF8_GET_NAMING3, and any other n yields 0.  `p` is
                            cast to const unsigned char * before being handed on. */
                     68: #define UTF8_GET_NAMING(pages, p, n) \
                     69:   ((n) == 2 \
                     70:   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
                     71:   : ((n) == 3 \
                     72:      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
                     73:      : 0))
                     74:
                     75: /* Detection of invalid UTF-8 sequences is based on Table 3.1B
                     76:    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
                     77:    with the additional restriction of not allowing the Unicode
                     78:    code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
                     79:    Implementation details:
                     80:      (A & 0x80) == 0     means A < 0x80
                     81:    and
                     82:      (A & 0xC0) == 0xC0  means A > 0xBF
                     83: */
                     84:
                         /* All three macros evaluate to nonzero for an invalid sequence.  The
                            comparisons against constants above 0x7F only work when `p` points
                            to unsigned bytes; the wrapper functions below cast accordingly.
                            `p` is expanded several times and `*p` is used unparenthesized.

                            UTF8_INVALID2: lead byte below 0xC2, or second byte outside
                            0x80..0xBF. */
                     85: #define UTF8_INVALID2(p) \
                     86:   ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
                     87:
                         /* UTF8_INVALID3: the third byte must be 0x80..0xBF, tightened to at
                            most 0xBD after EF BF (this rejects U+FFFE and U+FFFF).  The second
                            byte must be 0xA0..0xBF after E0, 0x80..0x9F after ED, and
                            0x80..0xBF after any other lead byte. */
                     88: #define UTF8_INVALID3(p) \
                     89:   (((p)[2] & 0x80) == 0 \
                     90:   || \
                     91:   ((*p) == 0xEF && (p)[1] == 0xBF \
                     92:     ? \
                     93:     (p)[2] > 0xBD \
                     94:     : \
                     95:     ((p)[2] & 0xC0) == 0xC0) \
                     96:   || \
                     97:   ((*p) == 0xE0 \
                     98:     ? \
                     99:     (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
                    100:     : \
                    101:     ((p)[1] & 0x80) == 0 \
                    102:     || \
                    103:     ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
                    104:
                         /* UTF8_INVALID4: the third and fourth bytes must be 0x80..0xBF.  The
                            second byte must be 0x90..0xBF after F0, 0x80..0x8F after F4, and
                            0x80..0xBF after any other lead byte.  The lead byte itself is not
                            range-checked here. */
                    105: #define UTF8_INVALID4(p) \
                    106:   (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
                    107:   || \
                    108:   ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
                    109:   || \
                    110:   ((*p) == 0xF0 \
                    111:     ? \
                    112:     (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
                    113:     : \
                    114:     ((p)[1] & 0x80) == 0 \
                    115:     || \
                    116:     ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
                    117:
                    118: static int PTRFASTCALL
1.1.1.3   spz       119: isNever(const ENCODING *UNUSED_P(enc), const char *UNUSED_P(p))
1.1       tron      120: {
                    121:   return 0;
                    122: }
                    123:
                    124: static int PTRFASTCALL
1.1.1.3   spz       125: utf8_isName2(const ENCODING *UNUSED_P(enc), const char *p)
1.1       tron      126: {
                    127:   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
                    128: }
                    129:
                    130: static int PTRFASTCALL
1.1.1.3   spz       131: utf8_isName3(const ENCODING *UNUSED_P(enc), const char *p)
1.1       tron      132: {
                    133:   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
                    134: }
                    135:
                         /* The isName4 slot produced by NORMAL_VTABLE(utf8_) resolves to
                            isNever, so 4-byte sequences are never reported as name
                            characters. */
                    136: #define utf8_isName4 isNever
                    137:
                    138: static int PTRFASTCALL
1.1.1.3   spz       139: utf8_isNmstrt2(const ENCODING *UNUSED_P(enc), const char *p)
1.1       tron      140: {
                    141:   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
                    142: }
                    143:
                    144: static int PTRFASTCALL
1.1.1.3   spz       145: utf8_isNmstrt3(const ENCODING *UNUSED_P(enc), const char *p)
1.1       tron      146: {
                    147:   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
                    148: }
                    149:
                         /* The isNmstrt4 slot produced by NORMAL_VTABLE(utf8_) resolves to
                            isNever, so 4-byte sequences are never reported as name-start
                            characters. */
                    150: #define utf8_isNmstrt4 isNever
                    151:
                    152: static int PTRFASTCALL
1.1.1.3   spz       153: utf8_isInvalid2(const ENCODING *UNUSED_P(enc), const char *p)
1.1       tron      154: {
                    155:   return UTF8_INVALID2((const unsigned char *)p);
                    156: }
                    157:
                    158: static int PTRFASTCALL
1.1.1.3   spz       159: utf8_isInvalid3(const ENCODING *UNUSED_P(enc), const char *p)
1.1       tron      160: {
                    161:   return UTF8_INVALID3((const unsigned char *)p);
                    162: }
                    163:
                    164: static int PTRFASTCALL
1.1.1.3   spz       165: utf8_isInvalid4(const ENCODING *UNUSED_P(enc), const char *p)
1.1       tron      166: {
                    167:   return UTF8_INVALID4((const unsigned char *)p);
                    168: }
                    169:
                         /* An ENCODING extended with a per-byte type table and per-encoding
                            character predicates.  `enc` is the first member, which is what
                            allows the ENCODING * <-> struct normal_encoding * casts used in
                            this file (AS_NORMAL_ENCODING, SB_BYTE_TYPE).  Objects of this type
                            are filled in positionally, so member order must stay in step with
                            STANDARD_VTABLE, NORMAL_VTABLE and NULL_VTABLE. */
                    170: struct normal_encoding {
                    171:   ENCODING enc;
                         /* byte type, indexed by the byte value as unsigned char */
                    172:   unsigned char type[256];
                         /* Present only in XML_MIN_SIZE builds; supplied by
                            STANDARD_VTABLE. */
                    173: #ifdef XML_MIN_SIZE
                    174:   int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
                    175:   int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
                    176:   int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
                    177:   int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
                    178:   int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
                    179: #endif /* XML_MIN_SIZE */
                         /* Name, name-start and invalid-sequence predicates for 2-, 3- and
                            4-byte sequences; reached through IS_NAME_CHAR, IS_NMSTRT_CHAR and
                            IS_INVALID_CHAR. */
                    180:   int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
                    181:   int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
                    182:   int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
                    183:   int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
                    184:   int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
                    185:   int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
                    186:   int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
                    187:   int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
                    188:   int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
                    189: };
                    190:
                         /* View an ENCODING pointer as the struct normal_encoding that
                            contains it.  Only meaningful when `enc` really is the `enc` member
                            of a struct normal_encoding. */
                    191: #define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))
                    192:
                         /* Initializers for the XML_MIN_SIZE-only members of struct
                            normal_encoding, built by pasting the prefix E onto each member
                            name.  The expansion ends with a comma so that it can be followed
                            directly by NORMAL_VTABLE or NULL_VTABLE.  Without XML_MIN_SIZE the
                            members do not exist and the macro expands to nothing. */
                    193: #ifdef XML_MIN_SIZE
                    194:
                    195: #define STANDARD_VTABLE(E) \
                    196:  E ## byteType, \
                    197:  E ## isNameMin, \
                    198:  E ## isNmstrtMin, \
                    199:  E ## byteToAscii, \
                    200:  E ## charMatches,
                    201:
                    202: #else
                    203:
                    204: #define STANDARD_VTABLE(E) /* as nothing */
                    205:
                    206: #endif
                    207:
                         /* Initializers for the nine multi-byte predicate members of struct
                            normal_encoding, in declaration order, built by pasting the prefix
                            E onto each member name (e.g. utf8_ gives utf8_isName2, ...). */
                    208: #define NORMAL_VTABLE(E) \
                    209:  E ## isName2, \
                    210:  E ## isName3, \
                    211:  E ## isName4, \
                    212:  E ## isNmstrt2, \
                    213:  E ## isNmstrt3, \
                    214:  E ## isNmstrt4, \
                    215:  E ## isInvalid2, \
                    216:  E ## isInvalid3, \
                    217:  E ## isInvalid4
                    218:
                         /* Counterpart of NORMAL_VTABLE that sets all nine multi-byte
                            predicate members to NULL.  Used below by the latin1 and ascii
                            encoding objects. */
1.1.1.3   spz       219: #define NULL_VTABLE \
                    220:  /* isName2 */ NULL, \
                    221:  /* isName3 */ NULL, \
                    222:  /* isName4 */ NULL, \
                    223:  /* isNmstrt2 */ NULL, \
                    224:  /* isNmstrt3 */ NULL, \
                    225:  /* isNmstrt4 */ NULL, \
                    226:  /* isInvalid2 */ NULL, \
                    227:  /* isInvalid3 */ NULL, \
                    228:  /* isInvalid4 */ NULL
                    229:
                         /* Forward declaration, placed ahead of the xmltok_impl.h /
                            xmltok_impl.c inclusions that follow.  The definition is not in
                            this part of the file. */
1.1       tron      230: static int FASTCALL checkCharRefNumber(int);
                    231:
                    232: #include "xmltok_impl.h"
                    233: #include "ascii.h"
                    234:
                         /* In XML_MIN_SIZE builds the sb_isNameMin / sb_isNmstrtMin slots
                            named by STANDARD_VTABLE(sb_) resolve to isNever. */
                    235: #ifdef XML_MIN_SIZE
                    236: #define sb_isNameMin isNever
                    237: #define sb_isNmstrtMin isNever
                    238: #endif
                    239:
                         /* MINBPC(enc): read from the encoding object in XML_MIN_SIZE builds,
                            the constant 1 otherwise. */
                    240: #ifdef XML_MIN_SIZE
                    241: #define MINBPC(enc) ((enc)->minBytesPerChar)
                    242: #else
                    243: /* minimum bytes per character */
                    244: #define MINBPC(enc) 1
                    245: #endif
                    246:
                         /* Byte type of *p, looked up in the type[] table of the struct
                            normal_encoding that `enc` is cast to.  The byte is converted to
                            unsigned char before indexing. */
                    247: #define SB_BYTE_TYPE(enc, p) \
                    248:   (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
                    249:
                         /* BYTE_TYPE: in XML_MIN_SIZE builds an indirect call through the
                            byteType member (sb_byteType is the function form of
                            SB_BYTE_TYPE); otherwise SB_BYTE_TYPE is used directly. */
                    250: #ifdef XML_MIN_SIZE
                    251: static int PTRFASTCALL
                    252: sb_byteType(const ENCODING *enc, const char *p)
                    253: {
                    254:   return SB_BYTE_TYPE(enc, p);
                    255: }
                    256: #define BYTE_TYPE(enc, p) \
                    257:  (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
                    258: #else
                    259: #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
                    260: #endif
                    261:
                         /* BYTE_TO_ASCII: in XML_MIN_SIZE builds an indirect call through the
                            byteToAscii member; otherwise simply *(p).  sb_byteToAscii is the
                            function form and returns *p converted to int; enc is not read. */
                    262: #ifdef XML_MIN_SIZE
                    263: #define BYTE_TO_ASCII(enc, p) \
                    264:  (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
                    265: static int PTRFASTCALL
                    266: sb_byteToAscii(const ENCODING *enc, const char *p)
                    267: {
                    268:   return *p;
                    269: }
                    270: #else
                    271: #define BYTE_TO_ASCII(enc, p) (*(p))
                    272: #endif
                    273:
                         /* Multi-byte predicates: n (2, 3 or 4) is pasted onto the member
                            name, so IS_NAME_CHAR(enc, p, 2) calls the isName2 member of the
                            struct normal_encoding behind enc, and likewise for the others.
                            n must therefore be a literal token, not an expression. */
                    274: #define IS_NAME_CHAR(enc, p, n) \
                    275:  (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
                    276: #define IS_NMSTRT_CHAR(enc, p, n) \
                    277:  (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
                    278: #define IS_INVALID_CHAR(enc, p, n) \
                    279:  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
                    280:
                         /* Name / name-start predicates for minimum-width characters: an
                            indirect call through isNameMin / isNmstrtMin in XML_MIN_SIZE
                            builds, the constant 0 otherwise. */
                    281: #ifdef XML_MIN_SIZE
                    282: #define IS_NAME_CHAR_MINBPC(enc, p) \
                    283:  (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
                    284: #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
                    285:  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
                    286: #else
                    287: #define IS_NAME_CHAR_MINBPC(enc, p) (0)
                    288: #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
                    289: #endif
                    290:
                         /* CHAR_MATCHES: true when the byte at p equals c.  In XML_MIN_SIZE
                            builds this is an indirect call through the charMatches member
                            (sb_charMatches is the function form; enc is not read); otherwise
                            it is a direct comparison.  Note that `c` is used unparenthesized
                            in the direct form. */
                    291: #ifdef XML_MIN_SIZE
                    292: #define CHAR_MATCHES(enc, p, c) \
                    293:  (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
                    294: static int PTRCALL
                    295: sb_charMatches(const ENCODING *enc, const char *p, int c)
                    296: {
                    297:   return *p == c;
                    298: }
                    299: #else
                    300: /* c is an ASCII character */
                    301: #define CHAR_MATCHES(enc, p, c) (*(p) == c)
                    302: #endif
                    303:
                         /* Include xmltok_impl.c with PREFIX(ident) mapping to normal_ident
                            and with the helper macros defined above in effect.  This is
                            where the normal_* functions named by VTABLE1 are expected to come
                            from (the included file is not visible here).  XML_TOK_IMPL_C is
                            defined only for the duration of the inclusion. */
                    304: #define PREFIX(ident) normal_ ## ident
                    305: #define XML_TOK_IMPL_C
                    306: #include "xmltok_impl.c"
                    307: #undef XML_TOK_IMPL_C
                    308:
                         /* The per-encoding helper macros are removed again; PREFIX itself
                            stays defined and is still needed by the VTABLE1 expansions in the
                            encoding objects below. */
                    309: #undef MINBPC
                    310: #undef BYTE_TYPE
                    311: #undef BYTE_TO_ASCII
                    312: #undef CHAR_MATCHES
                    313: #undef IS_NAME_CHAR
                    314: #undef IS_NAME_CHAR_MINBPC
                    315: #undef IS_NMSTRT_CHAR
                    316: #undef IS_NMSTRT_CHAR_MINBPC
                    317: #undef IS_INVALID_CHAR
                    318:
                         /* Lead-byte marker bits for UTF-8 sequences of 1 to 4 bytes; OR-ed
                            into the first output byte by the *_toUtf8 converters (for example
                            UTF8_cval2 in latin1_toUtf8). */
                    319: enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
                    320:   UTF8_cval1 = 0x00,
                    321:   UTF8_cval2 = 0xc0,
                    322:   UTF8_cval3 = 0xe0,
                    323:   UTF8_cval4 = 0xf0
                    324: };
                    325:
                         /* Move *fromLimRef backwards, if necessary, so that the range
                            [from, *fromLimRef) does not end in a truncated multi-byte UTF-8
                            sequence.

                            The range is scanned backwards from its end.  A single-byte
                            character (0xxxxxxx) stops the scan with the limit just after it.
                            A lead byte stops the scan when at least as many bytes as its
                            sequence length requires are available from it onwards; the limit
                            is then set just past that sequence.  A lead byte with too few
                            bytes after it is dropped and the scan continues.  If the scan
                            reaches `from`, the limit becomes `from`.

                            from        start of the data
                            fromLimRef  in: current end of the data; out: adjusted end */
1.1.1.3   spz       326: void
                    327: align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef)
                    328: {
                    329:   const char * fromLim = *fromLimRef;
                         /* bytes stepped over since the start of the scan or since the last
                            reset; walked + 1 is the byte count from `prev` onwards */
                    330:   size_t walked = 0;
                    331:   for (; fromLim > from; fromLim--, walked++) {
                    332:     const unsigned char prev = (unsigned char)fromLim[-1];
                    333:     if ((prev & 0xf8u) == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */
                    334:       if (walked + 1 >= 4) {
                         /* fromLim is one past the lead byte, so this lands just past the
                            complete 4-byte sequence */
                    335:         fromLim += 4 - 1;
                    336:         break;
                    337:       } else {
                         /* incomplete: drop this lead byte and keep scanning.
                            NOTE(review): the loop increment still runs after the reset, so
                            the dropped lead byte is counted as a following byte of any
                            earlier lead byte (e.g. C3 C3 is left untrimmed) -- confirm this
                            only affects input that is already malformed. */
                    338:         walked = 0;
                    339:       }
                    340:     } else if ((prev & 0xf0u) == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
                    341:       if (walked + 1 >= 3) {
                    342:         fromLim += 3 - 1;
                    343:         break;
                    344:       } else {
                    345:         walked = 0;
                    346:       }
                    347:     } else if ((prev & 0xe0u) == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
                    348:       if (walked + 1 >= 2) {
                    349:         fromLim += 2 - 1;
                    350:         break;
                    351:       } else {
                    352:         walked = 0;
                    353:       }
                    354:     } else if ((prev & 0x80u) == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
                    355:       break;
                    356:     }
                         /* any other byte value is simply stepped over */
                    357:   }
                    358:   *fromLimRef = fromLim;
                    359: }
                    360:
                    361: static enum XML_Convert_Result PTRCALL
                    362: utf8_toUtf8(const ENCODING *UNUSED_P(enc),
1.1       tron      363:             const char **fromP, const char *fromLim,
                    364:             char **toP, const char *toLim)
                    365: {
                    366:   char *to;
                    367:   const char *from;
1.1.1.4 ! christos  368:   const char *fromLimInitial = fromLim;
        !           369:
        !           370:   /* Avoid copying partial characters. */
        !           371:   align_limit_to_full_utf8_characters(*fromP, &fromLim);
        !           372:
1.1.1.3   spz       373:   for (to = *toP, from = *fromP; (from < fromLim) && (to < toLim); from++, to++)
1.1       tron      374:     *to = *from;
                    375:   *fromP = from;
                    376:   *toP = to;
1.1.1.3   spz       377:
1.1.1.4 ! christos  378:   if (fromLim < fromLimInitial)
        !           379:     return XML_CONVERT_INPUT_INCOMPLETE;
        !           380:   else if ((to == toLim) && (from < fromLim))
1.1.1.3   spz       381:     return XML_CONVERT_OUTPUT_EXHAUSTED;
                    382:   else
1.1.1.4 ! christos  383:     return XML_CONVERT_COMPLETED;
1.1       tron      384: }
                    385:
                         /* Convert UTF-8 input to UTF-16 code units.

                            Each input character is classified through the type[] table of
                            the struct normal_encoding behind enc.  2- and 3-byte sequences
                            yield one code unit, 4-byte sequences yield a surrogate pair, and
                            any other byte is copied through as a single code unit.

                            On return *fromP and *toP are advanced past what was converted.
                            Returns XML_CONVERT_INPUT_INCOMPLETE when a multi-byte sequence is
                            cut off by fromLim, XML_CONVERT_OUTPUT_EXHAUSTED when input
                            remains but the output is full (or has room for only one unit of
                            a surrogate pair), and XML_CONVERT_COMPLETED otherwise. */
1.1.1.3   spz       386: static enum XML_Convert_Result PTRCALL
1.1       tron      387: utf8_toUtf16(const ENCODING *enc,
                    388:              const char **fromP, const char *fromLim,
                    389:              unsigned short **toP, const unsigned short *toLim)
                    390: {
1.1.1.3   spz       391:   enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
1.1       tron      392:   unsigned short *to = *toP;
                    393:   const char *from = *fromP;
1.1.1.3   spz       394:   while (from < fromLim && to < toLim) {
1.1       tron      395:     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
                    396:     case BT_LEAD2:
1.1.1.3   spz       397:       if (fromLim - from < 2) {
                    398:         res = XML_CONVERT_INPUT_INCOMPLETE;
1.1.1.4 ! christos  399:         goto after;
1.1.1.3   spz       400:       }
                         /* 5 payload bits from the lead byte, 6 from the trail byte */
1.1       tron      401:       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
                    402:       from += 2;
                    403:       break;
                    404:     case BT_LEAD3:
1.1.1.3   spz       405:       if (fromLim - from < 3) {
                    406:         res = XML_CONVERT_INPUT_INCOMPLETE;
1.1.1.4 ! christos  407:         goto after;
1.1.1.3   spz       408:       }
                         /* 4 + 6 + 6 payload bits */
1.1       tron      409:       *to++ = (unsigned short)(((from[0] & 0xf) << 12)
                    410:                                | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
                    411:       from += 3;
                    412:       break;
                    413:     case BT_LEAD4:
                    414:       {
                    415:         unsigned long n;
                         /* a surrogate pair needs two output units; the output check comes
                            before the input check */
1.1.1.3   spz       416:         if (toLim - to < 2) {
                    417:           res = XML_CONVERT_OUTPUT_EXHAUSTED;
1.1       tron      418:           goto after;
1.1.1.3   spz       419:         }
                    420:         if (fromLim - from < 4) {
                    421:           res = XML_CONVERT_INPUT_INCOMPLETE;
                    422:           goto after;
                    423:         }
                         /* 3 + 6 + 6 + 6 payload bits give the code point; subtracting
                            0x10000 leaves the 20 bits split across the two surrogates */
1.1       tron      424:         n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
                    425:             | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
                    426:         n -= 0x10000;
                    427:         to[0] = (unsigned short)((n >> 10) | 0xD800);
                    428:         to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
                    429:         to += 2;
                    430:         from += 4;
                    431:       }
                    432:       break;
                    433:     default:
                         /* every other byte type: copy the byte as one code unit */
                    434:       *to++ = *from++;
                    435:       break;
                    436:     }
                    437:   }
                         /* reached only when the loop condition failed; leftover input then
                            means the output buffer is full */
1.1.1.4 ! christos  438:   if (from < fromLim)
        !           439:     res = XML_CONVERT_OUTPUT_EXHAUSTED;
1.1       tron      440: after:
                    441:   *fromP = from;
                    442:   *toP = to;
1.1.1.3   spz       443:   return res;
1.1       tron      444: }
                    445:
                         /* UTF-8 encoding object for XML_NS builds.  The ENCODING part uses
                            the normal_* functions (VTABLE1 with PREFIX = normal_) plus the
                            utf8 converters; the three trailing integers presumably fill the
                            remaining ENCODING fields declared in xmltok.h -- TODO confirm
                            their meaning there.  The type table is taken from asciitab.h
                            followed by utf8tab.h, with BT_COLON left as it is. */
                    446: #ifdef XML_NS
                    447: static const struct normal_encoding utf8_encoding_ns = {
                    448:   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
                    449:   {
                    450: #include "asciitab.h"
                    451: #include "utf8tab.h"
                    452:   },
                    453:   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
                    454: };
                    455: #endif
                    456:
                         /* UTF-8 encoding object.  Same layout as the _ns variant, except
                            that BT_COLON is defined as BT_NMSTRT while asciitab.h is
                            included, so every BT_COLON entry in that table becomes
                            BT_NMSTRT. */
                    457: static const struct normal_encoding utf8_encoding = {
                    458:   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
                    459:   {
                    460: #define BT_COLON BT_NMSTRT
                    461: #include "asciitab.h"
                    462: #undef BT_COLON
                    463: #include "utf8tab.h"
                    464:   },
                    465:   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
                    466: };
                    467:
                         /* "Internal" UTF-8 encoding object for XML_NS builds.  It differs
                            from utf8_encoding_ns only in taking the first part of the type
                            table from iasciitab.h instead of asciitab.h (the difference
                            between those tables is not visible here). */
                    468: #ifdef XML_NS
                    469:
                    470: static const struct normal_encoding internal_utf8_encoding_ns = {
                    471:   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
                    472:   {
                    473: #include "iasciitab.h"
                    474: #include "utf8tab.h"
                    475:   },
                    476:   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
                    477: };
                    478:
                    479: #endif
                    480:
                         /* "Internal" UTF-8 encoding object: iasciitab.h with BT_COLON
                            mapped to BT_NMSTRT, followed by utf8tab.h. */
                    481: static const struct normal_encoding internal_utf8_encoding = {
                    482:   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
                    483:   {
                    484: #define BT_COLON BT_NMSTRT
                    485: #include "iasciitab.h"
                    486: #undef BT_COLON
                    487: #include "utf8tab.h"
                    488:   },
                    489:   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
                    490: };
                    491:
1.1.1.3   spz       492: static enum XML_Convert_Result PTRCALL
                    493: latin1_toUtf8(const ENCODING *UNUSED_P(enc),
1.1       tron      494:               const char **fromP, const char *fromLim,
                    495:               char **toP, const char *toLim)
                    496: {
                    497:   for (;;) {
                    498:     unsigned char c;
                    499:     if (*fromP == fromLim)
1.1.1.3   spz       500:       return XML_CONVERT_COMPLETED;
1.1       tron      501:     c = (unsigned char)**fromP;
                    502:     if (c & 0x80) {
                    503:       if (toLim - *toP < 2)
1.1.1.3   spz       504:         return XML_CONVERT_OUTPUT_EXHAUSTED;
1.1       tron      505:       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
                    506:       *(*toP)++ = (char)((c & 0x3f) | 0x80);
                    507:       (*fromP)++;
                    508:     }
                    509:     else {
                    510:       if (*toP == toLim)
1.1.1.3   spz       511:         return XML_CONVERT_OUTPUT_EXHAUSTED;
1.1       tron      512:       *(*toP)++ = *(*fromP)++;
                    513:     }
                    514:   }
                    515: }
                    516:
1.1.1.3   spz       517: static enum XML_Convert_Result PTRCALL
                    518: latin1_toUtf16(const ENCODING *UNUSED_P(enc),
1.1       tron      519:                const char **fromP, const char *fromLim,
                    520:                unsigned short **toP, const unsigned short *toLim)
                    521: {
1.1.1.3   spz       522:   while (*fromP < fromLim && *toP < toLim)
1.1       tron      523:     *(*toP)++ = (unsigned char)*(*fromP)++;
1.1.1.3   spz       524:
                    525:   if ((*toP == toLim) && (*fromP < fromLim))
                    526:     return XML_CONVERT_OUTPUT_EXHAUSTED;
                    527:   else
                    528:     return XML_CONVERT_COMPLETED;
1.1       tron      529: }
                    530:
                         /* Latin-1 encoding object for XML_NS builds: normal_* functions
                            with the latin1 converters, a type table from asciitab.h followed
                            by latin1tab.h, and NULL for all multi-byte predicates.  The
                            trailing integers of the ENCODING part presumably fill fields
                            declared in xmltok.h -- TODO confirm their meaning there. */
                    531: #ifdef XML_NS
                    532:
                    533: static const struct normal_encoding latin1_encoding_ns = {
                    534:   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
                    535:   {
                    536: #include "asciitab.h"
                    537: #include "latin1tab.h"
                    538:   },
1.1.1.3   spz       539:   STANDARD_VTABLE(sb_) NULL_VTABLE
1.1       tron      540: };
                    541:
                    542: #endif
                    543:
                         /* Latin-1 encoding object: as the _ns variant, but with BT_COLON
                            mapped to BT_NMSTRT while asciitab.h is included. */
                    544: static const struct normal_encoding latin1_encoding = {
                    545:   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
                    546:   {
                    547: #define BT_COLON BT_NMSTRT
                    548: #include "asciitab.h"
                    549: #undef BT_COLON
                    550: #include "latin1tab.h"
                    551:   },
1.1.1.3   spz       552:   STANDARD_VTABLE(sb_) NULL_VTABLE
1.1       tron      553: };
                    554:
1.1.1.3   spz       555: static enum XML_Convert_Result PTRCALL
                    556: ascii_toUtf8(const ENCODING *UNUSED_P(enc),
1.1       tron      557:              const char **fromP, const char *fromLim,
                    558:              char **toP, const char *toLim)
                    559: {
1.1.1.3   spz       560:   while (*fromP < fromLim && *toP < toLim)
1.1       tron      561:     *(*toP)++ = *(*fromP)++;
1.1.1.3   spz       562:
                    563:   if ((*toP == toLim) && (*fromP < fromLim))
                    564:     return XML_CONVERT_OUTPUT_EXHAUSTED;
                    565:   else
                    566:     return XML_CONVERT_COMPLETED;
1.1       tron      567: }
                    568:
                         /* ASCII encoding object for XML_NS builds: ascii_toUtf8 paired with
                            latin1_toUtf16, and NULL for all multi-byte predicates.  Only
                            asciitab.h is included in the type table; the entries it does not
                            cover are zero-initialized, which the comment below equates with
                            BT_NONXML. */
                    569: #ifdef XML_NS
                    570:
                    571: static const struct normal_encoding ascii_encoding_ns = {
                    572:   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
                    573:   {
                    574: #include "asciitab.h"
                    575: /* BT_NONXML == 0 */
                    576:   },
1.1.1.3   spz       577:   STANDARD_VTABLE(sb_) NULL_VTABLE
1.1       tron      578: };
                    579:
                    580: #endif
                    581:
                         /* ASCII encoding object: as the _ns variant, but with BT_COLON
                            mapped to BT_NMSTRT while asciitab.h is included. */
                    582: static const struct normal_encoding ascii_encoding = {
                    583:   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
                    584:   {
                    585: #define BT_COLON BT_NMSTRT
                    586: #include "asciitab.h"
                    587: #undef BT_COLON
                    588: /* BT_NONXML == 0 */
                    589:   },
1.1.1.3   spz       590:   STANDARD_VTABLE(sb_) NULL_VTABLE
1.1       tron      591: };
                    592:
                    593: static int PTRFASTCALL
                    594: unicode_byte_type(char hi, char lo)
                    595: {
                    596:   switch ((unsigned char)hi) {
                    597:   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
                    598:     return BT_LEAD4;
                    599:   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
                    600:     return BT_TRAIL;
                    601:   case 0xFF:
                    602:     switch ((unsigned char)lo) {
                    603:     case 0xFF:
                    604:     case 0xFE:
                    605:       return BT_NONXML;
                    606:     }
                    607:     break;
                    608:   }
                    609:   return BT_NONASCII;
                    610: }
                    611:
                         /* DEFINE_UTF16_TO_UTF8(E) expands to the function E##toUtf8, which
                            converts UTF-16 input to UTF-8.  Byte order is taken from the
                            GET_LO and GET_HI macros in effect where the macro is expanded.

                            Input is *fromP up to fromLim; fromLim is first rounded down so
                            that an even number of bytes is processed.  Output is *toP up to
                            toLim.  *toP is advanced as bytes are written and *fromP is
                            updated before every return.

                            Each 16-bit unit is dispatched on its high byte:
                              0x00 with low byte < 0x80   one output byte
                              0x00-0x07 otherwise         two output bytes
                              0xD8-0xDB                   the next unit is consumed as well
                                                          and four bytes are written; that
                                                          second unit is not checked to be
                                                          in 0xDC00-0xDFFF
                              anything else               three output bytes (this includes
                                                          0xDC-0xDF units met on their own)

                            Returns XML_CONVERT_OUTPUT_EXHAUSTED when the bytes for the next
                            character do not fit before toLim, XML_CONVERT_INPUT_INCOMPLETE
                            when a 0xD8-0xDB unit has fewer than four input bytes left
                            (the output check is made first), and XML_CONVERT_COMPLETED
                            otherwise.  In the two early-return cases *fromP points at the
                            unit that was not converted.

                            NOTE(review): the loop only ends once from has reached the
                            rounded-down fromLim, so the final "from < fromLim" test looks
                            like it can never be true - confirm before relying on it.
                         */
                    612: #define DEFINE_UTF16_TO_UTF8(E) \
1.1.1.3   spz       613: static enum XML_Convert_Result  PTRCALL \
                    614: E ## toUtf8(const ENCODING *UNUSED_P(enc), \
1.1       tron      615:             const char **fromP, const char *fromLim, \
                    616:             char **toP, const char *toLim) \
                    617: { \
1.1.1.3   spz       618:   const char *from = *fromP; \
                    619:   fromLim = from + (((fromLim - from) >> 1) << 1);  /* shrink to even */ \
                    620:   for (; from < fromLim; from += 2) { \
1.1       tron      621:     int plane; \
                    622:     unsigned char lo2; \
                    623:     unsigned char lo = GET_LO(from); \
                    624:     unsigned char hi = GET_HI(from); \
                    625:     switch (hi) { \
                    626:     case 0: \
                    627:       if (lo < 0x80) { \
                    628:         if (*toP == toLim) { \
                    629:           *fromP = from; \
1.1.1.3   spz       630:           return XML_CONVERT_OUTPUT_EXHAUSTED; \
1.1       tron      631:         } \
                    632:         *(*toP)++ = lo; \
                    633:         break; \
                    634:       } \
                    635:       /* fall through */ \
                    636:     case 0x1: case 0x2: case 0x3: \
                    637:     case 0x4: case 0x5: case 0x6: case 0x7: \
                    638:       if (toLim -  *toP < 2) { \
                    639:         *fromP = from; \
1.1.1.3   spz       640:         return XML_CONVERT_OUTPUT_EXHAUSTED; \
1.1       tron      641:       } \
                    642:       *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
                    643:       *(*toP)++ = ((lo & 0x3f) | 0x80); \
                    644:       break; \
                    645:     default: \
                    646:       if (toLim -  *toP < 3)  { \
                    647:         *fromP = from; \
1.1.1.3   spz       648:         return XML_CONVERT_OUTPUT_EXHAUSTED; \
1.1       tron      649:       } \
                    650:       /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
                    651:       *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
                    652:       *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
                    653:       *(*toP)++ = ((lo & 0x3f) | 0x80); \
                    654:       break; \
                    655:     case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
                    656:       if (toLim -  *toP < 4) { \
                    657:         *fromP = from; \
1.1.1.3   spz       658:         return XML_CONVERT_OUTPUT_EXHAUSTED; \
                    659:       } \
                    660:       if (fromLim - from < 4) { \
                    661:         *fromP = from; \
                    662:         return XML_CONVERT_INPUT_INCOMPLETE; \
1.1       tron      663:       } \
                    664:       plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
                    665:       *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
                    666:       *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
                    667:       from += 2; \
                    668:       lo2 = GET_LO(from); \
                    669:       *(*toP)++ = (((lo & 0x3) << 4) \
                    670:                    | ((GET_HI(from) & 0x3) << 2) \
                    671:                    | (lo2 >> 6) \
                    672:                    | 0x80); \
                    673:       *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
                    674:       break; \
                    675:     } \
                    676:   } \
                    677:   *fromP = from; \
1.1.1.3   spz       678:   if (from < fromLim) \
                    679:     return XML_CONVERT_INPUT_INCOMPLETE; \
                    680:   else \
                    681:     return XML_CONVERT_COMPLETED; \
1.1       tron      682: }
                    683:
                         /* DEFINE_UTF16_TO_UTF16(E) expands to the function E##toUtf16, which
                            copies UTF-16 input, read with the GET_HI and GET_LO macros in
                            effect at the point of expansion, into an array of unsigned short
                            code units.

                            Input is *fromP up to fromLim (rounded down to an even number of
                            bytes); output is *toP up to toLim.  Both *fromP and *toP are
                            advanced as units are copied.

                            When the input holds more units than the output has room for and
                            the high byte of the last input unit is in 0xD8-0xDF, that last
                            unit is held back and the pending result becomes
                            XML_CONVERT_INPUT_INCOMPLETE.

                            Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while
                            input (up to the possibly shortened fromLim) remains; otherwise
                            XML_CONVERT_INPUT_INCOMPLETE if a unit was held back, else
                            XML_CONVERT_COMPLETED.
                         */
                    684: #define DEFINE_UTF16_TO_UTF16(E) \
1.1.1.3   spz       685: static enum XML_Convert_Result  PTRCALL \
                    686: E ## toUtf16(const ENCODING *UNUSED_P(enc), \
1.1       tron      687:              const char **fromP, const char *fromLim, \
                    688:              unsigned short **toP, const unsigned short *toLim) \
                    689: { \
1.1.1.3   spz       690:   enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \
                    691:   fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1);  /* shrink to even */ \
1.1       tron      692:   /* Avoid copying first half only of surrogate */ \
                    693:   if (fromLim - *fromP > ((toLim - *toP) << 1) \
1.1.1.3   spz       694:       && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
1.1       tron      695:     fromLim -= 2; \
1.1.1.3   spz       696:     res = XML_CONVERT_INPUT_INCOMPLETE; \
                    697:   } \
                    698:   for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \
1.1       tron      699:     *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
1.1.1.3   spz       700:   if ((*toP == toLim) && (*fromP < fromLim)) \
                    701:     return XML_CONVERT_OUTPUT_EXHAUSTED; \
                    702:   else \
                    703:     return res; \
1.1       tron      704: }
                    705:
                         /* Little-endian accessors for a two-byte unit at ptr: byte 0 is the
                            low byte and byte 1 the high byte.  SET2 stores the 16-bit value
                            ch in that order.  The definitions are in effect only while the
                            little2_ converters are instantiated, then removed again. */
                    706: #define SET2(ptr, ch) \
                    707:   (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
                    708: #define GET_LO(ptr) ((unsigned char)(ptr)[0])
                    709: #define GET_HI(ptr) ((unsigned char)(ptr)[1])
                    710:
                         /* Instantiates little2_toUtf8 and little2_toUtf16. */
                    711: DEFINE_UTF16_TO_UTF8(little2_)
                    712: DEFINE_UTF16_TO_UTF16(little2_)
                    713:
                    714: #undef SET2
                    715: #undef GET_LO
                    716: #undef GET_HI
                    717:
                         /* Big-endian accessors: byte 0 is the high byte and byte 1 the low
                            byte.  Same life cycle as above, this time for the big2_
                            converters. */
                    718: #define SET2(ptr, ch) \
                    719:   (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
                    720: #define GET_LO(ptr) ((unsigned char)(ptr)[1])
                    721: #define GET_HI(ptr) ((unsigned char)(ptr)[0])
                    722:
                         /* Instantiates big2_toUtf8 and big2_toUtf16. */
                    723: DEFINE_UTF16_TO_UTF8(big2_)
                    724: DEFINE_UTF16_TO_UTF16(big2_)
                    725:
                    726: #undef SET2
                    727: #undef GET_LO
                    728: #undef GET_HI
                    729:
                         /* Per-character helpers for little-endian UTF-16.  p points at a
                            two-byte unit: p[0] is the low byte and p[1] the high byte.

                            LITTLE2_BYTE_TYPE
                              High byte 0: look the low byte up in the type[] table of enc
                              (cast to struct normal_encoding).  Otherwise classify the unit
                              with unicode_byte_type(high, low).
                            LITTLE2_BYTE_TO_ASCII
                              The low byte when the high byte is 0, else -1.
                            LITTLE2_CHAR_MATCHES
                              Non-zero when the high byte is 0 and the low byte compares
                              equal to c.
                            LITTLE2_IS_NAME_CHAR_MINBPC, LITTLE2_IS_NMSTRT_CHAR_MINBPC
                              UCS2_GET_NAMING lookup in namePages / nmstrtPages, keyed by
                              the high byte and then the low byte.

                            NOTE(review): c in LITTLE2_CHAR_MATCHES and p in the two
                            UCS2_GET_NAMING wrappers are expanded without parentheses, so
                            callers should pass simple expressions.
                         */
                    730: #define LITTLE2_BYTE_TYPE(enc, p) \
                    731:  ((p)[1] == 0 \
                    732:   ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
                    733:   : unicode_byte_type((p)[1], (p)[0]))
                    734: #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
                    735: #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
                    736: #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
                    737:   UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
                    738: #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
                    739:   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
                    740:
                         /* Two ways of building the little-endian UTF-16 tokenizer.

                            With XML_MIN_SIZE defined, the LITTLE2_* macros are wrapped in
                            real functions and VTABLE is redefined as VTABLE1 followed by the
                            little2_ converters.  NOTE(review): presumably the wrappers are
                            picked up by STANDARD_VTABLE(little2_) in the encoding tables -
                            confirm where STANDARD_VTABLE is defined.

                            Without XML_MIN_SIZE, the generic macro names are mapped onto the
                            LITTLE2_* macros and xmltok_impl.c is included, with PREFIX set so
                            that identifiers it builds through PREFIX() start with little2_.
                         */
                    741: #ifdef XML_MIN_SIZE
                    742:
                         /* Function form of LITTLE2_BYTE_TYPE. */
                    743: static int PTRFASTCALL
                    744: little2_byteType(const ENCODING *enc, const char *p)
                    745: {
                    746:   return LITTLE2_BYTE_TYPE(enc, p);
                    747: }
                    748:
                         /* Function form of LITTLE2_BYTE_TO_ASCII. */
                    749: static int PTRFASTCALL
                    750: little2_byteToAscii(const ENCODING *enc, const char *p)
                    751: {
                    752:   return LITTLE2_BYTE_TO_ASCII(enc, p);
                    753: }
                    754:
                         /* Function form of LITTLE2_CHAR_MATCHES. */
                    755: static int PTRCALL
                    756: little2_charMatches(const ENCODING *enc, const char *p, int c)
                    757: {
                    758:   return LITTLE2_CHAR_MATCHES(enc, p, c);
                    759: }
                    760:
                         /* Function form of LITTLE2_IS_NAME_CHAR_MINBPC. */
                    761: static int PTRFASTCALL
                    762: little2_isNameMin(const ENCODING *enc, const char *p)
                    763: {
                    764:   return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
                    765: }
                    766:
                         /* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC. */
                    767: static int PTRFASTCALL
                    768: little2_isNmstrtMin(const ENCODING *enc, const char *p)
                    769: {
                    770:   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
                    771: }
                    772:
                    773: #undef VTABLE
                    774: #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
                    775:
                    776: #else /* not XML_MIN_SIZE */
                    777:
                         /* Every character takes at least two bytes here.  The
                            three-argument IS_NAME_CHAR and IS_NMSTRT_CHAR forms are hard-wired
                            to 0; only the _MINBPC forms do a real lookup. */
                    778: #undef PREFIX
                    779: #define PREFIX(ident) little2_ ## ident
                    780: #define MINBPC(enc) 2
                    781: /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
                    782: #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
                    783: #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
                    784: #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
                    785: #define IS_NAME_CHAR(enc, p, n) 0
                    786: #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
                    787: #define IS_NMSTRT_CHAR(enc, p, n) (0)
                    788: #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
                    789:
                    790: #define XML_TOK_IMPL_C
                    791: #include "xmltok_impl.c"
                    792: #undef XML_TOK_IMPL_C
                    793:
                         /* Remove the mappings so the next encoding can define its own.
                            NOTE(review): IS_INVALID_CHAR is not defined in this section;
                            presumably xmltok_impl.c supplies a default - confirm. */
                    794: #undef MINBPC
                    795: #undef BYTE_TYPE
                    796: #undef BYTE_TO_ASCII
                    797: #undef CHAR_MATCHES
                    798: #undef IS_NAME_CHAR
                    799: #undef IS_NAME_CHAR_MINBPC
                    800: #undef IS_NMSTRT_CHAR
                    801: #undef IS_NMSTRT_CHAR_MINBPC
                    802: #undef IS_INVALID_CHAR
                    803:
                    804: #endif /* not XML_MIN_SIZE */
                    805:
                         /* Encoding tables for little-endian UTF-16.

                            Each table starts with VTABLE followed by 2, 0 and a flag, then a
                            byte-type table built from included headers, then
                            STANDARD_VTABLE(little2_) NULL_VTABLE.  NOTE(review): the values
                            2, 0 and the flag are presumably minBytesPerChar, isUtf8 and
                            isUtf16 - confirm against struct encoding in xmltok.h.

                            The _ns tables exist only in XML_NS builds.  The non-_ns tables
                            include the ASCII table with BT_COLON defined as BT_NMSTRT,
                            presumably so that ':' is typed as an ordinary name-start
                            character there.
                         */
                    806: #ifdef XML_NS
                    807:
                    808: static const struct normal_encoding little2_encoding_ns = {
                    809:   { VTABLE, 2, 0,
                         /* The flag is 1 only when BYTEORDER is 1234. */
                    810: #if BYTEORDER == 1234
                    811:     1
                    812: #else
                    813:     0
                    814: #endif
                    815:   },
                    816:   {
                    817: #include "asciitab.h"
                    818: #include "latin1tab.h"
                    819:   },
1.1.1.3   spz       820:   STANDARD_VTABLE(little2_) NULL_VTABLE
1.1       tron      821: };
                    822:
                    823: #endif
                    824:
                    825: static const struct normal_encoding little2_encoding = {
                    826:   { VTABLE, 2, 0,
                    827: #if BYTEORDER == 1234
                    828:     1
                    829: #else
                    830:     0
                    831: #endif
                    832:   },
                    833:   {
                    834: #define BT_COLON BT_NMSTRT
                    835: #include "asciitab.h"
                    836: #undef BT_COLON
                    837: #include "latin1tab.h"
                    838:   },
1.1.1.3   spz       839:   STANDARD_VTABLE(little2_) NULL_VTABLE
1.1       tron      840: };
                    841:
                         /* The internal_ variants are compiled unless BYTEORDER is 4321.
                            They are built from iasciitab.h instead of asciitab.h and have
                            the flag fixed at 1. */
                    842: #if BYTEORDER != 4321
                    843:
                    844: #ifdef XML_NS
                    845:
                    846: static const struct normal_encoding internal_little2_encoding_ns = {
                    847:   { VTABLE, 2, 0, 1 },
                    848:   {
                    849: #include "iasciitab.h"
                    850: #include "latin1tab.h"
                    851:   },
1.1.1.3   spz       852:   STANDARD_VTABLE(little2_) NULL_VTABLE
1.1       tron      853: };
                    854:
                    855: #endif
                    856:
                    857: static const struct normal_encoding internal_little2_encoding = {
                    858:   { VTABLE, 2, 0, 1 },
                    859:   {
                    860: #define BT_COLON BT_NMSTRT
                    861: #include "iasciitab.h"
                    862: #undef BT_COLON
                    863: #include "latin1tab.h"
                    864:   },
1.1.1.3   spz       865:   STANDARD_VTABLE(little2_) NULL_VTABLE
1.1       tron      866: };
                    867:
                    868: #endif
                    869:
                    870:
                         /* Per-character helpers for big-endian UTF-16.  p points at a
                            two-byte unit: p[0] is the high byte and p[1] the low byte.

                            BIG2_BYTE_TYPE
                              High byte 0: look the low byte up in the type[] table of enc
                              (cast to struct normal_encoding).  Otherwise classify the unit
                              with unicode_byte_type(high, low).
                            BIG2_BYTE_TO_ASCII
                              The low byte when the high byte is 0, else -1.
                            BIG2_CHAR_MATCHES
                              Non-zero when the high byte is 0 and the low byte compares
                              equal to c.
                            BIG2_IS_NAME_CHAR_MINBPC, BIG2_IS_NMSTRT_CHAR_MINBPC
                              UCS2_GET_NAMING lookup in namePages / nmstrtPages, keyed by
                              the high byte and then the low byte.

                            NOTE(review): c in BIG2_CHAR_MATCHES and p in the two
                            UCS2_GET_NAMING wrappers are expanded without parentheses, so
                            callers should pass simple expressions.
                         */
                    871: #define BIG2_BYTE_TYPE(enc, p) \
                    872:  ((p)[0] == 0 \
                    873:   ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
                    874:   : unicode_byte_type((p)[0], (p)[1]))
                    875: #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
                    876: #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
                    877: #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
                    878:   UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
                    879: #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
                    880:   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
                    881:
                         /* Two ways of building the big-endian UTF-16 tokenizer.

                            With XML_MIN_SIZE defined, the BIG2_* macros are wrapped in real
                            functions and VTABLE is redefined as VTABLE1 followed by the big2_
                            converters.  NOTE(review): presumably the wrappers are picked up
                            by STANDARD_VTABLE(big2_) in the encoding tables - confirm where
                            STANDARD_VTABLE is defined.

                            Without XML_MIN_SIZE, the generic macro names are mapped onto the
                            BIG2_* macros and xmltok_impl.c is included, with PREFIX set so
                            that identifiers it builds through PREFIX() start with big2_.
                         */
                    882: #ifdef XML_MIN_SIZE
                    883:
                         /* Function form of BIG2_BYTE_TYPE. */
                    884: static int PTRFASTCALL
                    885: big2_byteType(const ENCODING *enc, const char *p)
                    886: {
                    887:   return BIG2_BYTE_TYPE(enc, p);
                    888: }
                    889:
                         /* Function form of BIG2_BYTE_TO_ASCII. */
                    890: static int PTRFASTCALL
                    891: big2_byteToAscii(const ENCODING *enc, const char *p)
                    892: {
                    893:   return BIG2_BYTE_TO_ASCII(enc, p);
                    894: }
                    895:
                         /* Function form of BIG2_CHAR_MATCHES. */
                    896: static int PTRCALL
                    897: big2_charMatches(const ENCODING *enc, const char *p, int c)
                    898: {
                    899:   return BIG2_CHAR_MATCHES(enc, p, c);
                    900: }
                    901:
                         /* Function form of BIG2_IS_NAME_CHAR_MINBPC. */
                    902: static int PTRFASTCALL
                    903: big2_isNameMin(const ENCODING *enc, const char *p)
                    904: {
                    905:   return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
                    906: }
                    907:
                         /* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC. */
                    908: static int PTRFASTCALL
                    909: big2_isNmstrtMin(const ENCODING *enc, const char *p)
                    910: {
                    911:   return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
                    912: }
                    913:
                    914: #undef VTABLE
                    915: #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
                    916:
                    917: #else /* not XML_MIN_SIZE */
                    918:
                         /* Every character takes at least two bytes here.  The
                            three-argument IS_NAME_CHAR and IS_NMSTRT_CHAR forms are hard-wired
                            to 0; only the _MINBPC forms do a real lookup. */
                    919: #undef PREFIX
                    920: #define PREFIX(ident) big2_ ## ident
                    921: #define MINBPC(enc) 2
                    922: /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
                    923: #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
                    924: #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
                    925: #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
                    926: #define IS_NAME_CHAR(enc, p, n) 0
                    927: #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
                    928: #define IS_NMSTRT_CHAR(enc, p, n) (0)
                    929: #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
                    930:
                    931: #define XML_TOK_IMPL_C
                    932: #include "xmltok_impl.c"
                    933: #undef XML_TOK_IMPL_C
                    934:
                         /* Remove the mappings again.  NOTE(review): IS_INVALID_CHAR is not
                            defined in this section; presumably xmltok_impl.c supplies a
                            default - confirm. */
                    935: #undef MINBPC
                    936: #undef BYTE_TYPE
                    937: #undef BYTE_TO_ASCII
                    938: #undef CHAR_MATCHES
                    939: #undef IS_NAME_CHAR
                    940: #undef IS_NAME_CHAR_MINBPC
                    941: #undef IS_NMSTRT_CHAR
                    942: #undef IS_NMSTRT_CHAR_MINBPC
                    943: #undef IS_INVALID_CHAR
                    944:
                    945: #endif /* not XML_MIN_SIZE */
                    946:
                         /* Encoding tables for big-endian UTF-16.

                            Each table starts with VTABLE followed by 2, 0 and a flag, then a
                            byte-type table built from included headers, then
                            STANDARD_VTABLE(big2_) NULL_VTABLE.  NOTE(review): the values 2, 0
                            and the flag are presumably minBytesPerChar, isUtf8 and isUtf16 -
                            confirm against struct encoding in xmltok.h.

                            The _ns tables exist only in XML_NS builds.  The non-_ns tables
                            include the ASCII table with BT_COLON defined as BT_NMSTRT,
                            presumably so that ':' is typed as an ordinary name-start
                            character there.
                         */
                    947: #ifdef XML_NS
                    948:
                    949: static const struct normal_encoding big2_encoding_ns = {
                    950:   { VTABLE, 2, 0,
                         /* The flag is 1 only when BYTEORDER is 4321. */
                    951: #if BYTEORDER == 4321
                    952:   1
                    953: #else
                    954:   0
                    955: #endif
                    956:   },
                    957:   {
                    958: #include "asciitab.h"
                    959: #include "latin1tab.h"
                    960:   },
1.1.1.3   spz       961:   STANDARD_VTABLE(big2_) NULL_VTABLE
1.1       tron      962: };
                    963:
                    964: #endif
                    965:
                    966: static const struct normal_encoding big2_encoding = {
                    967:   { VTABLE, 2, 0,
                    968: #if BYTEORDER == 4321
                    969:   1
                    970: #else
                    971:   0
                    972: #endif
                    973:   },
                    974:   {
                    975: #define BT_COLON BT_NMSTRT
                    976: #include "asciitab.h"
                    977: #undef BT_COLON
                    978: #include "latin1tab.h"
                    979:   },
1.1.1.3   spz       980:   STANDARD_VTABLE(big2_) NULL_VTABLE
1.1       tron      981: };
                    982:
                         /* The internal_ variants are compiled unless BYTEORDER is 1234.
                            They are built from iasciitab.h instead of asciitab.h and have
                            the flag fixed at 1. */
                    983: #if BYTEORDER != 1234
                    984:
                    985: #ifdef XML_NS
                    986:
                    987: static const struct normal_encoding internal_big2_encoding_ns = {
                    988:   { VTABLE, 2, 0, 1 },
                    989:   {
                    990: #include "iasciitab.h"
                    991: #include "latin1tab.h"
                    992:   },
1.1.1.3   spz       993:   STANDARD_VTABLE(big2_) NULL_VTABLE
1.1       tron      994: };
                    995:
                    996: #endif
                    997:
                    998: static const struct normal_encoding internal_big2_encoding = {
                    999:   { VTABLE, 2, 0, 1 },
                   1000:   {
                   1001: #define BT_COLON BT_NMSTRT
                   1002: #include "iasciitab.h"
                   1003: #undef BT_COLON
                   1004: #include "latin1tab.h"
                   1005:   },
1.1.1.3   spz      1006:   STANDARD_VTABLE(big2_) NULL_VTABLE
1.1       tron     1007: };
                   1008:
                   1009: #endif
                   1010:
                   1011: #undef PREFIX
                   1012:
                         /* Compare two NUL-terminated strings for equality, ignoring case
                            for the letters ASCII_a through ASCII_z only.
                            Returns 1 when the strings match up to and including the
                            terminator, 0 at the first difference.
                         */
                   1013: static int FASTCALL
                   1014: streqci(const char *s1, const char *s2)
                   1015: {
                   1016:   for (;;) {
                   1017:     char c1 = *s1++;
                   1018:     char c2 = *s2++;
                         /* Fold lower-case letters to upper case before comparing. */
                   1019:     if (ASCII_a <= c1 && c1 <= ASCII_z)
                   1020:       c1 += ASCII_A - ASCII_a;
                   1021:     if (ASCII_a <= c2 && c2 <= ASCII_z)
                   1022:       c2 += ASCII_A - ASCII_a;
                   1023:     if (c1 != c2)
                   1024:       return 0;
                         /* c1 equals c2 here, so a NUL means both strings ended together. */
                   1025:     if (!c1)
                   1026:       break;
                   1027:   }
                   1028:   return 1;
                   1029: }
                   1030:
                         /* Update pos for the text from ptr to end.  The enc argument is
                            ignored (it is marked UNUSED_P); the work is always done by
                            normal_updatePosition with the utf8_encoding table.
                            NOTE(review): presumably this is the updatePosition hook used
                            before the real encoding is known - confirm at the point where
                            it is installed.
                         */
                   1031: static void PTRCALL
1.1.1.3   spz      1032: initUpdatePosition(const ENCODING *UNUSED_P(enc), const char *ptr,
1.1       tron     1033:                    const char *end, POSITION *pos)
                   1034: {
                   1035:   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
                   1036: }
                   1037:
                         /* Convert the character at ptr to a single byte by running
                            XmlUtf8Convert on ptr..end with a one-byte output buffer.
                            Returns the byte that was written, or -1 when the converter wrote
                            nothing.  The converter's own return value is ignored.
                            NOTE(review): presumably nothing is written when ptr has reached
                            end or when the character needs more than one UTF-8 byte, which
                            is how callers see -1 for end of input and for non-ASCII
                            characters - confirm for each converter.
                         */
                   1038: static int
                   1039: toAscii(const ENCODING *enc, const char *ptr, const char *end)
                   1040: {
                   1041:   char buf[1];
                   1042:   char *p = buf;
                   1043:   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
                         /* p still at the start of buf means no output was produced. */
                   1044:   if (p == buf)
                   1045:     return -1;
                   1046:   else
                   1047:     return buf[0];
                   1048: }
                   1049:
                         /* Return 1 when c is one of the four white space characters
                            0x20 (space), 0xD (carriage return), 0xA (line feed) or
                            0x9 (tab); return 0 for anything else, including -1. */
                   1050: static int FASTCALL
                   1051: isSpace(int c)
                   1052: {
                   1053:   switch (c) {
                   1054:   case 0x20:
                   1055:   case 0xD:
                   1056:   case 0xA:
                   1057:   case 0x9:
                   1058:     return 1;
                   1059:   }
                   1060:   return 0;
                   1061: }
                   1062:
                   1063: /* Return 1 if there's just optional white space or there's an S
                   1064:    followed by name=val.
                   1065: */
                         /* Details of the contract, as implemented below:

                            ptr..end is the text to scan; characters are stepped over in
                            units of enc->minBytesPerChar and read through toAscii.

                            Return 1 with *namePtr set to NULL when ptr equals end on entry,
                            or when only white space remains.  The other outputs are left
                            untouched in that case.

                            Return 1 with a pseudo-attribute when white space is followed by
                            a name, optional white space, '=', optional white space and a
                            value quoted with ASCII_QUOT or ASCII_APOS.  Then:
                              *namePtr     start of the name
                              *nameEndPtr  first character after the name
                              *valPtr      first character after the opening quote
                              *nextTokPtr  first character after the closing quote

                            Return 0 on any other input, with *nextTokPtr set to the
                            position that was rejected.  Between the quotes only ASCII
                            letters, digits, '.', '-' and '_' are accepted.
                         */
                   1066: static int
                   1067: parsePseudoAttribute(const ENCODING *enc,
                   1068:                      const char *ptr,
                   1069:                      const char *end,
                   1070:                      const char **namePtr,
                   1071:                      const char **nameEndPtr,
                   1072:                      const char **valPtr,
                   1073:                      const char **nextTokPtr)
                   1074: {
                   1075:   int c;
                   1076:   char open;
                   1077:   if (ptr == end) {
                   1078:     *namePtr = NULL;
                   1079:     return 1;
                   1080:   }
                         /* Anything other than white space at this point is an error. */
                   1081:   if (!isSpace(toAscii(enc, ptr, end))) {
                   1082:     *nextTokPtr = ptr;
                   1083:     return 0;
                   1084:   }
                   1085:   do {
                   1086:     ptr += enc->minBytesPerChar;
                   1087:   } while (isSpace(toAscii(enc, ptr, end)));
                   1088:   if (ptr == end) {
                   1089:     *namePtr = NULL;
                   1090:     return 1;
                   1091:   }
                   1092:   *namePtr = ptr;
                         /* Scan the name up to '=' or to white space that leads to '='. */
                   1093:   for (;;) {
                   1094:     c = toAscii(enc, ptr, end);
                   1095:     if (c == -1) {
                   1096:       *nextTokPtr = ptr;
                   1097:       return 0;
                   1098:     }
                   1099:     if (c == ASCII_EQUALS) {
                   1100:       *nameEndPtr = ptr;
                   1101:       break;
                   1102:     }
                   1103:     if (isSpace(c)) {
                   1104:       *nameEndPtr = ptr;
                   1105:       do {
                   1106:         ptr += enc->minBytesPerChar;
                   1107:       } while (isSpace(c = toAscii(enc, ptr, end)));
                   1108:       if (c != ASCII_EQUALS) {
                   1109:         *nextTokPtr = ptr;
                   1110:         return 0;
                   1111:       }
                   1112:       break;
                   1113:     }
                   1114:     ptr += enc->minBytesPerChar;
                   1115:   }
                         /* ptr is on the '='; if it never moved the name is empty. */
                   1116:   if (ptr == *namePtr) {
                   1117:     *nextTokPtr = ptr;
                   1118:     return 0;
                   1119:   }
                   1120:   ptr += enc->minBytesPerChar;
                   1121:   c = toAscii(enc, ptr, end);
                   1122:   while (isSpace(c)) {
                   1123:     ptr += enc->minBytesPerChar;
                   1124:     c = toAscii(enc, ptr, end);
                   1125:   }
                   1126:   if (c != ASCII_QUOT && c != ASCII_APOS) {
                   1127:     *nextTokPtr = ptr;
                   1128:     return 0;
                   1129:   }
                         /* Remember which quote opened the value; the same one must close
                            it. */
                   1130:   open = (char)c;
                   1131:   ptr += enc->minBytesPerChar;
                   1132:   *valPtr = ptr;
                   1133:   for (;; ptr += enc->minBytesPerChar) {
                   1134:     c = toAscii(enc, ptr, end);
                   1135:     if (c == open)
                   1136:       break;
                         /* A -1 from toAscii fails this test too, so the loop cannot run
                            on once toAscii stops producing characters. */
                   1137:     if (!(ASCII_a <= c && c <= ASCII_z)
                   1138:         && !(ASCII_A <= c && c <= ASCII_Z)
                   1139:         && !(ASCII_0 <= c && c <= ASCII_9)
                   1140:         && c != ASCII_PERIOD
                   1141:         && c != ASCII_MINUS
                   1142:         && c != ASCII_UNDERSCORE) {
                   1143:       *nextTokPtr = ptr;
                   1144:       return 0;
                   1145:     }
                   1146:   }
                   1147:   *nextTokPtr = ptr + enc->minBytesPerChar;
                   1148:   return 1;
                   1149: }
                   1150:
                         /* NUL-terminated keyword strings spelled out with the ASCII_*
                            constants.  doParseXmlDecl compares them against declaration text
                            with XmlNameMatchesAscii.
                            NOTE(review): presumably they are written this way so the bytes
                            are ASCII whatever the compiler's execution character set is -
                            confirm against the header that defines ASCII_*.
                         */
                   1151: static const char KW_version[] = {
                   1152:   ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
                   1153: };
                   1154:
                   1155: static const char KW_encoding[] = {
                   1156:   ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
                   1157: };
                   1158:
                   1159: static const char KW_standalone[] = {
                   1160:   ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
                   1161:   ASCII_n, ASCII_e, '\0'
                   1162: };
                   1163:
                   1164: static const char KW_yes[] = {
                   1165:   ASCII_y, ASCII_e, ASCII_s,  '\0'
                   1166: };
                   1167:
                   1168: static const char KW_no[] = {
                   1169:   ASCII_n, ASCII_o,  '\0'
                   1170: };
                   1171:
                         /* Parse the pseudo-attributes of an XML declaration or text
                            declaration held in ptr..end.

                            encodingFinder        called with the start and end of the
                                                  encoding name; its result is stored in
                                                  *encoding.
                            isGeneralTextEntity   non-zero selects the text declaration
                                                  rules: "version" may be absent, an
                                                  encoding must follow a version, and
                                                  "standalone" is rejected.
                            versionPtr, versionEndPtr, encodingName, encoding, standalone
                                                  optional outputs; each is written only if
                                                  the pointer is non-NULL and the matching
                                                  pseudo-attribute is present.
                                                  *versionEndPtr receives the position just
                                                  after the version's closing quote.
                            badPtr                receives the offending position on
                                                  failure; it is written unconditionally, so
                                                  it must not be NULL.

                            The accepted order is version, then encoding, then standalone.
                            Returns 1 on success and 0 on failure.
                         */
                   1172: static int
                   1173: doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
                   1174:                                                  const char *,
                   1175:                                                  const char *),
                   1176:                int isGeneralTextEntity,
                   1177:                const ENCODING *enc,
                   1178:                const char *ptr,
                   1179:                const char *end,
                   1180:                const char **badPtr,
                   1181:                const char **versionPtr,
                   1182:                const char **versionEndPtr,
                   1183:                const char **encodingName,
                   1184:                const ENCODING **encoding,
                   1185:                int *standalone)
                   1186: {
                   1187:   const char *val = NULL;
                   1188:   const char *name = NULL;
                   1189:   const char *nameEnd = NULL;
                         /* Step over the first five characters and drop the last two.
                            NOTE(review): presumably the "<?xml" opener and the "?>" closer -
                            confirm with the caller. */
                   1190:   ptr += 5 * enc->minBytesPerChar;
                   1191:   end -= 2 * enc->minBytesPerChar;
                         /* At least one pseudo-attribute is required. */
                   1192:   if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
                   1193:       || !name) {
                   1194:     *badPtr = ptr;
                   1195:     return 0;
                   1196:   }
                   1197:   if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
                         /* Only a text declaration may start with something other than
                            "version"; the name is then checked as "encoding" below. */
                   1198:     if (!isGeneralTextEntity) {
                   1199:       *badPtr = name;
                   1200:       return 0;
                   1201:     }
                   1202:   }
                   1203:   else {
                   1204:     if (versionPtr)
                   1205:       *versionPtr = val;
                   1206:     if (versionEndPtr)
                   1207:       *versionEndPtr = ptr;
                   1208:     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
                   1209:       *badPtr = ptr;
                   1210:       return 0;
                   1211:     }
                   1212:     if (!name) {
                   1213:       if (isGeneralTextEntity) {
                   1214:         /* a TextDecl must have an EncodingDecl */
                   1215:         *badPtr = ptr;
                   1216:         return 0;
                   1217:       }
                   1218:       return 1;
                   1219:     }
                   1220:   }
                   1221:   if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
                         /* The encoding name has to begin with an ASCII letter. */
                   1222:     int c = toAscii(enc, val, end);
                   1223:     if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
                   1224:       *badPtr = val;
                   1225:       return 0;
                   1226:     }
                   1227:     if (encodingName)
                   1228:       *encodingName = val;
                         /* ptr is just past the closing quote, so stepping back one
                            character gives the end of the encoding name. */
                   1229:     if (encoding)
                   1230:       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
                   1231:     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
                   1232:       *badPtr = ptr;
                   1233:       return 0;
                   1234:     }
                   1235:     if (!name)
                   1236:       return 1;
                   1237:   }
                         /* Whatever is left must be "standalone", and a text declaration
                            may not carry it. */
                   1238:   if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
                   1239:       || isGeneralTextEntity) {
                   1240:     *badPtr = name;
                   1241:     return 0;
                   1242:   }
                   1243:   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
                   1244:     if (standalone)
                   1245:       *standalone = 1;
                   1246:   }
                   1247:   else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
                   1248:     if (standalone)
                   1249:       *standalone = 0;
                   1250:   }
                   1251:   else {
                   1252:     *badPtr = val;
                   1253:     return 0;
                   1254:   }
                         /* Nothing but white space may follow the standalone value. */
                   1255:   while (isSpace(toAscii(enc, ptr, end)))
                   1256:     ptr += enc->minBytesPerChar;
                   1257:   if (ptr != end) {
                   1258:     *badPtr = ptr;
                   1259:     return 0;
                   1260:   }
                   1261:   return 1;
                   1262: }
                   1263:
                         /* Check a character number and return it unchanged when accepted,
                            or -1 when it is rejected.  Rejected values are:
                              - 0xD800 through 0xDFFF (result >> 8 in 0xD8-0xDF);
                              - values below 0x100 whose entry in latin1_encoding.type is
                                BT_NONXML;
                              - 0xFFFE and 0xFFFF.
                            Values of 0x10000 and above match no case and are returned as
                            they are.
                            NOTE(review): result is used as an array index when
                            result >> 8 is 0, so callers presumably never pass a negative
                            value - confirm.
                         */
                   1264: static int FASTCALL
                   1265: checkCharRefNumber(int result)
                   1266: {
                   1267:   switch (result >> 8) {
                   1268:   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
                   1269:   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
                   1270:     return -1;
                   1271:   case 0:
                   1272:     if (latin1_encoding.type[result] == BT_NONXML)
                   1273:       return -1;
                   1274:     break;
                   1275:   case 0xFF:
                   1276:     if (result == 0xFFFE || result == 0xFFFF)
                   1277:       return -1;
                   1278:     break;
                   1279:   }
                   1280:   return result;
                   1281: }
                   1282:
                         /* Write the UTF-8 encoding of the character number c into buf.
                            Returns the number of bytes written (1 to 4), or 0 without
                            touching buf when c is negative or is 0x110000 or greater.
                            buf must have room for the longest form, four bytes; no length is
                            passed in.  No terminator is written.  Values in 0xD800-0xDFFF
                            are not treated specially here and come out as three bytes.
                         */
                   1283: int FASTCALL
                   1284: XmlUtf8Encode(int c, char *buf)
                   1285: {
                   1286:   enum {
                   1287:     /* minN is minimum legal resulting value for N byte sequence */
                   1288:     min2 = 0x80,
                   1289:     min3 = 0x800,
                   1290:     min4 = 0x10000
                   1291:   };
                   1292:
                   1293:   if (c < 0)
                   1294:     return 0;
                   1295:   if (c < min2) {
                   1296:     buf[0] = (char)(c | UTF8_cval1);
                   1297:     return 1;
                   1298:   }
                         /* Two bytes: the top bits go in the lead byte, the low 6 bits in
                            a 0x80-tagged continuation byte. */
                   1299:   if (c < min3) {
                   1300:     buf[0] = (char)((c >> 6) | UTF8_cval2);
                   1301:     buf[1] = (char)((c & 0x3f) | 0x80);
                   1302:     return 2;
                   1303:   }
                   1304:   if (c < min4) {
                   1305:     buf[0] = (char)((c >> 12) | UTF8_cval3);
                   1306:     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
                   1307:     buf[2] = (char)((c & 0x3f) | 0x80);
                   1308:     return 3;
                   1309:   }
                   1310:   if (c < 0x110000) {
                   1311:     buf[0] = (char)((c >> 18) | UTF8_cval4);
                   1312:     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
                   1313:     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
                   1314:     buf[3] = (char)((c & 0x3f) | 0x80);
                   1315:     return 4;
                   1316:   }
                   1317:   return 0;
                   1318: }
                   1319:
                         /* Encode the code point charNum as UTF-16 code units into buf.
                            Returns 1 when charNum is below 0x10000 (stored as-is), 2 when
                            it is below 0x110000 (stored as a surrogate pair), and 0 --
                            storing nothing -- when charNum is negative or 0x110000 and
                            above.  buf must have room for two code units.  Values that
                            are themselves in the surrogate range are not rejected here. */
                   1320: int FASTCALL
                   1321: XmlUtf16Encode(int charNum, unsigned short *buf)
                   1322: {
                   1323:   if (charNum < 0)
                   1324:     return 0;
                   1325:   if (charNum < 0x10000) {
                   1326:     buf[0] = (unsigned short)charNum;
                   1327:     return 1;
                   1328:   }
                   1329:   if (charNum < 0x110000) {
                             /* 20 bits remain after the offset: the top ten go into the
                                first unit (base 0xD800), the low ten into the second
                                (base 0xDC00) */
                   1330:     charNum -= 0x10000;
                   1331:     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
                   1332:     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
                   1333:     return 2;
                   1334:   }
                   1335:   return 0;
                   1336: }
                   1337:
                         /* Encoding object for a user-described ("unknown") encoding, as
                            filled in by XmlInitUnknownEncoding:
                            - normal:   begins as a byte copy of latin1_encoding; its
                                        type[] entries are then set per input byte.
                            - convert,
                              userData: callback and its context; invoked as
                                        convert(userData, p) for bytes whose table entry
                                        says "call the converter".
                            - utf16[b]: UTF-16 code unit for input byte b; 0 means the
                                        converter has to be called instead.
                            - utf8[b]:  utf8[b][0] is the number of UTF-8 bytes stored in
                                        utf8[b][1..]; a count of 0 means the converter has
                                        to be called instead. */
                   1338: struct unknown_encoding {
                   1339:   struct normal_encoding normal;
                   1340:   CONVERTER convert;
                   1341:   void *userData;
                   1342:   unsigned short utf16[256];
                   1343:   char utf8[256][4];
                   1344: };
                   1345:
                         /* Cast an encoding pointer to const struct unknown_encoding *. */
                   1346: #define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
                   1347:
                         /* Return sizeof(struct unknown_encoding), i.e. the size of the
                            object that XmlInitUnknownEncoding writes through its mem
                            argument. */
                   1348: int
                   1349: XmlSizeOfUnknownEncoding(void)
                   1350: {
                   1351:   return sizeof(struct unknown_encoding);
                   1352: }
                   1353:
                         /* Name-character test for a multi-byte sequence at p in an
                            unknown encoding.  The sequence is decoded with the user's
                            converter; a result with any bit set outside the low 16
                            (which includes negative results) yields 0, otherwise the
                            answer comes from the namePages lookup. */
                   1354: static int PTRFASTCALL
                   1355: unknown_isName(const ENCODING *enc, const char *p)
                   1356: {
                   1357:   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
                   1358:   int c = uenc->convert(uenc->userData, p);
                   1359:   if (c & ~0xFFFF)
                   1360:     return 0;
                   1361:   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
                   1362: }
                   1363:
                         /* Name-start-character test for a multi-byte sequence at p in an
                            unknown encoding.  The sequence is decoded with the user's
                            converter; a result with any bit set outside the low 16
                            (which includes negative results) yields 0, otherwise the
                            answer comes from the nmstrtPages lookup. */
                   1364: static int PTRFASTCALL
                   1365: unknown_isNmstrt(const ENCODING *enc, const char *p)
                   1366: {
                   1367:   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
                   1368:   int c = uenc->convert(uenc->userData, p);
                   1369:   if (c & ~0xFFFF)
                   1370:     return 0;
                   1371:   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
                   1372: }
                   1373:
                         /* Report whether the multi-byte sequence at p is invalid in an
                            unknown encoding.  Returns nonzero when the converter's result
                            has any bit set outside the low 16 (negative results included)
                            or when checkCharRefNumber rejects the value; 0 otherwise. */
                   1374: static int PTRFASTCALL
                   1375: unknown_isInvalid(const ENCODING *enc, const char *p)
                   1376: {
                   1377:   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
                   1378:   int c = uenc->convert(uenc->userData, p);
                   1379:   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
                   1380: }
                   1381:
                         /* Convert input in an unknown (user-described) encoding to UTF-8.
                            Bytes are consumed from *fromP up to fromLim and UTF-8 is
                            appended at *toP up to toLim; both pointers are advanced as
                            characters are converted.
                            Returns XML_CONVERT_COMPLETED once *fromP reaches fromLim, or
                            XML_CONVERT_OUTPUT_EXHAUSTED when the next character does not
                            fit in the remaining output; in that case neither pointer is
                            moved for that character.
                            A sequence whose converted value XmlUtf8Encode() cannot encode
                            is consumed without producing output. */
1.1.1.3   spz      1382: static enum XML_Convert_Result PTRCALL
1.1       tron     1383: unknown_toUtf8(const ENCODING *enc,
                   1384:                const char **fromP, const char *fromLim,
                   1385:                char **toP, const char *toLim)
                   1386: {
                   1387:   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
                   1388:   char buf[XML_UTF8_ENCODE_MAX];
                   1389:   for (;;) {
                   1390:     const char *utf8;
                   1391:     int n;
                   1392:     if (*fromP == fromLim)
1.1.1.3   spz      1393:       return XML_CONVERT_COMPLETED;
                             /* utf8[b][0] is the byte count, utf8[b][1..] the bytes */
1.1       tron     1394:     utf8 = uenc->utf8[(unsigned char)**fromP];
                   1395:     n = *utf8++;
                   1396:     if (n == 0) {
                               /* no precomputed entry: decode the sequence via the converter */
                   1397:       int c = uenc->convert(uenc->userData, *fromP);
                   1398:       n = XmlUtf8Encode(c, buf);
                   1399:       if (n > toLim - *toP)
1.1.1.3   spz      1400:         return XML_CONVERT_OUTPUT_EXHAUSTED;
1.1       tron     1401:       utf8 = buf;
                               /* type is BT_LEAD2..BT_LEAD4 for lead bytes, so this steps
                                  over 2..4 input bytes */
                   1402:       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
                   1403:                  - (BT_LEAD2 - 2));
                   1404:     }
                   1405:     else {
                   1406:       if (n > toLim - *toP)
1.1.1.3   spz      1407:         return XML_CONVERT_OUTPUT_EXHAUSTED;
1.1       tron     1408:       (*fromP)++;
                   1409:     }
                             /* XmlUtf8Encode() returns 0 when the converter hands back a
                                value it cannot encode (negative or >= 0x110000).  The
                                do/while below always copies at least one byte and would
                                count n down from 0 through negative values, running far
                                past both buffers, so skip the already-consumed sequence. */
                             if (n <= 0)
                               continue;
                   1410:     do {
                   1411:       *(*toP)++ = *utf8++;
                   1412:     } while (--n != 0);
                   1413:   }
                   1414: }
                   1415:
                         /* Convert input in an unknown (user-described) encoding to UTF-16
                            code units.  Bytes are consumed from *fromP up to fromLim and
                            one code unit per character is appended at *toP up to toLim;
                            both pointers are advanced.
                            Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is full
                            while input remains, XML_CONVERT_COMPLETED otherwise.
                            NOTE(review): the converter's result is truncated to 16 bits,
                            so a negative or > 0xFFFF result is not handled here --
                            confirm that callers only reach this with sequences that
                            unknown_isInvalid has already accepted. */
1.1.1.3   spz      1416: static enum XML_Convert_Result PTRCALL
1.1       tron     1417: unknown_toUtf16(const ENCODING *enc,
                   1418:                 const char **fromP, const char *fromLim,
                   1419:                 unsigned short **toP, const unsigned short *toLim)
                   1420: {
                   1421:   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1.1.1.3   spz      1422:   while (*fromP < fromLim && *toP < toLim) {
1.1       tron     1423:     unsigned short c = uenc->utf16[(unsigned char)**fromP];
                   1424:     if (c == 0) {
                               /* 0 marks a byte that has to go through the converter */
                   1425:       c = (unsigned short)
                   1426:           uenc->convert(uenc->userData, *fromP);
                               /* type is BT_LEAD2..BT_LEAD4 for lead bytes, so this steps
                                  over 2..4 input bytes */
                   1427:       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
                   1428:                  - (BT_LEAD2 - 2));
                   1429:     }
                   1430:     else
                   1431:       (*fromP)++;
                   1432:     *(*toP)++ = c;
                   1433:   }
1.1.1.3   spz      1434:
                   1435:   if ((*toP == toLim) && (*fromP < fromLim))
                   1436:     return XML_CONVERT_OUTPUT_EXHAUSTED;
                   1437:   else
                   1438:     return XML_CONVERT_COMPLETED;
1.1       tron     1439: }
                   1440:
                         /* Build an encoding object for a user-described encoding.
                            mem      - storage for a struct unknown_encoding (its size is
                                       what XmlSizeOfUnknownEncoding returns); it is
                                       overwritten here.
                            table    - 256 entries, one per input byte:
                                         -1        the byte is malformed;
                                         -2..-4    the byte starts a 2..4-byte sequence that
                                                   is decoded by convert;
                                         0..0xFFFF the code point the byte stands for.
                            convert  - decoder for multi-byte sequences; may be null only
                                       if the table declares no multi-byte lead bytes.
                            userData - passed back to convert as its first argument.
                            Returns the ENCODING embedded in mem, or 0 when the table is
                            rejected: an entry below -4 or above 0xFFFF, a lead byte
                            without a converter, or a mapping that moves an ASCII
                            character whose latin1_encoding type is neither BT_OTHER nor
                            BT_NONXML. */
                   1441: ENCODING *
                   1442: XmlInitUnknownEncoding(void *mem,
                   1443:                        int *table,
1.1.1.2   spz      1444:                        CONVERTER convert,
1.1       tron     1445:                        void *userData)
                   1446: {
                   1447:   int i;
                   1448:   struct unknown_encoding *e = (struct unknown_encoding *)mem;
                           /* start from a byte copy of the Latin-1 encoding object */
                   1449:   for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
                   1450:     ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
                           /* every ASCII byte whose type is not BT_OTHER/BT_NONXML must map
                              to itself */
                   1451:   for (i = 0; i < 128; i++)
                   1452:     if (latin1_encoding.type[i] != BT_OTHER
                   1453:         && latin1_encoding.type[i] != BT_NONXML
                   1454:         && table[i] != i)
                   1455:       return 0;
                   1456:   for (i = 0; i < 256; i++) {
                   1457:     int c = table[i];
                   1458:     if (c == -1) {
                   1459:       e->normal.type[i] = BT_MALFORM;
                   1460:       /* This shouldn't really get used. */
                   1461:       e->utf16[i] = 0xFFFF;
                   1462:       e->utf8[i][0] = 1;
                   1463:       e->utf8[i][1] = 0;
                   1464:     }
                   1465:     else if (c < 0) {
                   1466:       if (c < -4)
                   1467:         return 0;
                               /* Lead bytes of multi-byte sequences are decoded through the
                                  converter; without one, the conversion routines would
                                  call a null function pointer. */
                               if (!convert)
                                 return 0;
                               /* -2, -3, -4 become BT_LEAD2, BT_LEAD3, BT_LEAD4 */
                   1468:       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
                               /* zero in both tables means "call convert" */
                   1469:       e->utf8[i][0] = 0;
                   1470:       e->utf16[i] = 0;
                   1471:     }
                   1472:     else if (c < 0x80) {
                               /* no other byte may stand for an ASCII character whose type
                                  is not BT_OTHER/BT_NONXML */
                   1473:       if (latin1_encoding.type[c] != BT_OTHER
                   1474:           && latin1_encoding.type[c] != BT_NONXML
                   1475:           && c != i)
                   1476:         return 0;
                   1477:       e->normal.type[i] = latin1_encoding.type[c];
                   1478:       e->utf8[i][0] = 1;
                   1479:       e->utf8[i][1] = (char)c;
                               /* 0 is reserved in utf16[] to mean "call convert" */
                   1480:       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
                   1481:     }
                   1482:     else if (checkCharRefNumber(c) < 0) {
                   1483:       e->normal.type[i] = BT_NONXML;
                   1484:       /* This shouldn't really get used. */
                   1485:       e->utf16[i] = 0xFFFF;
                   1486:       e->utf8[i][0] = 1;
                   1487:       e->utf8[i][1] = 0;
                   1488:     }
                   1489:     else {
                   1490:       if (c > 0xFFFF)
                   1491:         return 0;
                   1492:       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
                   1493:         e->normal.type[i] = BT_NMSTRT;
                   1494:       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
                   1495:         e->normal.type[i] = BT_NAME;
                   1496:       else
                   1497:         e->normal.type[i] = BT_OTHER;
                               /* c <= 0xFFFF, so at most 3 bytes follow the count byte */
                   1498:       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
                   1499:       e->utf16[i] = (unsigned short)c;
                   1500:     }
                   1501:   }
                   1502:   e->userData = userData;
                   1503:   e->convert = convert;
                   1504:   if (convert) {
                             /* multi-byte classification goes through the converter */
                   1505:     e->normal.isName2 = unknown_isName;
                   1506:     e->normal.isName3 = unknown_isName;
                   1507:     e->normal.isName4 = unknown_isName;
                   1508:     e->normal.isNmstrt2 = unknown_isNmstrt;
                   1509:     e->normal.isNmstrt3 = unknown_isNmstrt;
                   1510:     e->normal.isNmstrt4 = unknown_isNmstrt;
                   1511:     e->normal.isInvalid2 = unknown_isInvalid;
                   1512:     e->normal.isInvalid3 = unknown_isInvalid;
                   1513:     e->normal.isInvalid4 = unknown_isInvalid;
                   1514:   }
                   1515:   e->normal.enc.utf8Convert = unknown_toUtf8;
                   1516:   e->normal.enc.utf16Convert = unknown_toUtf16;
                   1517:   return &(e->normal.enc);
                   1518: }
                   1519:
                   1520: /* If this enumeration is changed, getEncodingIndex and encodings
                   1521: must also be changed. */
                   1522: enum {
                   1523:   UNKNOWN_ENC = -1,     /* getEncodingIndex: name not recognised */
                   1524:   ISO_8859_1_ENC = 0,
                   1525:   US_ASCII_ENC,
                   1526:   UTF_8_ENC,
                   1527:   UTF_16_ENC,
                   1528:   UTF_16BE_ENC,
                   1529:   UTF_16LE_ENC,
                   1530:   /* must match encodingNames up to here */
                   1531:   NO_ENC                /* getEncodingIndex: no name given (NULL) */
                   1532: };
                   1533:
                         /* NUL-terminated encoding names, spelled with the ASCII_*
                            constants.  getEncodingIndex compares a requested name against
                            these with streqci.  Presumably the constants are used instead
                            of string literals so the bytes are ASCII whatever the
                            compiler's execution character set is -- confirm where ASCII_*
                            is defined. */
                   1534: static const char KW_ISO_8859_1[] = {
                   1535:   ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
                   1536:   ASCII_MINUS, ASCII_1, '\0'
                   1537: };
                   1538: static const char KW_US_ASCII[] = {
                   1539:   ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
                   1540:   '\0'
                   1541: };
                   1542: static const char KW_UTF_8[] =  {
                   1543:   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
                   1544: };
                   1545: static const char KW_UTF_16[] = {
                   1546:   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
                   1547: };
                   1548: static const char KW_UTF_16BE[] = {
                   1549:   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
                   1550:   '\0'
                   1551: };
                   1552: static const char KW_UTF_16LE[] = {
                   1553:   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
                   1554:   '\0'
                   1555: };
                   1556:
                         /* Map an encoding name to its index in the encoding enumeration.
                            Returns NO_ENC when name is NULL, the index of the first entry
                            of encodingNames that streqci reports as equal to name, or
                            UNKNOWN_ENC when none matches.  The returned index doubles as
                            the enumeration value, which is why encodingNames has to list
                            the names in enumeration order. */
                   1557: static int FASTCALL
                   1558: getEncodingIndex(const char *name)
                   1559: {
                   1560:   static const char * const encodingNames[] = {
                   1561:     KW_ISO_8859_1,
                   1562:     KW_US_ASCII,
                   1563:     KW_UTF_8,
                   1564:     KW_UTF_16,
                   1565:     KW_UTF_16BE,
                   1566:     KW_UTF_16LE,
                   1567:   };
                   1568:   int i;
                   1569:   if (name == NULL)
                   1570:     return NO_ENC;
                   1571:   for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
                   1572:     if (streqci(name, encodingNames[i]))
                   1573:       return i;
                   1574:   return UNKNOWN_ENC;
                   1575: }
                   1576:
                   1577: /* For binary compatibility, we store the index of the encoding
                   1578:    specified at initialization in the isUtf16 member.
                   1579: */
                         /* INIT_ENC_INDEX reads that index back as an int;
                            SET_INIT_ENC_INDEX stores it after narrowing to char.
                            NOTE(review): the argument i is not parenthesised inside the
                            cast, so an expression argument such as a + b would have only
                            its first operand cast -- pass a simple value. */
                   1580:
                   1581: #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
                   1582: #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i)
                   1583:
                   1584: /* This is what detects the encoding.  encodingTable maps from
                   1585:    encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
                   1586:    the external (protocol) specified encoding; state is
                   1587:    XML_CONTENT_STATE if we're parsing an external text entity, and
                   1588:    XML_PROLOG_STATE otherwise.
                   1589: */
                         /* Further details, as far as this function shows:
                            - ptr and end delimit the bytes available so far.
                            - The encoding that is chosen is stored through enc->encPtr.
                            - Returns XML_TOK_NONE when no bytes are available,
                              XML_TOK_PARTIAL when more bytes are needed before a decision
                              can be made, XML_TOK_BOM (with *nextTokPtr set just past it)
                              when a byte order mark is recognised, and otherwise whatever
                              XmlTok returns for the selected encoding. */
                   1590:
                   1591:
                   1592: static int
                   1593: initScan(const ENCODING * const *encodingTable,
                   1594:          const INIT_ENCODING *enc,
                   1595:          int state,
                   1596:          const char *ptr,
                   1597:          const char *end,
                   1598:          const char **nextTokPtr)
                   1599: {
                   1600:   const ENCODING **encPtr;
                   1601:
1.1.1.3   spz      1602:   if (ptr >= end)
1.1       tron     1603:     return XML_TOK_NONE;
                   1604:   encPtr = enc->encPtr;
                   1605:   if (ptr + 1 == end) {
                   1606:     /* only a single byte available for auto-detection */
                   1607: #ifndef XML_DTD /* FIXME */
                   1608:     /* a well-formed document entity must have more than one byte */
                   1609:     if (state != XML_CONTENT_STATE)
                   1610:       return XML_TOK_PARTIAL;
                   1611: #endif
                   1612:     /* so we're parsing an external text entity... */
                   1613:     /* if UTF-16 was externally specified, then we need at least 2 bytes */
                   1614:     switch (INIT_ENC_INDEX(enc)) {
                   1615:     case UTF_16_ENC:
                   1616:     case UTF_16LE_ENC:
                   1617:     case UTF_16BE_ENC:
                   1618:       return XML_TOK_PARTIAL;
                   1619:     }
                   1620:     switch ((unsigned char)*ptr) {
                   1621:     case 0xFE:
                   1622:     case 0xFF:
                   1623:     case 0xEF: /* possibly first byte of UTF-8 BOM */
                               /* with ISO-8859-1 specified for an external entity these
                                  bytes are treated as data, not as the start of a BOM */
                   1624:       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
                   1625:           && state == XML_CONTENT_STATE)
                   1626:         break;
                   1627:       /* fall through */
                   1628:     case 0x00:
                   1629:     case 0x3C:
                   1630:       return XML_TOK_PARTIAL;
                   1631:     }
                   1632:   }
                   1633:   else {
                             /* at least two bytes: dispatch on the first two, first byte
                                in the high half */
                   1634:     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
                   1635:     case 0xFEFF:
                   1636:       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
                   1637:           && state == XML_CONTENT_STATE)
                   1638:         break;
                   1639:       *nextTokPtr = ptr + 2;
                   1640:       *encPtr = encodingTable[UTF_16BE_ENC];
                   1641:       return XML_TOK_BOM;
                   1642:     /* 00 3C is handled in the default case */
                   1643:     case 0x3C00:
                   1644:       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
                   1645:            || INIT_ENC_INDEX(enc) == UTF_16_ENC)
                   1646:           && state == XML_CONTENT_STATE)
                   1647:         break;
                   1648:       *encPtr = encodingTable[UTF_16LE_ENC];
                   1649:       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
                   1650:     case 0xFFFE:
                   1651:       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
                   1652:           && state == XML_CONTENT_STATE)
                   1653:         break;
                   1654:       *nextTokPtr = ptr + 2;
                   1655:       *encPtr = encodingTable[UTF_16LE_ENC];
                   1656:       return XML_TOK_BOM;
                   1657:     case 0xEFBB:
                   1658:       /* Maybe a UTF-8 BOM (EF BB BF) */
                   1659:       /* If there's an explicitly specified (external) encoding
                   1660:          of ISO-8859-1 or some flavour of UTF-16
                   1661:          and this is an external text entity,
                   1662:          don't look for the BOM,
                   1663:          because it might be legal data.
                   1664:       */
                   1665:       if (state == XML_CONTENT_STATE) {
                   1666:         int e = INIT_ENC_INDEX(enc);
                   1667:         if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
                   1668:             || e == UTF_16LE_ENC || e == UTF_16_ENC)
                   1669:           break;
                   1670:       }
                               /* the third BOM byte has not arrived yet */
                   1671:       if (ptr + 2 == end)
                   1672:         return XML_TOK_PARTIAL;
                   1673:       if ((unsigned char)ptr[2] == 0xBF) {
                   1674:         *nextTokPtr = ptr + 3;
                   1675:         *encPtr = encodingTable[UTF_8_ENC];
                   1676:         return XML_TOK_BOM;
                   1677:       }
                   1678:       break;
                   1679:     default:
                   1680:       if (ptr[0] == '\0') {
                   1681:         /* 0 isn't a legal data character. Furthermore a document
                   1682:            entity can only start with ASCII characters.  So the only
                   1683:            way this can fail to be big-endian UTF-16 is if it's an
                   1684:            external parsed general entity that's labelled as
                   1685:            UTF-16LE.
                   1686:         */
                   1687:         if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
                   1688:           break;
                   1689:         *encPtr = encodingTable[UTF_16BE_ENC];
                   1690:         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
                   1691:       }
                   1692:       else if (ptr[1] == '\0') {
                   1693:         /* We could recover here in the case:
                   1694:             - parsing an external entity
                   1695:             - second byte is 0
                   1696:             - no externally specified encoding
                   1697:             - no encoding declaration
                   1698:            by assuming UTF-16LE.  But we don't, because this would mean when
                   1699:            presented just with a single byte, we couldn't reliably determine
                   1700:            whether we needed further bytes.
                   1701:         */
                   1702:         if (state == XML_CONTENT_STATE)
                   1703:           break;
                   1704:         *encPtr = encodingTable[UTF_16LE_ENC];
                   1705:         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
                   1706:       }
                   1707:       break;
                   1708:     }
                   1709:   }
                           /* nothing was auto-detected: use the encoding whose index was
                              recorded with SET_INIT_ENC_INDEX */
                   1710:   *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
                   1711:   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
                   1712: }
                   1713:
                   1714:
                         /* First inclusion of xmltok_ns.c: NS() and ns() expand to their
                            argument unchanged.  Presumably the included file forms its
                            identifiers with these macros, so this pass yields the
                            un-suffixed set -- confirm in xmltok_ns.c.  XML_TOK_NS_C is
                            defined only for the duration of the inclusion. */
                   1715: #define NS(x) x
                   1716: #define ns(x) x
                   1717: #define XML_TOK_NS_C
                   1718: #include "xmltok_ns.c"
                   1719: #undef XML_TOK_NS_C
                   1720: #undef NS
                   1721: #undef ns
                   1722:
                         /* Only when XML_NS is defined: xmltok_ns.c is included a second
                            time with NS(x) pasting an "NS" suffix and ns(x) pasting an
                            "_ns" suffix onto x, followed by the NS variant of
                            XmlInitUnknownEncoding. */
                   1723: #ifdef XML_NS
                   1724:
                   1725: #define NS(x) x ## NS
                   1726: #define ns(x) x ## _ns
                   1727:
                   1728: #define XML_TOK_NS_C
                   1729: #include "xmltok_ns.c"
                   1730: #undef XML_TOK_NS_C
                   1731:
                   1732: #undef NS
                   1733: #undef ns
                   1734:
                         /* Same as XmlInitUnknownEncoding (same arguments, same result),
                            except that on success the byte type of ASCII_COLON is changed
                            to BT_COLON.  Returns 0 when XmlInitUnknownEncoding does. */
                   1735: ENCODING *
                   1736: XmlInitUnknownEncodingNS(void *mem,
                   1737:                          int *table,
1.1.1.2   spz      1738:                          CONVERTER convert,
1.1       tron     1739:                          void *userData)
                   1740: {
                   1741:   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
                   1742:   if (enc)
                             /* the cast presumably relies on the ENCODING being the first
                                member of struct normal_encoding -- confirm at its
                                definition */
                   1743:     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
                   1744:   return enc;
                   1745: }
                   1746:
                   1747: #endif /* XML_NS */

CVSweb <webmaster@jp.NetBSD.org>