Annotation of src/external/mit/expat/dist/lib/xmltok_impl.c, Revision 1.2.2.2
1.1 tron 1: /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2: See the file COPYING for copying permission.
3: */
4:
5: /* This file is included! */
6: #ifdef XML_TOK_IMPL_C
7:
/* Default used when the including file has not supplied its own
   IS_INVALID_CHAR: no n-byte sequence is ever reported as invalid. */
#if !defined(IS_INVALID_CHAR)
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif
11:
/* Switch case for the lead byte of an n-byte sequence in a context where
   only validity matters.  Relies on `enc` and `end` being in scope at the
   point of use.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes
   remain, XML_TOK_INVALID (with *nextTokPtr = ptr) when IS_INVALID_CHAR
   rejects the sequence, and otherwise advances ptr by n bytes. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  case BT_LEAD ## n: \
    if (end - ptr < n) { \
      return XML_TOK_PARTIAL_CHAR; \
    } \
    else if (IS_INVALID_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;
22:
/* Switch cases shared by every scanner that must reject bad input:
   BT_NONXML, BT_MALFORM and BT_TRAIL bytes are reported as
   XML_TOK_INVALID at ptr, and 2-, 3- and 4-byte lead bytes are handled
   by INVALID_LEAD_CASE.  Every case group ends in a return or break, so
   the order of the groups does not matter. */
#define INVALID_CASES(ptr, nextTokPtr) \
  case BT_TRAIL: \
  case BT_MALFORM: \
  case BT_NONXML: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID; \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr)
32:
/* Switch case for the lead byte of an n-byte sequence inside a name.
   Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain.  The
   sequence is rejected with XML_TOK_INVALID (*nextTokPtr = ptr) when it
   is malformed according to IS_INVALID_CHAR or is not a name character;
   the validity check comes first so that a malformed sequence is never
   handed to the name-character classifier.  Otherwise ptr advances by
   n bytes. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD ## n: \
    if ((end) - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NAME_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;
43:
/* Switch cases that consume one name character at ptr.  Multi-byte
   lead bytes go through CHECK_NAME_CASE.  A BT_NONASCII character must
   satisfy IS_NAME_CHAR_MINBPC or XML_TOK_INVALID is returned with
   *nextTokPtr = ptr.  The remaining name byte types are accepted
   directly and advance ptr by MINBPC(enc). */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    ptr += MINBPC(enc); \
    break;
60:
/* Switch case for the lead byte of an n-byte sequence at the start of a
   name.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain.
   The sequence is rejected with XML_TOK_INVALID (*nextTokPtr = ptr) when
   it is malformed according to IS_INVALID_CHAR or is not a name-start
   character; the validity check comes first so that a malformed sequence
   is never handed to the name-start classifier.  Otherwise ptr advances
   by n bytes. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD ## n: \
    if ((end) - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NMSTRT_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;
71:
/* Switch cases that consume one name-start character at ptr.
   Multi-byte lead bytes go through CHECK_NMSTRT_CASE.  A BT_NONASCII
   character must satisfy IS_NMSTRT_CHAR_MINBPC or XML_TOK_INVALID is
   returned with *nextTokPtr = ptr.  BT_NMSTRT and BT_HEX are accepted
   directly and advance ptr by MINBPC(enc). */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    ptr += MINBPC(enc); \
    break;
85:
/* Unless the including file supplies its own PREFIX, function names in
   this file are used unchanged. */
#if !defined(PREFIX)
#define PREFIX(ident) ident
#endif
89:
1.2.2.2 ! snj 90:
/* True when at least `count` characters of MINBPC(enc) bytes each lie
   between ptr and end.  Arguments and the whole expansion are
   parenthesized so that expression arguments such as `ptr + k` or
   `a + b` are compared as intended. */
#define HAS_CHARS(enc, ptr, end, count) \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one whole character lies between ptr and end. */
#define HAS_CHAR(enc, ptr, end) \
    HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless at least
   `count` characters are available.  Expands to a braced block and is
   meant to be used as a statement followed by ';'. */
#define REQUIRE_CHARS(enc, ptr, end, count) \
    { \
      if (! HAS_CHARS(enc, ptr, end, count)) { \
        return XML_TOK_PARTIAL; \
      } \
    }

/* Single-character form of REQUIRE_CHARS. */
#define REQUIRE_CHAR(enc, ptr, end) \
    REQUIRE_CHARS(enc, ptr, end, 1)
! 106:
! 107:
1.1 tron 108: /* ptr points to character following "<!-" */
109:
110: static int PTRCALL
111: PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
112: const char *end, const char **nextTokPtr)
113: {
1.2.2.2 ! snj 114: if (HAS_CHAR(enc, ptr, end)) {
1.1 tron 115: if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
116: *nextTokPtr = ptr;
117: return XML_TOK_INVALID;
118: }
119: ptr += MINBPC(enc);
1.2.2.2 ! snj 120: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 121: switch (BYTE_TYPE(enc, ptr)) {
122: INVALID_CASES(ptr, nextTokPtr)
123: case BT_MINUS:
1.2.2.2 ! snj 124: ptr += MINBPC(enc);
! 125: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 126: if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
1.2.2.2 ! snj 127: ptr += MINBPC(enc);
! 128: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 129: if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
130: *nextTokPtr = ptr;
131: return XML_TOK_INVALID;
132: }
133: *nextTokPtr = ptr + MINBPC(enc);
134: return XML_TOK_COMMENT;
135: }
136: break;
137: default:
138: ptr += MINBPC(enc);
139: break;
140: }
141: }
142: }
143: return XML_TOK_PARTIAL;
144: }
145:
146: /* ptr points to character following "<!" */
147:
148: static int PTRCALL
149: PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
150: const char *end, const char **nextTokPtr)
151: {
1.2.2.2 ! snj 152: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 153: switch (BYTE_TYPE(enc, ptr)) {
154: case BT_MINUS:
155: return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
156: case BT_LSQB:
157: *nextTokPtr = ptr + MINBPC(enc);
158: return XML_TOK_COND_SECT_OPEN;
159: case BT_NMSTRT:
160: case BT_HEX:
161: ptr += MINBPC(enc);
162: break;
163: default:
164: *nextTokPtr = ptr;
165: return XML_TOK_INVALID;
166: }
1.2.2.2 ! snj 167: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 168: switch (BYTE_TYPE(enc, ptr)) {
169: case BT_PERCNT:
1.2.2.2 ! snj 170: REQUIRE_CHARS(enc, ptr, end, 2);
1.1 tron 171: /* don't allow <!ENTITY% foo "whatever"> */
172: switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
173: case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
174: *nextTokPtr = ptr;
175: return XML_TOK_INVALID;
176: }
177: /* fall through */
178: case BT_S: case BT_CR: case BT_LF:
179: *nextTokPtr = ptr;
180: return XML_TOK_DECL_OPEN;
181: case BT_NMSTRT:
182: case BT_HEX:
183: ptr += MINBPC(enc);
184: break;
185: default:
186: *nextTokPtr = ptr;
187: return XML_TOK_INVALID;
188: }
189: }
190: return XML_TOK_PARTIAL;
191: }
192:
193: static int PTRCALL
1.2.2.2 ! snj 194: PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr,
1.1 tron 195: const char *end, int *tokPtr)
196: {
197: int upper = 0;
198: *tokPtr = XML_TOK_PI;
199: if (end - ptr != MINBPC(enc)*3)
200: return 1;
201: switch (BYTE_TO_ASCII(enc, ptr)) {
202: case ASCII_x:
203: break;
204: case ASCII_X:
205: upper = 1;
206: break;
207: default:
208: return 1;
209: }
210: ptr += MINBPC(enc);
211: switch (BYTE_TO_ASCII(enc, ptr)) {
212: case ASCII_m:
213: break;
214: case ASCII_M:
215: upper = 1;
216: break;
217: default:
218: return 1;
219: }
220: ptr += MINBPC(enc);
221: switch (BYTE_TO_ASCII(enc, ptr)) {
222: case ASCII_l:
223: break;
224: case ASCII_L:
225: upper = 1;
226: break;
227: default:
228: return 1;
229: }
230: if (upper)
231: return 0;
232: *tokPtr = XML_TOK_XML_DECL;
233: return 1;
234: }
235:
236: /* ptr points to character following "<?" */
237:
238: static int PTRCALL
239: PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
240: const char *end, const char **nextTokPtr)
241: {
242: int tok;
243: const char *target = ptr;
1.2.2.2 ! snj 244: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 245: switch (BYTE_TYPE(enc, ptr)) {
246: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
247: default:
248: *nextTokPtr = ptr;
249: return XML_TOK_INVALID;
250: }
1.2.2.2 ! snj 251: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 252: switch (BYTE_TYPE(enc, ptr)) {
253: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
254: case BT_S: case BT_CR: case BT_LF:
255: if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
256: *nextTokPtr = ptr;
257: return XML_TOK_INVALID;
258: }
259: ptr += MINBPC(enc);
1.2.2.2 ! snj 260: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 261: switch (BYTE_TYPE(enc, ptr)) {
262: INVALID_CASES(ptr, nextTokPtr)
263: case BT_QUEST:
264: ptr += MINBPC(enc);
1.2.2.2 ! snj 265: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 266: if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
267: *nextTokPtr = ptr + MINBPC(enc);
268: return tok;
269: }
270: break;
271: default:
272: ptr += MINBPC(enc);
273: break;
274: }
275: }
276: return XML_TOK_PARTIAL;
277: case BT_QUEST:
278: if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
279: *nextTokPtr = ptr;
280: return XML_TOK_INVALID;
281: }
282: ptr += MINBPC(enc);
1.2.2.2 ! snj 283: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 284: if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
285: *nextTokPtr = ptr + MINBPC(enc);
286: return tok;
287: }
288: /* fall through */
289: default:
290: *nextTokPtr = ptr;
291: return XML_TOK_INVALID;
292: }
293: }
294: return XML_TOK_PARTIAL;
295: }
296:
297: static int PTRCALL
1.2.2.2 ! snj 298: PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr,
1.1 tron 299: const char *end, const char **nextTokPtr)
300: {
301: static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
302: ASCII_T, ASCII_A, ASCII_LSQB };
303: int i;
304: /* CDATA[ */
1.2.2.2 ! snj 305: REQUIRE_CHARS(enc, ptr, end, 6);
1.1 tron 306: for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
307: if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
308: *nextTokPtr = ptr;
309: return XML_TOK_INVALID;
310: }
311: }
312: *nextTokPtr = ptr;
313: return XML_TOK_CDATA_SECT_OPEN;
314: }
315:
316: static int PTRCALL
317: PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
318: const char *end, const char **nextTokPtr)
319: {
1.2.2.1 snj 320: if (ptr >= end)
1.1 tron 321: return XML_TOK_NONE;
322: if (MINBPC(enc) > 1) {
323: size_t n = end - ptr;
324: if (n & (MINBPC(enc) - 1)) {
325: n &= ~(MINBPC(enc) - 1);
326: if (n == 0)
327: return XML_TOK_PARTIAL;
328: end = ptr + n;
329: }
330: }
331: switch (BYTE_TYPE(enc, ptr)) {
332: case BT_RSQB:
333: ptr += MINBPC(enc);
1.2.2.2 ! snj 334: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 335: if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
336: break;
337: ptr += MINBPC(enc);
1.2.2.2 ! snj 338: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 339: if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
340: ptr -= MINBPC(enc);
341: break;
342: }
343: *nextTokPtr = ptr + MINBPC(enc);
344: return XML_TOK_CDATA_SECT_CLOSE;
345: case BT_CR:
346: ptr += MINBPC(enc);
1.2.2.2 ! snj 347: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 348: if (BYTE_TYPE(enc, ptr) == BT_LF)
349: ptr += MINBPC(enc);
350: *nextTokPtr = ptr;
351: return XML_TOK_DATA_NEWLINE;
352: case BT_LF:
353: *nextTokPtr = ptr + MINBPC(enc);
354: return XML_TOK_DATA_NEWLINE;
355: INVALID_CASES(ptr, nextTokPtr)
356: default:
357: ptr += MINBPC(enc);
358: break;
359: }
1.2.2.2 ! snj 360: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 361: switch (BYTE_TYPE(enc, ptr)) {
362: #define LEAD_CASE(n) \
363: case BT_LEAD ## n: \
364: if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
365: *nextTokPtr = ptr; \
366: return XML_TOK_DATA_CHARS; \
367: } \
368: ptr += n; \
369: break;
370: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
371: #undef LEAD_CASE
372: case BT_NONXML:
373: case BT_MALFORM:
374: case BT_TRAIL:
375: case BT_CR:
376: case BT_LF:
377: case BT_RSQB:
378: *nextTokPtr = ptr;
379: return XML_TOK_DATA_CHARS;
380: default:
381: ptr += MINBPC(enc);
382: break;
383: }
384: }
385: *nextTokPtr = ptr;
386: return XML_TOK_DATA_CHARS;
387: }
388:
389: /* ptr points to character following "</" */
390:
391: static int PTRCALL
392: PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
393: const char *end, const char **nextTokPtr)
394: {
1.2.2.2 ! snj 395: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 396: switch (BYTE_TYPE(enc, ptr)) {
397: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
398: default:
399: *nextTokPtr = ptr;
400: return XML_TOK_INVALID;
401: }
1.2.2.2 ! snj 402: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 403: switch (BYTE_TYPE(enc, ptr)) {
404: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
405: case BT_S: case BT_CR: case BT_LF:
1.2.2.2 ! snj 406: for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1.1 tron 407: switch (BYTE_TYPE(enc, ptr)) {
408: case BT_S: case BT_CR: case BT_LF:
409: break;
410: case BT_GT:
411: *nextTokPtr = ptr + MINBPC(enc);
412: return XML_TOK_END_TAG;
413: default:
414: *nextTokPtr = ptr;
415: return XML_TOK_INVALID;
416: }
417: }
418: return XML_TOK_PARTIAL;
419: #ifdef XML_NS
420: case BT_COLON:
421: /* no need to check qname syntax here,
422: since end-tag must match exactly */
423: ptr += MINBPC(enc);
424: break;
425: #endif
426: case BT_GT:
427: *nextTokPtr = ptr + MINBPC(enc);
428: return XML_TOK_END_TAG;
429: default:
430: *nextTokPtr = ptr;
431: return XML_TOK_INVALID;
432: }
433: }
434: return XML_TOK_PARTIAL;
435: }
436:
437: /* ptr points to character following "&#X" */
438:
439: static int PTRCALL
440: PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
441: const char *end, const char **nextTokPtr)
442: {
1.2.2.2 ! snj 443: if (HAS_CHAR(enc, ptr, end)) {
1.1 tron 444: switch (BYTE_TYPE(enc, ptr)) {
445: case BT_DIGIT:
446: case BT_HEX:
447: break;
448: default:
449: *nextTokPtr = ptr;
450: return XML_TOK_INVALID;
451: }
1.2.2.2 ! snj 452: for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1.1 tron 453: switch (BYTE_TYPE(enc, ptr)) {
454: case BT_DIGIT:
455: case BT_HEX:
456: break;
457: case BT_SEMI:
458: *nextTokPtr = ptr + MINBPC(enc);
459: return XML_TOK_CHAR_REF;
460: default:
461: *nextTokPtr = ptr;
462: return XML_TOK_INVALID;
463: }
464: }
465: }
466: return XML_TOK_PARTIAL;
467: }
468:
469: /* ptr points to character following "&#" */
470:
471: static int PTRCALL
472: PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
473: const char *end, const char **nextTokPtr)
474: {
1.2.2.2 ! snj 475: if (HAS_CHAR(enc, ptr, end)) {
1.1 tron 476: if (CHAR_MATCHES(enc, ptr, ASCII_x))
477: return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
478: switch (BYTE_TYPE(enc, ptr)) {
479: case BT_DIGIT:
480: break;
481: default:
482: *nextTokPtr = ptr;
483: return XML_TOK_INVALID;
484: }
1.2.2.2 ! snj 485: for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1.1 tron 486: switch (BYTE_TYPE(enc, ptr)) {
487: case BT_DIGIT:
488: break;
489: case BT_SEMI:
490: *nextTokPtr = ptr + MINBPC(enc);
491: return XML_TOK_CHAR_REF;
492: default:
493: *nextTokPtr = ptr;
494: return XML_TOK_INVALID;
495: }
496: }
497: }
498: return XML_TOK_PARTIAL;
499: }
500:
501: /* ptr points to character following "&" */
502:
503: static int PTRCALL
504: PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
505: const char **nextTokPtr)
506: {
1.2.2.2 ! snj 507: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 508: switch (BYTE_TYPE(enc, ptr)) {
509: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
510: case BT_NUM:
511: return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
512: default:
513: *nextTokPtr = ptr;
514: return XML_TOK_INVALID;
515: }
1.2.2.2 ! snj 516: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 517: switch (BYTE_TYPE(enc, ptr)) {
518: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
519: case BT_SEMI:
520: *nextTokPtr = ptr + MINBPC(enc);
521: return XML_TOK_ENTITY_REF;
522: default:
523: *nextTokPtr = ptr;
524: return XML_TOK_INVALID;
525: }
526: }
527: return XML_TOK_PARTIAL;
528: }
529:
530: /* ptr points to character following first character of attribute name */
531:
532: static int PTRCALL
533: PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
534: const char **nextTokPtr)
535: {
536: #ifdef XML_NS
537: int hadColon = 0;
538: #endif
1.2.2.2 ! snj 539: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 540: switch (BYTE_TYPE(enc, ptr)) {
541: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
542: #ifdef XML_NS
543: case BT_COLON:
544: if (hadColon) {
545: *nextTokPtr = ptr;
546: return XML_TOK_INVALID;
547: }
548: hadColon = 1;
549: ptr += MINBPC(enc);
1.2.2.2 ! snj 550: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 551: switch (BYTE_TYPE(enc, ptr)) {
552: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
553: default:
554: *nextTokPtr = ptr;
555: return XML_TOK_INVALID;
556: }
557: break;
558: #endif
559: case BT_S: case BT_CR: case BT_LF:
560: for (;;) {
561: int t;
562:
563: ptr += MINBPC(enc);
1.2.2.2 ! snj 564: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 565: t = BYTE_TYPE(enc, ptr);
566: if (t == BT_EQUALS)
567: break;
568: switch (t) {
569: case BT_S:
570: case BT_LF:
571: case BT_CR:
572: break;
573: default:
574: *nextTokPtr = ptr;
575: return XML_TOK_INVALID;
576: }
577: }
578: /* fall through */
579: case BT_EQUALS:
580: {
581: int open;
582: #ifdef XML_NS
583: hadColon = 0;
584: #endif
585: for (;;) {
586: ptr += MINBPC(enc);
1.2.2.2 ! snj 587: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 588: open = BYTE_TYPE(enc, ptr);
589: if (open == BT_QUOT || open == BT_APOS)
590: break;
591: switch (open) {
592: case BT_S:
593: case BT_LF:
594: case BT_CR:
595: break;
596: default:
597: *nextTokPtr = ptr;
598: return XML_TOK_INVALID;
599: }
600: }
601: ptr += MINBPC(enc);
602: /* in attribute value */
603: for (;;) {
604: int t;
1.2.2.2 ! snj 605: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 606: t = BYTE_TYPE(enc, ptr);
607: if (t == open)
608: break;
609: switch (t) {
610: INVALID_CASES(ptr, nextTokPtr)
611: case BT_AMP:
612: {
613: int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
614: if (tok <= 0) {
615: if (tok == XML_TOK_INVALID)
616: *nextTokPtr = ptr;
617: return tok;
618: }
619: break;
620: }
621: case BT_LT:
622: *nextTokPtr = ptr;
623: return XML_TOK_INVALID;
624: default:
625: ptr += MINBPC(enc);
626: break;
627: }
628: }
629: ptr += MINBPC(enc);
1.2.2.2 ! snj 630: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 631: switch (BYTE_TYPE(enc, ptr)) {
632: case BT_S:
633: case BT_CR:
634: case BT_LF:
635: break;
636: case BT_SOL:
637: goto sol;
638: case BT_GT:
639: goto gt;
640: default:
641: *nextTokPtr = ptr;
642: return XML_TOK_INVALID;
643: }
644: /* ptr points to closing quote */
645: for (;;) {
646: ptr += MINBPC(enc);
1.2.2.2 ! snj 647: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 648: switch (BYTE_TYPE(enc, ptr)) {
649: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
650: case BT_S: case BT_CR: case BT_LF:
651: continue;
652: case BT_GT:
653: gt:
654: *nextTokPtr = ptr + MINBPC(enc);
655: return XML_TOK_START_TAG_WITH_ATTS;
656: case BT_SOL:
657: sol:
658: ptr += MINBPC(enc);
1.2.2.2 ! snj 659: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 660: if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
661: *nextTokPtr = ptr;
662: return XML_TOK_INVALID;
663: }
664: *nextTokPtr = ptr + MINBPC(enc);
665: return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
666: default:
667: *nextTokPtr = ptr;
668: return XML_TOK_INVALID;
669: }
670: break;
671: }
672: break;
673: }
674: default:
675: *nextTokPtr = ptr;
676: return XML_TOK_INVALID;
677: }
678: }
679: return XML_TOK_PARTIAL;
680: }
681:
682: /* ptr points to character following "<" */
683:
684: static int PTRCALL
685: PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
686: const char **nextTokPtr)
687: {
688: #ifdef XML_NS
689: int hadColon;
690: #endif
1.2.2.2 ! snj 691: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 692: switch (BYTE_TYPE(enc, ptr)) {
693: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
694: case BT_EXCL:
1.2.2.2 ! snj 695: ptr += MINBPC(enc);
! 696: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 697: switch (BYTE_TYPE(enc, ptr)) {
698: case BT_MINUS:
699: return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
700: case BT_LSQB:
701: return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
702: end, nextTokPtr);
703: }
704: *nextTokPtr = ptr;
705: return XML_TOK_INVALID;
706: case BT_QUEST:
707: return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
708: case BT_SOL:
709: return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
710: default:
711: *nextTokPtr = ptr;
712: return XML_TOK_INVALID;
713: }
714: #ifdef XML_NS
715: hadColon = 0;
716: #endif
717: /* we have a start-tag */
1.2.2.2 ! snj 718: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 719: switch (BYTE_TYPE(enc, ptr)) {
720: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
721: #ifdef XML_NS
722: case BT_COLON:
723: if (hadColon) {
724: *nextTokPtr = ptr;
725: return XML_TOK_INVALID;
726: }
727: hadColon = 1;
728: ptr += MINBPC(enc);
1.2.2.2 ! snj 729: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 730: switch (BYTE_TYPE(enc, ptr)) {
731: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
732: default:
733: *nextTokPtr = ptr;
734: return XML_TOK_INVALID;
735: }
736: break;
737: #endif
738: case BT_S: case BT_CR: case BT_LF:
739: {
740: ptr += MINBPC(enc);
1.2.2.2 ! snj 741: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 742: switch (BYTE_TYPE(enc, ptr)) {
743: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
744: case BT_GT:
745: goto gt;
746: case BT_SOL:
747: goto sol;
748: case BT_S: case BT_CR: case BT_LF:
749: ptr += MINBPC(enc);
750: continue;
751: default:
752: *nextTokPtr = ptr;
753: return XML_TOK_INVALID;
754: }
755: return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
756: }
757: return XML_TOK_PARTIAL;
758: }
759: case BT_GT:
760: gt:
761: *nextTokPtr = ptr + MINBPC(enc);
762: return XML_TOK_START_TAG_NO_ATTS;
763: case BT_SOL:
764: sol:
765: ptr += MINBPC(enc);
1.2.2.2 ! snj 766: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 767: if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
768: *nextTokPtr = ptr;
769: return XML_TOK_INVALID;
770: }
771: *nextTokPtr = ptr + MINBPC(enc);
772: return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
773: default:
774: *nextTokPtr = ptr;
775: return XML_TOK_INVALID;
776: }
777: }
778: return XML_TOK_PARTIAL;
779: }
780:
781: static int PTRCALL
782: PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
783: const char **nextTokPtr)
784: {
1.2.2.1 snj 785: if (ptr >= end)
1.1 tron 786: return XML_TOK_NONE;
787: if (MINBPC(enc) > 1) {
788: size_t n = end - ptr;
789: if (n & (MINBPC(enc) - 1)) {
790: n &= ~(MINBPC(enc) - 1);
791: if (n == 0)
792: return XML_TOK_PARTIAL;
793: end = ptr + n;
794: }
795: }
796: switch (BYTE_TYPE(enc, ptr)) {
797: case BT_LT:
798: return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
799: case BT_AMP:
800: return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
801: case BT_CR:
802: ptr += MINBPC(enc);
1.2.2.2 ! snj 803: if (! HAS_CHAR(enc, ptr, end))
1.1 tron 804: return XML_TOK_TRAILING_CR;
805: if (BYTE_TYPE(enc, ptr) == BT_LF)
806: ptr += MINBPC(enc);
807: *nextTokPtr = ptr;
808: return XML_TOK_DATA_NEWLINE;
809: case BT_LF:
810: *nextTokPtr = ptr + MINBPC(enc);
811: return XML_TOK_DATA_NEWLINE;
812: case BT_RSQB:
813: ptr += MINBPC(enc);
1.2.2.2 ! snj 814: if (! HAS_CHAR(enc, ptr, end))
1.1 tron 815: return XML_TOK_TRAILING_RSQB;
816: if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
817: break;
818: ptr += MINBPC(enc);
1.2.2.2 ! snj 819: if (! HAS_CHAR(enc, ptr, end))
1.1 tron 820: return XML_TOK_TRAILING_RSQB;
821: if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
822: ptr -= MINBPC(enc);
823: break;
824: }
825: *nextTokPtr = ptr;
826: return XML_TOK_INVALID;
827: INVALID_CASES(ptr, nextTokPtr)
828: default:
829: ptr += MINBPC(enc);
830: break;
831: }
1.2.2.2 ! snj 832: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 833: switch (BYTE_TYPE(enc, ptr)) {
834: #define LEAD_CASE(n) \
835: case BT_LEAD ## n: \
836: if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
837: *nextTokPtr = ptr; \
838: return XML_TOK_DATA_CHARS; \
839: } \
840: ptr += n; \
841: break;
842: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
843: #undef LEAD_CASE
844: case BT_RSQB:
1.2.2.2 ! snj 845: if (HAS_CHARS(enc, ptr, end, 2)) {
1.1 tron 846: if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
847: ptr += MINBPC(enc);
848: break;
849: }
1.2.2.2 ! snj 850: if (HAS_CHARS(enc, ptr, end, 3)) {
1.1 tron 851: if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
852: ptr += MINBPC(enc);
853: break;
854: }
855: *nextTokPtr = ptr + 2*MINBPC(enc);
856: return XML_TOK_INVALID;
857: }
858: }
859: /* fall through */
860: case BT_AMP:
861: case BT_LT:
862: case BT_NONXML:
863: case BT_MALFORM:
864: case BT_TRAIL:
865: case BT_CR:
866: case BT_LF:
867: *nextTokPtr = ptr;
868: return XML_TOK_DATA_CHARS;
869: default:
870: ptr += MINBPC(enc);
871: break;
872: }
873: }
874: *nextTokPtr = ptr;
875: return XML_TOK_DATA_CHARS;
876: }
877:
878: /* ptr points to character following "%" */
879:
880: static int PTRCALL
881: PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
882: const char **nextTokPtr)
883: {
1.2.2.2 ! snj 884: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 885: switch (BYTE_TYPE(enc, ptr)) {
886: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
887: case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
888: *nextTokPtr = ptr;
889: return XML_TOK_PERCENT;
890: default:
891: *nextTokPtr = ptr;
892: return XML_TOK_INVALID;
893: }
1.2.2.2 ! snj 894: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 895: switch (BYTE_TYPE(enc, ptr)) {
896: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
897: case BT_SEMI:
898: *nextTokPtr = ptr + MINBPC(enc);
899: return XML_TOK_PARAM_ENTITY_REF;
900: default:
901: *nextTokPtr = ptr;
902: return XML_TOK_INVALID;
903: }
904: }
905: return XML_TOK_PARTIAL;
906: }
907:
908: static int PTRCALL
909: PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
910: const char **nextTokPtr)
911: {
1.2.2.2 ! snj 912: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 913: switch (BYTE_TYPE(enc, ptr)) {
914: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
915: default:
916: *nextTokPtr = ptr;
917: return XML_TOK_INVALID;
918: }
1.2.2.2 ! snj 919: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 920: switch (BYTE_TYPE(enc, ptr)) {
921: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
922: case BT_CR: case BT_LF: case BT_S:
923: case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
924: *nextTokPtr = ptr;
925: return XML_TOK_POUND_NAME;
926: default:
927: *nextTokPtr = ptr;
928: return XML_TOK_INVALID;
929: }
930: }
931: return -XML_TOK_POUND_NAME;
932: }
933:
934: static int PTRCALL
935: PREFIX(scanLit)(int open, const ENCODING *enc,
936: const char *ptr, const char *end,
937: const char **nextTokPtr)
938: {
1.2.2.2 ! snj 939: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 940: int t = BYTE_TYPE(enc, ptr);
941: switch (t) {
942: INVALID_CASES(ptr, nextTokPtr)
943: case BT_QUOT:
944: case BT_APOS:
945: ptr += MINBPC(enc);
946: if (t != open)
947: break;
1.2.2.2 ! snj 948: if (! HAS_CHAR(enc, ptr, end))
1.1 tron 949: return -XML_TOK_LITERAL;
950: *nextTokPtr = ptr;
951: switch (BYTE_TYPE(enc, ptr)) {
952: case BT_S: case BT_CR: case BT_LF:
953: case BT_GT: case BT_PERCNT: case BT_LSQB:
954: return XML_TOK_LITERAL;
955: default:
956: return XML_TOK_INVALID;
957: }
958: default:
959: ptr += MINBPC(enc);
960: break;
961: }
962: }
963: return XML_TOK_PARTIAL;
964: }
965:
966: static int PTRCALL
967: PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
968: const char **nextTokPtr)
969: {
970: int tok;
1.2.2.1 snj 971: if (ptr >= end)
1.1 tron 972: return XML_TOK_NONE;
973: if (MINBPC(enc) > 1) {
974: size_t n = end - ptr;
975: if (n & (MINBPC(enc) - 1)) {
976: n &= ~(MINBPC(enc) - 1);
977: if (n == 0)
978: return XML_TOK_PARTIAL;
979: end = ptr + n;
980: }
981: }
982: switch (BYTE_TYPE(enc, ptr)) {
983: case BT_QUOT:
984: return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
985: case BT_APOS:
986: return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
987: case BT_LT:
988: {
989: ptr += MINBPC(enc);
1.2.2.2 ! snj 990: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 991: switch (BYTE_TYPE(enc, ptr)) {
992: case BT_EXCL:
993: return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
994: case BT_QUEST:
995: return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
996: case BT_NMSTRT:
997: case BT_HEX:
998: case BT_NONASCII:
999: case BT_LEAD2:
1000: case BT_LEAD3:
1001: case BT_LEAD4:
1002: *nextTokPtr = ptr - MINBPC(enc);
1003: return XML_TOK_INSTANCE_START;
1004: }
1005: *nextTokPtr = ptr;
1006: return XML_TOK_INVALID;
1007: }
1008: case BT_CR:
1009: if (ptr + MINBPC(enc) == end) {
1010: *nextTokPtr = end;
1011: /* indicate that this might be part of a CR/LF pair */
1012: return -XML_TOK_PROLOG_S;
1013: }
1014: /* fall through */
1015: case BT_S: case BT_LF:
1016: for (;;) {
1017: ptr += MINBPC(enc);
1.2.2.2 ! snj 1018: if (! HAS_CHAR(enc, ptr, end))
1.1 tron 1019: break;
1020: switch (BYTE_TYPE(enc, ptr)) {
1021: case BT_S: case BT_LF:
1022: break;
1023: case BT_CR:
1024: /* don't split CR/LF pair */
1025: if (ptr + MINBPC(enc) != end)
1026: break;
1027: /* fall through */
1028: default:
1029: *nextTokPtr = ptr;
1030: return XML_TOK_PROLOG_S;
1031: }
1032: }
1033: *nextTokPtr = ptr;
1034: return XML_TOK_PROLOG_S;
1035: case BT_PERCNT:
1036: return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1037: case BT_COMMA:
1038: *nextTokPtr = ptr + MINBPC(enc);
1039: return XML_TOK_COMMA;
1040: case BT_LSQB:
1041: *nextTokPtr = ptr + MINBPC(enc);
1042: return XML_TOK_OPEN_BRACKET;
1043: case BT_RSQB:
1044: ptr += MINBPC(enc);
1.2.2.2 ! snj 1045: if (! HAS_CHAR(enc, ptr, end))
1.1 tron 1046: return -XML_TOK_CLOSE_BRACKET;
1047: if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1.2.2.2 ! snj 1048: REQUIRE_CHARS(enc, ptr, end, 2);
1.1 tron 1049: if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1050: *nextTokPtr = ptr + 2*MINBPC(enc);
1051: return XML_TOK_COND_SECT_CLOSE;
1052: }
1053: }
1054: *nextTokPtr = ptr;
1055: return XML_TOK_CLOSE_BRACKET;
1056: case BT_LPAR:
1057: *nextTokPtr = ptr + MINBPC(enc);
1058: return XML_TOK_OPEN_PAREN;
1059: case BT_RPAR:
1060: ptr += MINBPC(enc);
1.2.2.2 ! snj 1061: if (! HAS_CHAR(enc, ptr, end))
1.1 tron 1062: return -XML_TOK_CLOSE_PAREN;
1063: switch (BYTE_TYPE(enc, ptr)) {
1064: case BT_AST:
1065: *nextTokPtr = ptr + MINBPC(enc);
1066: return XML_TOK_CLOSE_PAREN_ASTERISK;
1067: case BT_QUEST:
1068: *nextTokPtr = ptr + MINBPC(enc);
1069: return XML_TOK_CLOSE_PAREN_QUESTION;
1070: case BT_PLUS:
1071: *nextTokPtr = ptr + MINBPC(enc);
1072: return XML_TOK_CLOSE_PAREN_PLUS;
1073: case BT_CR: case BT_LF: case BT_S:
1074: case BT_GT: case BT_COMMA: case BT_VERBAR:
1075: case BT_RPAR:
1076: *nextTokPtr = ptr;
1077: return XML_TOK_CLOSE_PAREN;
1078: }
1079: *nextTokPtr = ptr;
1080: return XML_TOK_INVALID;
1081: case BT_VERBAR:
1082: *nextTokPtr = ptr + MINBPC(enc);
1083: return XML_TOK_OR;
1084: case BT_GT:
1085: *nextTokPtr = ptr + MINBPC(enc);
1086: return XML_TOK_DECL_CLOSE;
1087: case BT_NUM:
1088: return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1089: #define LEAD_CASE(n) \
1090: case BT_LEAD ## n: \
1091: if (end - ptr < n) \
1092: return XML_TOK_PARTIAL_CHAR; \
1093: if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1094: ptr += n; \
1095: tok = XML_TOK_NAME; \
1096: break; \
1097: } \
1098: if (IS_NAME_CHAR(enc, ptr, n)) { \
1099: ptr += n; \
1100: tok = XML_TOK_NMTOKEN; \
1101: break; \
1102: } \
1103: *nextTokPtr = ptr; \
1104: return XML_TOK_INVALID;
1105: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1106: #undef LEAD_CASE
1107: case BT_NMSTRT:
1108: case BT_HEX:
1109: tok = XML_TOK_NAME;
1110: ptr += MINBPC(enc);
1111: break;
1112: case BT_DIGIT:
1113: case BT_NAME:
1114: case BT_MINUS:
1115: #ifdef XML_NS
1116: case BT_COLON:
1117: #endif
1118: tok = XML_TOK_NMTOKEN;
1119: ptr += MINBPC(enc);
1120: break;
1121: case BT_NONASCII:
1122: if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1123: ptr += MINBPC(enc);
1124: tok = XML_TOK_NAME;
1125: break;
1126: }
1127: if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1128: ptr += MINBPC(enc);
1129: tok = XML_TOK_NMTOKEN;
1130: break;
1131: }
1132: /* fall through */
1133: default:
1134: *nextTokPtr = ptr;
1135: return XML_TOK_INVALID;
1136: }
1.2.2.2 ! snj 1137: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 1138: switch (BYTE_TYPE(enc, ptr)) {
1139: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1140: case BT_GT: case BT_RPAR: case BT_COMMA:
1141: case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1142: case BT_S: case BT_CR: case BT_LF:
1143: *nextTokPtr = ptr;
1144: return tok;
1145: #ifdef XML_NS
1146: case BT_COLON:
1147: ptr += MINBPC(enc);
1148: switch (tok) {
1149: case XML_TOK_NAME:
1.2.2.2 ! snj 1150: REQUIRE_CHAR(enc, ptr, end);
1.1 tron 1151: tok = XML_TOK_PREFIXED_NAME;
1152: switch (BYTE_TYPE(enc, ptr)) {
1153: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1154: default:
1155: tok = XML_TOK_NMTOKEN;
1156: break;
1157: }
1158: break;
1159: case XML_TOK_PREFIXED_NAME:
1160: tok = XML_TOK_NMTOKEN;
1161: break;
1162: }
1163: break;
1164: #endif
1165: case BT_PLUS:
1166: if (tok == XML_TOK_NMTOKEN) {
1167: *nextTokPtr = ptr;
1168: return XML_TOK_INVALID;
1169: }
1170: *nextTokPtr = ptr + MINBPC(enc);
1171: return XML_TOK_NAME_PLUS;
1172: case BT_AST:
1173: if (tok == XML_TOK_NMTOKEN) {
1174: *nextTokPtr = ptr;
1175: return XML_TOK_INVALID;
1176: }
1177: *nextTokPtr = ptr + MINBPC(enc);
1178: return XML_TOK_NAME_ASTERISK;
1179: case BT_QUEST:
1180: if (tok == XML_TOK_NMTOKEN) {
1181: *nextTokPtr = ptr;
1182: return XML_TOK_INVALID;
1183: }
1184: *nextTokPtr = ptr + MINBPC(enc);
1185: return XML_TOK_NAME_QUESTION;
1186: default:
1187: *nextTokPtr = ptr;
1188: return XML_TOK_INVALID;
1189: }
1190: }
1191: return -tok;
1192: }
1193:
1194: static int PTRCALL
1195: PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1196: const char *end, const char **nextTokPtr)
1197: {
1198: const char *start;
1.2.2.1 snj 1199: if (ptr >= end)
1.1 tron 1200: return XML_TOK_NONE;
1.2.2.2 ! snj 1201: else if (! HAS_CHAR(enc, ptr, end))
! 1202: return XML_TOK_PARTIAL;
1.1 tron 1203: start = ptr;
1.2.2.2 ! snj 1204: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 1205: switch (BYTE_TYPE(enc, ptr)) {
1206: #define LEAD_CASE(n) \
1207: case BT_LEAD ## n: ptr += n; break;
1208: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1209: #undef LEAD_CASE
1210: case BT_AMP:
1211: if (ptr == start)
1212: return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1213: *nextTokPtr = ptr;
1214: return XML_TOK_DATA_CHARS;
1215: case BT_LT:
1216: /* this is for inside entity references */
1217: *nextTokPtr = ptr;
1218: return XML_TOK_INVALID;
1219: case BT_LF:
1220: if (ptr == start) {
1221: *nextTokPtr = ptr + MINBPC(enc);
1222: return XML_TOK_DATA_NEWLINE;
1223: }
1224: *nextTokPtr = ptr;
1225: return XML_TOK_DATA_CHARS;
1226: case BT_CR:
1227: if (ptr == start) {
1228: ptr += MINBPC(enc);
1.2.2.2 ! snj 1229: if (! HAS_CHAR(enc, ptr, end))
1.1 tron 1230: return XML_TOK_TRAILING_CR;
1231: if (BYTE_TYPE(enc, ptr) == BT_LF)
1232: ptr += MINBPC(enc);
1233: *nextTokPtr = ptr;
1234: return XML_TOK_DATA_NEWLINE;
1235: }
1236: *nextTokPtr = ptr;
1237: return XML_TOK_DATA_CHARS;
1238: case BT_S:
1239: if (ptr == start) {
1240: *nextTokPtr = ptr + MINBPC(enc);
1241: return XML_TOK_ATTRIBUTE_VALUE_S;
1242: }
1243: *nextTokPtr = ptr;
1244: return XML_TOK_DATA_CHARS;
1245: default:
1246: ptr += MINBPC(enc);
1247: break;
1248: }
1249: }
1250: *nextTokPtr = ptr;
1251: return XML_TOK_DATA_CHARS;
1252: }
1253:
1254: static int PTRCALL
1255: PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1256: const char *end, const char **nextTokPtr)
1257: {
1258: const char *start;
1.2.2.1 snj 1259: if (ptr >= end)
1.1 tron 1260: return XML_TOK_NONE;
1.2.2.2 ! snj 1261: else if (! HAS_CHAR(enc, ptr, end))
! 1262: return XML_TOK_PARTIAL;
1.1 tron 1263: start = ptr;
1.2.2.2 ! snj 1264: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 1265: switch (BYTE_TYPE(enc, ptr)) {
1266: #define LEAD_CASE(n) \
1267: case BT_LEAD ## n: ptr += n; break;
1268: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1269: #undef LEAD_CASE
1270: case BT_AMP:
1271: if (ptr == start)
1272: return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1273: *nextTokPtr = ptr;
1274: return XML_TOK_DATA_CHARS;
1275: case BT_PERCNT:
1276: if (ptr == start) {
1277: int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1278: end, nextTokPtr);
1279: return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1280: }
1281: *nextTokPtr = ptr;
1282: return XML_TOK_DATA_CHARS;
1283: case BT_LF:
1284: if (ptr == start) {
1285: *nextTokPtr = ptr + MINBPC(enc);
1286: return XML_TOK_DATA_NEWLINE;
1287: }
1288: *nextTokPtr = ptr;
1289: return XML_TOK_DATA_CHARS;
1290: case BT_CR:
1291: if (ptr == start) {
1292: ptr += MINBPC(enc);
1.2.2.2 ! snj 1293: if (! HAS_CHAR(enc, ptr, end))
1.1 tron 1294: return XML_TOK_TRAILING_CR;
1295: if (BYTE_TYPE(enc, ptr) == BT_LF)
1296: ptr += MINBPC(enc);
1297: *nextTokPtr = ptr;
1298: return XML_TOK_DATA_NEWLINE;
1299: }
1300: *nextTokPtr = ptr;
1301: return XML_TOK_DATA_CHARS;
1302: default:
1303: ptr += MINBPC(enc);
1304: break;
1305: }
1306: }
1307: *nextTokPtr = ptr;
1308: return XML_TOK_DATA_CHARS;
1309: }
1310:
#ifdef XML_DTD

/* Scans forward through an ignored section, tracking nesting.

   The sequence BT_LT, ASCII_EXCL, ASCII_LSQB opens a nested section;
   the sequence BT_RSQB, ASCII_RSQB, ASCII_GT closes one.  When a close
   is seen at nesting depth 0, *nextTokPtr is set just past it and
   XML_TOK_IGNORE_SECT is returned.  XML_TOK_PARTIAL is returned when
   the input runs out first; INVALID_CASES reports invalid bytes.
*/
static int PTRCALL
PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
                         const char *end, const char **nextTokPtr)
{
  int depth = 0;

  if (MINBPC(enc) > 1) {
    /* Trim end so the range holds a whole number of code units. */
    size_t avail = end - ptr;
    if (avail & (MINBPC(enc) - 1)) {
      avail &= ~(MINBPC(enc) - 1);
      end = ptr + avail;
    }
  }

  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
    INVALID_CASES(ptr, nextTokPtr)
    case BT_LT:
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (! CHAR_MATCHES(enc, ptr, ASCII_EXCL))
        break;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (! CHAR_MATCHES(enc, ptr, ASCII_LSQB))
        break;
      /* Complete opener: one level deeper. */
      ++depth;
      ptr += MINBPC(enc);
      break;
    case BT_RSQB:
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
        break;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (! CHAR_MATCHES(enc, ptr, ASCII_GT))
        break;
      /* Complete closer: either the end of the section or one level up. */
      ptr += MINBPC(enc);
      if (depth == 0) {
        *nextTokPtr = ptr;
        return XML_TOK_IGNORE_SECT;
      }
      --depth;
      break;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  return XML_TOK_PARTIAL;
}

#endif /* XML_DTD */
1365:
1366: static int PTRCALL
1367: PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1368: const char **badPtr)
1369: {
1370: ptr += MINBPC(enc);
1371: end -= MINBPC(enc);
1.2.2.2 ! snj 1372: for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1.1 tron 1373: switch (BYTE_TYPE(enc, ptr)) {
1374: case BT_DIGIT:
1375: case BT_HEX:
1376: case BT_MINUS:
1377: case BT_APOS:
1378: case BT_LPAR:
1379: case BT_RPAR:
1380: case BT_PLUS:
1381: case BT_COMMA:
1382: case BT_SOL:
1383: case BT_EQUALS:
1384: case BT_QUEST:
1385: case BT_CR:
1386: case BT_LF:
1387: case BT_SEMI:
1388: case BT_EXCL:
1389: case BT_AST:
1390: case BT_PERCNT:
1391: case BT_NUM:
1392: #ifdef XML_NS
1393: case BT_COLON:
1394: #endif
1395: break;
1396: case BT_S:
1397: if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1398: *badPtr = ptr;
1399: return 0;
1400: }
1401: break;
1402: case BT_NAME:
1403: case BT_NMSTRT:
1404: if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1405: break;
1406: default:
1407: switch (BYTE_TO_ASCII(enc, ptr)) {
1408: case 0x24: /* $ */
1409: case 0x40: /* @ */
1410: break;
1411: default:
1412: *badPtr = ptr;
1413: return 0;
1414: }
1415: break;
1416: }
1417: }
1418: return 1;
1419: }
1420:
1421: /* This must only be called for a well-formed start-tag or empty
1422: element tag. Returns the number of attributes. Pointers to the
1423: first attsMax attributes are stored in atts.
1424: */
1425:
1426: static int PTRCALL
1427: PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1428: int attsMax, ATTRIBUTE *atts)
1429: {
1430: enum { other, inName, inValue } state = inName;
1431: int nAtts = 0;
1432: int open = 0; /* defined when state == inValue;
1433: initialization just to shut up compilers */
1434:
1435: for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1436: switch (BYTE_TYPE(enc, ptr)) {
1437: #define START_NAME \
1438: if (state == other) { \
1439: if (nAtts < attsMax) { \
1440: atts[nAtts].name = ptr; \
1441: atts[nAtts].normalized = 1; \
1442: } \
1443: state = inName; \
1444: }
1445: #define LEAD_CASE(n) \
1446: case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1447: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1448: #undef LEAD_CASE
1449: case BT_NONASCII:
1450: case BT_NMSTRT:
1451: case BT_HEX:
1452: START_NAME
1453: break;
1454: #undef START_NAME
1455: case BT_QUOT:
1456: if (state != inValue) {
1457: if (nAtts < attsMax)
1458: atts[nAtts].valuePtr = ptr + MINBPC(enc);
1459: state = inValue;
1460: open = BT_QUOT;
1461: }
1462: else if (open == BT_QUOT) {
1463: state = other;
1464: if (nAtts < attsMax)
1465: atts[nAtts].valueEnd = ptr;
1466: nAtts++;
1467: }
1468: break;
1469: case BT_APOS:
1470: if (state != inValue) {
1471: if (nAtts < attsMax)
1472: atts[nAtts].valuePtr = ptr + MINBPC(enc);
1473: state = inValue;
1474: open = BT_APOS;
1475: }
1476: else if (open == BT_APOS) {
1477: state = other;
1478: if (nAtts < attsMax)
1479: atts[nAtts].valueEnd = ptr;
1480: nAtts++;
1481: }
1482: break;
1483: case BT_AMP:
1484: if (nAtts < attsMax)
1485: atts[nAtts].normalized = 0;
1486: break;
1487: case BT_S:
1488: if (state == inName)
1489: state = other;
1490: else if (state == inValue
1491: && nAtts < attsMax
1492: && atts[nAtts].normalized
1493: && (ptr == atts[nAtts].valuePtr
1494: || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1495: || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1496: || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1497: atts[nAtts].normalized = 0;
1498: break;
1499: case BT_CR: case BT_LF:
1500: /* This case ensures that the first attribute name is counted
1501: Apart from that we could just change state on the quote. */
1502: if (state == inName)
1503: state = other;
1504: else if (state == inValue && nAtts < attsMax)
1505: atts[nAtts].normalized = 0;
1506: break;
1507: case BT_GT:
1508: case BT_SOL:
1509: if (state != inValue)
1510: return nAtts;
1511: break;
1512: default:
1513: break;
1514: }
1515: }
1516: /* not reached */
1517: }
1518:
1519: static int PTRFASTCALL
1.2.2.2 ! snj 1520: PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr)
1.1 tron 1521: {
1522: int result = 0;
1523: /* skip &# */
1524: ptr += 2*MINBPC(enc);
1525: if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1526: for (ptr += MINBPC(enc);
1527: !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1528: ptr += MINBPC(enc)) {
1529: int c = BYTE_TO_ASCII(enc, ptr);
1530: switch (c) {
1531: case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1532: case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1533: result <<= 4;
1534: result |= (c - ASCII_0);
1535: break;
1536: case ASCII_A: case ASCII_B: case ASCII_C:
1537: case ASCII_D: case ASCII_E: case ASCII_F:
1538: result <<= 4;
1539: result += 10 + (c - ASCII_A);
1540: break;
1541: case ASCII_a: case ASCII_b: case ASCII_c:
1542: case ASCII_d: case ASCII_e: case ASCII_f:
1543: result <<= 4;
1544: result += 10 + (c - ASCII_a);
1545: break;
1546: }
1547: if (result >= 0x110000)
1548: return -1;
1549: }
1550: }
1551: else {
1552: for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1553: int c = BYTE_TO_ASCII(enc, ptr);
1554: result *= 10;
1555: result += (c - ASCII_0);
1556: if (result >= 0x110000)
1557: return -1;
1558: }
1559: }
1560: return checkCharRefNumber(result);
1561: }
1562:
1563: static int PTRCALL
1.2.2.2 ! snj 1564: PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr,
1.1 tron 1565: const char *end)
1566: {
1567: switch ((end - ptr)/MINBPC(enc)) {
1568: case 2:
1569: if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1570: switch (BYTE_TO_ASCII(enc, ptr)) {
1571: case ASCII_l:
1572: return ASCII_LT;
1573: case ASCII_g:
1574: return ASCII_GT;
1575: }
1576: }
1577: break;
1578: case 3:
1579: if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1580: ptr += MINBPC(enc);
1581: if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1582: ptr += MINBPC(enc);
1583: if (CHAR_MATCHES(enc, ptr, ASCII_p))
1584: return ASCII_AMP;
1585: }
1586: }
1587: break;
1588: case 4:
1589: switch (BYTE_TO_ASCII(enc, ptr)) {
1590: case ASCII_q:
1591: ptr += MINBPC(enc);
1592: if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1593: ptr += MINBPC(enc);
1594: if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1595: ptr += MINBPC(enc);
1596: if (CHAR_MATCHES(enc, ptr, ASCII_t))
1597: return ASCII_QUOT;
1598: }
1599: }
1600: break;
1601: case ASCII_a:
1602: ptr += MINBPC(enc);
1603: if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1604: ptr += MINBPC(enc);
1605: if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1606: ptr += MINBPC(enc);
1607: if (CHAR_MATCHES(enc, ptr, ASCII_s))
1608: return ASCII_APOS;
1609: }
1610: }
1611: break;
1612: }
1613: }
1614: return 0;
1615: }
1616:
1617: static int PTRCALL
1618: PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1619: {
1620: for (;;) {
1621: switch (BYTE_TYPE(enc, ptr1)) {
1622: #define LEAD_CASE(n) \
1623: case BT_LEAD ## n: \
1624: if (*ptr1++ != *ptr2++) \
1625: return 0;
1626: LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1627: #undef LEAD_CASE
1628: /* fall through */
1629: if (*ptr1++ != *ptr2++)
1630: return 0;
1631: break;
1632: case BT_NONASCII:
1633: case BT_NMSTRT:
1634: #ifdef XML_NS
1635: case BT_COLON:
1636: #endif
1637: case BT_HEX:
1638: case BT_DIGIT:
1639: case BT_NAME:
1640: case BT_MINUS:
1641: if (*ptr2++ != *ptr1++)
1642: return 0;
1643: if (MINBPC(enc) > 1) {
1644: if (*ptr2++ != *ptr1++)
1645: return 0;
1646: if (MINBPC(enc) > 2) {
1647: if (*ptr2++ != *ptr1++)
1648: return 0;
1649: if (MINBPC(enc) > 3) {
1650: if (*ptr2++ != *ptr1++)
1651: return 0;
1652: }
1653: }
1654: }
1655: break;
1656: default:
1657: if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1658: return 1;
1659: switch (BYTE_TYPE(enc, ptr2)) {
1660: case BT_LEAD2:
1661: case BT_LEAD3:
1662: case BT_LEAD4:
1663: case BT_NONASCII:
1664: case BT_NMSTRT:
1665: #ifdef XML_NS
1666: case BT_COLON:
1667: #endif
1668: case BT_HEX:
1669: case BT_DIGIT:
1670: case BT_NAME:
1671: case BT_MINUS:
1672: return 0;
1673: default:
1674: return 1;
1675: }
1676: }
1677: }
1678: /* not reached */
1679: }
1680:
1681: static int PTRCALL
1.2.2.2 ! snj 1682: PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1,
1.1 tron 1683: const char *end1, const char *ptr2)
1684: {
1685: for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1.2.2.2 ! snj 1686: if (end1 - ptr1 < MINBPC(enc))
1.1 tron 1687: return 0;
1688: if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1689: return 0;
1690: }
1691: return ptr1 == end1;
1692: }
1693:
1694: static int PTRFASTCALL
1695: PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1696: {
1697: const char *start = ptr;
1698: for (;;) {
1699: switch (BYTE_TYPE(enc, ptr)) {
1700: #define LEAD_CASE(n) \
1701: case BT_LEAD ## n: ptr += n; break;
1702: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1703: #undef LEAD_CASE
1704: case BT_NONASCII:
1705: case BT_NMSTRT:
1706: #ifdef XML_NS
1707: case BT_COLON:
1708: #endif
1709: case BT_HEX:
1710: case BT_DIGIT:
1711: case BT_NAME:
1712: case BT_MINUS:
1713: ptr += MINBPC(enc);
1714: break;
1715: default:
1716: return (int)(ptr - start);
1717: }
1718: }
1719: }
1720:
1721: static const char * PTRFASTCALL
1722: PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1723: {
1724: for (;;) {
1725: switch (BYTE_TYPE(enc, ptr)) {
1726: case BT_LF:
1727: case BT_CR:
1728: case BT_S:
1729: ptr += MINBPC(enc);
1730: break;
1731: default:
1732: return ptr;
1733: }
1734: }
1735: }
1736:
1737: static void PTRCALL
1738: PREFIX(updatePosition)(const ENCODING *enc,
1739: const char *ptr,
1740: const char *end,
1741: POSITION *pos)
1742: {
1.2.2.2 ! snj 1743: while (HAS_CHAR(enc, ptr, end)) {
1.1 tron 1744: switch (BYTE_TYPE(enc, ptr)) {
1745: #define LEAD_CASE(n) \
1746: case BT_LEAD ## n: \
1747: ptr += n; \
1748: break;
1749: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1750: #undef LEAD_CASE
1751: case BT_LF:
1752: pos->columnNumber = (XML_Size)-1;
1753: pos->lineNumber++;
1754: ptr += MINBPC(enc);
1755: break;
1756: case BT_CR:
1757: pos->lineNumber++;
1758: ptr += MINBPC(enc);
1.2.2.2 ! snj 1759: if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1.1 tron 1760: ptr += MINBPC(enc);
1761: pos->columnNumber = (XML_Size)-1;
1762: break;
1763: default:
1764: ptr += MINBPC(enc);
1765: break;
1766: }
1767: pos->columnNumber++;
1768: }
1769: }
1770:
1771: #undef DO_LEAD_CASE
1772: #undef MULTIBYTE_CASES
1773: #undef INVALID_CASES
1774: #undef CHECK_NAME_CASE
1775: #undef CHECK_NAME_CASES
1776: #undef CHECK_NMSTRT_CASE
1777: #undef CHECK_NMSTRT_CASES
1778:
1779: #endif /* XML_TOK_IMPL_C */
CVSweb <webmaster@jp.NetBSD.org>