xmltok_impl.c 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768
  1. /*
  2. Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
  3. See the file COPYING for copying permission.
  4. */
  /* Fallback for encodings that supply no validity predicate of their own:
     with this definition no multi-byte sequence is ever reported invalid. */
  #if !defined(IS_INVALID_CHAR)
  #define IS_INVALID_CHAR(enc, ptr, n) (0)
  #endif
  /* Expands to the `case BT_LEADn:` arm of a byte-type switch for an n-byte
     lead byte.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain,
     reports XML_TOK_INVALID (storing ptr in *nextTokPtr) when
     IS_INVALID_CHAR rejects the sequence, and otherwise steps ptr over the
     n bytes and leaves the switch.
     NOTE: `enc` and `end` are not parameters; they are taken from the scope
     in which the macro is expanded. */
  #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - ptr < n) { \
        return XML_TOK_PARTIAL_CHAR; \
      } \
      if (IS_INVALID_CHAR(enc, ptr, n)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      ptr += n; \
      break;
  /* Switch arms shared by scanners that accept arbitrary character data:
     the 2-, 3- and 4-byte lead cases (via INVALID_LEAD_CASE) followed by the
     byte types that are always rejected.  For those, ptr is stored in
     *nextTokPtr and XML_TOK_INVALID is returned. */
  #define INVALID_CASES(ptr, nextTokPtr) \
    INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
    INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
    INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
    case BT_TRAIL: \
    case BT_MALFORM: \
    case BT_NONXML: \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID;
  /* `case BT_LEADn:` arm used while scanning the interior of a name.
     Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain.  If the
     n-byte character satisfies IS_NAME_CHAR, ptr is advanced past it and the
     switch is left; otherwise ptr is stored in *nextTokPtr and
     XML_TOK_INVALID is returned. */
  #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - ptr < n) { \
        return XML_TOK_PARTIAL_CHAR; \
      } \
      if (IS_NAME_CHAR(enc, ptr, n)) { \
        ptr += n; \
        break; \
      } \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID;
  /* Switch arms that consume one name character.
     BT_NONASCII is first checked with IS_NAME_CHAR_MINBPC (XML_TOK_INVALID at
     ptr on failure) and then shares the single-unit advance with the byte
     types that are always name characters.  Multi-byte lead bytes are
     delegated to CHECK_NAME_CASE. */
  #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
    case BT_NONASCII: \
      if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_INVALID; \
      } \
      /* fall through */ \
    case BT_NMSTRT: \
    case BT_HEX: \
    case BT_DIGIT: \
    case BT_NAME: \
    case BT_MINUS: \
      ptr += MINBPC(enc); \
      break; \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
  /* `case BT_LEADn:` arm used for the first character of a name.
     Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain.  If the
     n-byte character satisfies IS_NMSTRT_CHAR, ptr is advanced past it and
     the switch is left; otherwise ptr is stored in *nextTokPtr and
     XML_TOK_INVALID is returned. */
  #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - ptr < n) { \
        return XML_TOK_PARTIAL_CHAR; \
      } \
      if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
        ptr += n; \
        break; \
      } \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID;
  /* Switch arms that consume one name-start character.
     BT_NONASCII is first checked with IS_NMSTRT_CHAR_MINBPC (XML_TOK_INVALID
     at ptr on failure) and then shares the single-unit advance with
     BT_NMSTRT and BT_HEX.  Multi-byte lead bytes are delegated to
     CHECK_NMSTRT_CASE. */
  #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
    case BT_NONASCII: \
      if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_INVALID; \
      } \
      /* fall through */ \
    case BT_NMSTRT: \
    case BT_HEX: \
      ptr += MINBPC(enc); \
      break; \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
  /* Unless the including file defines PREFIX, function names in this file
     are used as written. */
  #if !defined(PREFIX)
  #define PREFIX(ident) ident
  #endif
  79. /* ptr points to character following "<!-" */
  80. static
  81. int PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
  82. const char **nextTokPtr)
  83. {
  84. if (ptr != end) {
  85. if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
  86. *nextTokPtr = ptr;
  87. return XML_TOK_INVALID;
  88. }
  89. ptr += MINBPC(enc);
  90. while (ptr != end) {
  91. switch (BYTE_TYPE(enc, ptr)) {
  92. INVALID_CASES(ptr, nextTokPtr)
  93. case BT_MINUS:
  94. if ((ptr += MINBPC(enc)) == end)
  95. return XML_TOK_PARTIAL;
  96. if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
  97. if ((ptr += MINBPC(enc)) == end)
  98. return XML_TOK_PARTIAL;
  99. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  100. *nextTokPtr = ptr;
  101. return XML_TOK_INVALID;
  102. }
  103. *nextTokPtr = ptr + MINBPC(enc);
  104. return XML_TOK_COMMENT;
  105. }
  106. break;
  107. default:
  108. ptr += MINBPC(enc);
  109. break;
  110. }
  111. }
  112. }
  113. return XML_TOK_PARTIAL;
  114. }
  115. /* ptr points to character following "<!" */
  116. static
  117. int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
  118. const char **nextTokPtr)
  119. {
  120. if (ptr == end)
  121. return XML_TOK_PARTIAL;
  122. switch (BYTE_TYPE(enc, ptr)) {
  123. case BT_MINUS:
  124. return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  125. case BT_LSQB:
  126. *nextTokPtr = ptr + MINBPC(enc);
  127. return XML_TOK_COND_SECT_OPEN;
  128. case BT_NMSTRT:
  129. case BT_HEX:
  130. ptr += MINBPC(enc);
  131. break;
  132. default:
  133. *nextTokPtr = ptr;
  134. return XML_TOK_INVALID;
  135. }
  136. while (ptr != end) {
  137. switch (BYTE_TYPE(enc, ptr)) {
  138. case BT_PERCNT:
  139. if (ptr + MINBPC(enc) == end)
  140. return XML_TOK_PARTIAL;
  141. /* don't allow <!ENTITY% foo "whatever"> */
  142. switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
  143. case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
  144. *nextTokPtr = ptr;
  145. return XML_TOK_INVALID;
  146. }
  147. /* fall through */
  148. case BT_S: case BT_CR: case BT_LF:
  149. *nextTokPtr = ptr;
  150. return XML_TOK_DECL_OPEN;
  151. case BT_NMSTRT:
  152. case BT_HEX:
  153. ptr += MINBPC(enc);
  154. break;
  155. default:
  156. *nextTokPtr = ptr;
  157. return XML_TOK_INVALID;
  158. }
  159. }
  160. return XML_TOK_PARTIAL;
  161. }
  162. static
  163. int PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, int *tokPtr)
  164. {
  165. int upper = 0;
  166. *tokPtr = XML_TOK_PI;
  167. if (end - ptr != MINBPC(enc)*3)
  168. return 1;
  169. switch (BYTE_TO_ASCII(enc, ptr)) {
  170. case ASCII_x:
  171. break;
  172. case ASCII_X:
  173. upper = 1;
  174. break;
  175. default:
  176. return 1;
  177. }
  178. ptr += MINBPC(enc);
  179. switch (BYTE_TO_ASCII(enc, ptr)) {
  180. case ASCII_m:
  181. break;
  182. case ASCII_M:
  183. upper = 1;
  184. break;
  185. default:
  186. return 1;
  187. }
  188. ptr += MINBPC(enc);
  189. switch (BYTE_TO_ASCII(enc, ptr)) {
  190. case ASCII_l:
  191. break;
  192. case ASCII_L:
  193. upper = 1;
  194. break;
  195. default:
  196. return 1;
  197. }
  198. if (upper)
  199. return 0;
  200. *tokPtr = XML_TOK_XML_DECL;
  201. return 1;
  202. }
  203. /* ptr points to character following "<?" */
  204. static
  205. int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
  206. const char **nextTokPtr)
  207. {
  208. int tok;
  209. const char *target = ptr;
  210. if (ptr == end)
  211. return XML_TOK_PARTIAL;
  212. switch (BYTE_TYPE(enc, ptr)) {
  213. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  214. default:
  215. *nextTokPtr = ptr;
  216. return XML_TOK_INVALID;
  217. }
  218. while (ptr != end) {
  219. switch (BYTE_TYPE(enc, ptr)) {
  220. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  221. case BT_S: case BT_CR: case BT_LF:
  222. if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
  223. *nextTokPtr = ptr;
  224. return XML_TOK_INVALID;
  225. }
  226. ptr += MINBPC(enc);
  227. while (ptr != end) {
  228. switch (BYTE_TYPE(enc, ptr)) {
  229. INVALID_CASES(ptr, nextTokPtr)
  230. case BT_QUEST:
  231. ptr += MINBPC(enc);
  232. if (ptr == end)
  233. return XML_TOK_PARTIAL;
  234. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  235. *nextTokPtr = ptr + MINBPC(enc);
  236. return tok;
  237. }
  238. break;
  239. default:
  240. ptr += MINBPC(enc);
  241. break;
  242. }
  243. }
  244. return XML_TOK_PARTIAL;
  245. case BT_QUEST:
  246. if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
  247. *nextTokPtr = ptr;
  248. return XML_TOK_INVALID;
  249. }
  250. ptr += MINBPC(enc);
  251. if (ptr == end)
  252. return XML_TOK_PARTIAL;
  253. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  254. *nextTokPtr = ptr + MINBPC(enc);
  255. return tok;
  256. }
  257. /* fall through */
  258. default:
  259. *nextTokPtr = ptr;
  260. return XML_TOK_INVALID;
  261. }
  262. }
  263. return XML_TOK_PARTIAL;
  264. }
  265. static
  266. int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
  267. const char **nextTokPtr)
  268. {
  269. static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB };
  270. int i;
  271. /* CDATA[ */
  272. if (end - ptr < 6 * MINBPC(enc))
  273. return XML_TOK_PARTIAL;
  274. for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
  275. if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
  276. *nextTokPtr = ptr;
  277. return XML_TOK_INVALID;
  278. }
  279. }
  280. *nextTokPtr = ptr;
  281. return XML_TOK_CDATA_SECT_OPEN;
  282. }
  283. static
  284. int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
  285. const char **nextTokPtr)
  286. {
  287. if (ptr == end)
  288. return XML_TOK_NONE;
  289. if (MINBPC(enc) > 1) {
  290. size_t n = end - ptr;
  291. if (n & (MINBPC(enc) - 1)) {
  292. n &= ~(MINBPC(enc) - 1);
  293. if (n == 0)
  294. return XML_TOK_PARTIAL;
  295. end = ptr + n;
  296. }
  297. }
  298. switch (BYTE_TYPE(enc, ptr)) {
  299. case BT_RSQB:
  300. ptr += MINBPC(enc);
  301. if (ptr == end)
  302. return XML_TOK_PARTIAL;
  303. if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
  304. break;
  305. ptr += MINBPC(enc);
  306. if (ptr == end)
  307. return XML_TOK_PARTIAL;
  308. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  309. ptr -= MINBPC(enc);
  310. break;
  311. }
  312. *nextTokPtr = ptr + MINBPC(enc);
  313. return XML_TOK_CDATA_SECT_CLOSE;
  314. case BT_CR:
  315. ptr += MINBPC(enc);
  316. if (ptr == end)
  317. return XML_TOK_PARTIAL;
  318. if (BYTE_TYPE(enc, ptr) == BT_LF)
  319. ptr += MINBPC(enc);
  320. *nextTokPtr = ptr;
  321. return XML_TOK_DATA_NEWLINE;
  322. case BT_LF:
  323. *nextTokPtr = ptr + MINBPC(enc);
  324. return XML_TOK_DATA_NEWLINE;
  325. INVALID_CASES(ptr, nextTokPtr)
  326. default:
  327. ptr += MINBPC(enc);
  328. break;
  329. }
  330. while (ptr != end) {
  331. switch (BYTE_TYPE(enc, ptr)) {
  332. #define LEAD_CASE(n) \
  333. case BT_LEAD ## n: \
  334. if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
  335. *nextTokPtr = ptr; \
  336. return XML_TOK_DATA_CHARS; \
  337. } \
  338. ptr += n; \
  339. break;
  340. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  341. #undef LEAD_CASE
  342. case BT_NONXML:
  343. case BT_MALFORM:
  344. case BT_TRAIL:
  345. case BT_CR:
  346. case BT_LF:
  347. case BT_RSQB:
  348. *nextTokPtr = ptr;
  349. return XML_TOK_DATA_CHARS;
  350. default:
  351. ptr += MINBPC(enc);
  352. break;
  353. }
  354. }
  355. *nextTokPtr = ptr;
  356. return XML_TOK_DATA_CHARS;
  357. }
  358. /* ptr points to character following "</" */
  359. static
  360. int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
  361. const char **nextTokPtr)
  362. {
  363. if (ptr == end)
  364. return XML_TOK_PARTIAL;
  365. switch (BYTE_TYPE(enc, ptr)) {
  366. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  367. default:
  368. *nextTokPtr = ptr;
  369. return XML_TOK_INVALID;
  370. }
  371. while (ptr != end) {
  372. switch (BYTE_TYPE(enc, ptr)) {
  373. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  374. case BT_S: case BT_CR: case BT_LF:
  375. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  376. switch (BYTE_TYPE(enc, ptr)) {
  377. case BT_S: case BT_CR: case BT_LF:
  378. break;
  379. case BT_GT:
  380. *nextTokPtr = ptr + MINBPC(enc);
  381. return XML_TOK_END_TAG;
  382. default:
  383. *nextTokPtr = ptr;
  384. return XML_TOK_INVALID;
  385. }
  386. }
  387. return XML_TOK_PARTIAL;
  388. #ifdef XML_NS
  389. case BT_COLON:
  390. /* no need to check qname syntax here, since end-tag must match exactly */
  391. ptr += MINBPC(enc);
  392. break;
  393. #endif
  394. case BT_GT:
  395. *nextTokPtr = ptr + MINBPC(enc);
  396. return XML_TOK_END_TAG;
  397. default:
  398. *nextTokPtr = ptr;
  399. return XML_TOK_INVALID;
  400. }
  401. }
  402. return XML_TOK_PARTIAL;
  403. }
  404. /* ptr points to character following "&#X" */
  405. static
  406. int PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
  407. const char **nextTokPtr)
  408. {
  409. if (ptr != end) {
  410. switch (BYTE_TYPE(enc, ptr)) {
  411. case BT_DIGIT:
  412. case BT_HEX:
  413. break;
  414. default:
  415. *nextTokPtr = ptr;
  416. return XML_TOK_INVALID;
  417. }
  418. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  419. switch (BYTE_TYPE(enc, ptr)) {
  420. case BT_DIGIT:
  421. case BT_HEX:
  422. break;
  423. case BT_SEMI:
  424. *nextTokPtr = ptr + MINBPC(enc);
  425. return XML_TOK_CHAR_REF;
  426. default:
  427. *nextTokPtr = ptr;
  428. return XML_TOK_INVALID;
  429. }
  430. }
  431. }
  432. return XML_TOK_PARTIAL;
  433. }
  434. /* ptr points to character following "&#" */
  435. static
  436. int PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
  437. const char **nextTokPtr)
  438. {
  439. if (ptr != end) {
  440. if (CHAR_MATCHES(enc, ptr, ASCII_x))
  441. return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  442. switch (BYTE_TYPE(enc, ptr)) {
  443. case BT_DIGIT:
  444. break;
  445. default:
  446. *nextTokPtr = ptr;
  447. return XML_TOK_INVALID;
  448. }
  449. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  450. switch (BYTE_TYPE(enc, ptr)) {
  451. case BT_DIGIT:
  452. break;
  453. case BT_SEMI:
  454. *nextTokPtr = ptr + MINBPC(enc);
  455. return XML_TOK_CHAR_REF;
  456. default:
  457. *nextTokPtr = ptr;
  458. return XML_TOK_INVALID;
  459. }
  460. }
  461. }
  462. return XML_TOK_PARTIAL;
  463. }
  464. /* ptr points to character following "&" */
  465. static
  466. int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
  467. const char **nextTokPtr)
  468. {
  469. if (ptr == end)
  470. return XML_TOK_PARTIAL;
  471. switch (BYTE_TYPE(enc, ptr)) {
  472. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  473. case BT_NUM:
  474. return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  475. default:
  476. *nextTokPtr = ptr;
  477. return XML_TOK_INVALID;
  478. }
  479. while (ptr != end) {
  480. switch (BYTE_TYPE(enc, ptr)) {
  481. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  482. case BT_SEMI:
  483. *nextTokPtr = ptr + MINBPC(enc);
  484. return XML_TOK_ENTITY_REF;
  485. default:
  486. *nextTokPtr = ptr;
  487. return XML_TOK_INVALID;
  488. }
  489. }
  490. return XML_TOK_PARTIAL;
  491. }
  492. /* ptr points to character following first character of attribute name */
  493. static
  494. int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
  495. const char **nextTokPtr)
  496. {
  497. #ifdef XML_NS
  498. int hadColon = 0;
  499. #endif
  500. while (ptr != end) {
  501. switch (BYTE_TYPE(enc, ptr)) {
  502. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  503. #ifdef XML_NS
  504. case BT_COLON:
  505. if (hadColon) {
  506. *nextTokPtr = ptr;
  507. return XML_TOK_INVALID;
  508. }
  509. hadColon = 1;
  510. ptr += MINBPC(enc);
  511. if (ptr == end)
  512. return XML_TOK_PARTIAL;
  513. switch (BYTE_TYPE(enc, ptr)) {
  514. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  515. default:
  516. *nextTokPtr = ptr;
  517. return XML_TOK_INVALID;
  518. }
  519. break;
  520. #endif
  521. case BT_S: case BT_CR: case BT_LF:
  522. for (;;) {
  523. int t;
  524. ptr += MINBPC(enc);
  525. if (ptr == end)
  526. return XML_TOK_PARTIAL;
  527. t = BYTE_TYPE(enc, ptr);
  528. if (t == BT_EQUALS)
  529. break;
  530. switch (t) {
  531. case BT_S:
  532. case BT_LF:
  533. case BT_CR:
  534. break;
  535. default:
  536. *nextTokPtr = ptr;
  537. return XML_TOK_INVALID;
  538. }
  539. }
  540. /* fall through */
  541. case BT_EQUALS:
  542. {
  543. int open;
  544. #ifdef XML_NS
  545. hadColon = 0;
  546. #endif
  547. for (;;) {
  548. ptr += MINBPC(enc);
  549. if (ptr == end)
  550. return XML_TOK_PARTIAL;
  551. open = BYTE_TYPE(enc, ptr);
  552. if (open == BT_QUOT || open == BT_APOS)
  553. break;
  554. switch (open) {
  555. case BT_S:
  556. case BT_LF:
  557. case BT_CR:
  558. break;
  559. default:
  560. *nextTokPtr = ptr;
  561. return XML_TOK_INVALID;
  562. }
  563. }
  564. ptr += MINBPC(enc);
  565. /* in attribute value */
  566. for (;;) {
  567. int t;
  568. if (ptr == end)
  569. return XML_TOK_PARTIAL;
  570. t = BYTE_TYPE(enc, ptr);
  571. if (t == open)
  572. break;
  573. switch (t) {
  574. INVALID_CASES(ptr, nextTokPtr)
  575. case BT_AMP:
  576. {
  577. int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
  578. if (tok <= 0) {
  579. if (tok == XML_TOK_INVALID)
  580. *nextTokPtr = ptr;
  581. return tok;
  582. }
  583. break;
  584. }
  585. case BT_LT:
  586. *nextTokPtr = ptr;
  587. return XML_TOK_INVALID;
  588. default:
  589. ptr += MINBPC(enc);
  590. break;
  591. }
  592. }
  593. ptr += MINBPC(enc);
  594. if (ptr == end)
  595. return XML_TOK_PARTIAL;
  596. switch (BYTE_TYPE(enc, ptr)) {
  597. case BT_S:
  598. case BT_CR:
  599. case BT_LF:
  600. break;
  601. case BT_SOL:
  602. goto sol;
  603. case BT_GT:
  604. goto gt;
  605. default:
  606. *nextTokPtr = ptr;
  607. return XML_TOK_INVALID;
  608. }
  609. /* ptr points to closing quote */
  610. for (;;) {
  611. ptr += MINBPC(enc);
  612. if (ptr == end)
  613. return XML_TOK_PARTIAL;
  614. switch (BYTE_TYPE(enc, ptr)) {
  615. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  616. case BT_S: case BT_CR: case BT_LF:
  617. continue;
  618. case BT_GT:
  619. gt:
  620. *nextTokPtr = ptr + MINBPC(enc);
  621. return XML_TOK_START_TAG_WITH_ATTS;
  622. case BT_SOL:
  623. sol:
  624. ptr += MINBPC(enc);
  625. if (ptr == end)
  626. return XML_TOK_PARTIAL;
  627. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  628. *nextTokPtr = ptr;
  629. return XML_TOK_INVALID;
  630. }
  631. *nextTokPtr = ptr + MINBPC(enc);
  632. return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
  633. default:
  634. *nextTokPtr = ptr;
  635. return XML_TOK_INVALID;
  636. }
  637. break;
  638. }
  639. break;
  640. }
  641. default:
  642. *nextTokPtr = ptr;
  643. return XML_TOK_INVALID;
  644. }
  645. }
  646. return XML_TOK_PARTIAL;
  647. }
  648. /* ptr points to character following "<" */
  649. static
  650. int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
  651. const char **nextTokPtr)
  652. {
  653. #ifdef XML_NS
  654. int hadColon;
  655. #endif
  656. if (ptr == end)
  657. return XML_TOK_PARTIAL;
  658. switch (BYTE_TYPE(enc, ptr)) {
  659. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  660. case BT_EXCL:
  661. if ((ptr += MINBPC(enc)) == end)
  662. return XML_TOK_PARTIAL;
  663. switch (BYTE_TYPE(enc, ptr)) {
  664. case BT_MINUS:
  665. return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  666. case BT_LSQB:
  667. return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  668. }
  669. *nextTokPtr = ptr;
  670. return XML_TOK_INVALID;
  671. case BT_QUEST:
  672. return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  673. case BT_SOL:
  674. return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  675. default:
  676. *nextTokPtr = ptr;
  677. return XML_TOK_INVALID;
  678. }
  679. #ifdef XML_NS
  680. hadColon = 0;
  681. #endif
  682. /* we have a start-tag */
  683. while (ptr != end) {
  684. switch (BYTE_TYPE(enc, ptr)) {
  685. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  686. #ifdef XML_NS
  687. case BT_COLON:
  688. if (hadColon) {
  689. *nextTokPtr = ptr;
  690. return XML_TOK_INVALID;
  691. }
  692. hadColon = 1;
  693. ptr += MINBPC(enc);
  694. if (ptr == end)
  695. return XML_TOK_PARTIAL;
  696. switch (BYTE_TYPE(enc, ptr)) {
  697. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  698. default:
  699. *nextTokPtr = ptr;
  700. return XML_TOK_INVALID;
  701. }
  702. break;
  703. #endif
  704. case BT_S: case BT_CR: case BT_LF:
  705. {
  706. ptr += MINBPC(enc);
  707. while (ptr != end) {
  708. switch (BYTE_TYPE(enc, ptr)) {
  709. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  710. case BT_GT:
  711. goto gt;
  712. case BT_SOL:
  713. goto sol;
  714. case BT_S: case BT_CR: case BT_LF:
  715. ptr += MINBPC(enc);
  716. continue;
  717. default:
  718. *nextTokPtr = ptr;
  719. return XML_TOK_INVALID;
  720. }
  721. return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
  722. }
  723. return XML_TOK_PARTIAL;
  724. }
  725. case BT_GT:
  726. gt:
  727. *nextTokPtr = ptr + MINBPC(enc);
  728. return XML_TOK_START_TAG_NO_ATTS;
  729. case BT_SOL:
  730. sol:
  731. ptr += MINBPC(enc);
  732. if (ptr == end)
  733. return XML_TOK_PARTIAL;
  734. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  735. *nextTokPtr = ptr;
  736. return XML_TOK_INVALID;
  737. }
  738. *nextTokPtr = ptr + MINBPC(enc);
  739. return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
  740. default:
  741. *nextTokPtr = ptr;
  742. return XML_TOK_INVALID;
  743. }
  744. }
  745. return XML_TOK_PARTIAL;
  746. }
  747. static
  748. int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
  749. const char **nextTokPtr)
  750. {
  751. if (ptr == end)
  752. return XML_TOK_NONE;
  753. if (MINBPC(enc) > 1) {
  754. size_t n = end - ptr;
  755. if (n & (MINBPC(enc) - 1)) {
  756. n &= ~(MINBPC(enc) - 1);
  757. if (n == 0)
  758. return XML_TOK_PARTIAL;
  759. end = ptr + n;
  760. }
  761. }
  762. switch (BYTE_TYPE(enc, ptr)) {
  763. case BT_LT:
  764. return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  765. case BT_AMP:
  766. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  767. case BT_CR:
  768. ptr += MINBPC(enc);
  769. if (ptr == end)
  770. return XML_TOK_TRAILING_CR;
  771. if (BYTE_TYPE(enc, ptr) == BT_LF)
  772. ptr += MINBPC(enc);
  773. *nextTokPtr = ptr;
  774. return XML_TOK_DATA_NEWLINE;
  775. case BT_LF:
  776. *nextTokPtr = ptr + MINBPC(enc);
  777. return XML_TOK_DATA_NEWLINE;
  778. case BT_RSQB:
  779. ptr += MINBPC(enc);
  780. if (ptr == end)
  781. return XML_TOK_TRAILING_RSQB;
  782. if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
  783. break;
  784. ptr += MINBPC(enc);
  785. if (ptr == end)
  786. return XML_TOK_TRAILING_RSQB;
  787. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  788. ptr -= MINBPC(enc);
  789. break;
  790. }
  791. *nextTokPtr = ptr;
  792. return XML_TOK_INVALID;
  793. INVALID_CASES(ptr, nextTokPtr)
  794. default:
  795. ptr += MINBPC(enc);
  796. break;
  797. }
  798. while (ptr != end) {
  799. switch (BYTE_TYPE(enc, ptr)) {
  800. #define LEAD_CASE(n) \
  801. case BT_LEAD ## n: \
  802. if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
  803. *nextTokPtr = ptr; \
  804. return XML_TOK_DATA_CHARS; \
  805. } \
  806. ptr += n; \
  807. break;
  808. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  809. #undef LEAD_CASE
  810. case BT_RSQB:
  811. if (ptr + MINBPC(enc) != end) {
  812. if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
  813. ptr += MINBPC(enc);
  814. break;
  815. }
  816. if (ptr + 2*MINBPC(enc) != end) {
  817. if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
  818. ptr += MINBPC(enc);
  819. break;
  820. }
  821. *nextTokPtr = ptr + 2*MINBPC(enc);
  822. return XML_TOK_INVALID;
  823. }
  824. }
  825. /* fall through */
  826. case BT_AMP:
  827. case BT_LT:
  828. case BT_NONXML:
  829. case BT_MALFORM:
  830. case BT_TRAIL:
  831. case BT_CR:
  832. case BT_LF:
  833. *nextTokPtr = ptr;
  834. return XML_TOK_DATA_CHARS;
  835. default:
  836. ptr += MINBPC(enc);
  837. break;
  838. }
  839. }
  840. *nextTokPtr = ptr;
  841. return XML_TOK_DATA_CHARS;
  842. }
  843. /* ptr points to character following "%" */
  844. static
  845. int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
  846. const char **nextTokPtr)
  847. {
  848. if (ptr == end)
  849. return XML_TOK_PARTIAL;
  850. switch (BYTE_TYPE(enc, ptr)) {
  851. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  852. case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
  853. *nextTokPtr = ptr;
  854. return XML_TOK_PERCENT;
  855. default:
  856. *nextTokPtr = ptr;
  857. return XML_TOK_INVALID;
  858. }
  859. while (ptr != end) {
  860. switch (BYTE_TYPE(enc, ptr)) {
  861. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  862. case BT_SEMI:
  863. *nextTokPtr = ptr + MINBPC(enc);
  864. return XML_TOK_PARAM_ENTITY_REF;
  865. default:
  866. *nextTokPtr = ptr;
  867. return XML_TOK_INVALID;
  868. }
  869. }
  870. return XML_TOK_PARTIAL;
  871. }
  872. static
  873. int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
  874. const char **nextTokPtr)
  875. {
  876. if (ptr == end)
  877. return XML_TOK_PARTIAL;
  878. switch (BYTE_TYPE(enc, ptr)) {
  879. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  880. default:
  881. *nextTokPtr = ptr;
  882. return XML_TOK_INVALID;
  883. }
  884. while (ptr != end) {
  885. switch (BYTE_TYPE(enc, ptr)) {
  886. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  887. case BT_CR: case BT_LF: case BT_S:
  888. case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
  889. *nextTokPtr = ptr;
  890. return XML_TOK_POUND_NAME;
  891. default:
  892. *nextTokPtr = ptr;
  893. return XML_TOK_INVALID;
  894. }
  895. }
  896. return -XML_TOK_POUND_NAME;
  897. }
  898. static
  899. int PREFIX(scanLit)(int open, const ENCODING *enc,
  900. const char *ptr, const char *end,
  901. const char **nextTokPtr)
  902. {
  903. while (ptr != end) {
  904. int t = BYTE_TYPE(enc, ptr);
  905. switch (t) {
  906. INVALID_CASES(ptr, nextTokPtr)
  907. case BT_QUOT:
  908. case BT_APOS:
  909. ptr += MINBPC(enc);
  910. if (t != open)
  911. break;
  912. if (ptr == end)
  913. return -XML_TOK_LITERAL;
  914. *nextTokPtr = ptr;
  915. switch (BYTE_TYPE(enc, ptr)) {
  916. case BT_S: case BT_CR: case BT_LF:
  917. case BT_GT: case BT_PERCNT: case BT_LSQB:
  918. return XML_TOK_LITERAL;
  919. default:
  920. return XML_TOK_INVALID;
  921. }
  922. default:
  923. ptr += MINBPC(enc);
  924. break;
  925. }
  926. }
  927. return XML_TOK_PARTIAL;
  928. }
  929. static
  930. int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
  931. const char **nextTokPtr)
  932. {
  933. int tok;
  934. if (ptr == end)
  935. return XML_TOK_NONE;
  936. if (MINBPC(enc) > 1) {
  937. size_t n = end - ptr;
  938. if (n & (MINBPC(enc) - 1)) {
  939. n &= ~(MINBPC(enc) - 1);
  940. if (n == 0)
  941. return XML_TOK_PARTIAL;
  942. end = ptr + n;
  943. }
  944. }
  945. switch (BYTE_TYPE(enc, ptr)) {
  946. case BT_QUOT:
  947. return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  948. case BT_APOS:
  949. return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  950. case BT_LT:
  951. {
  952. ptr += MINBPC(enc);
  953. if (ptr == end)
  954. return XML_TOK_PARTIAL;
  955. switch (BYTE_TYPE(enc, ptr)) {
  956. case BT_EXCL:
  957. return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  958. case BT_QUEST:
  959. return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  960. case BT_NMSTRT:
  961. case BT_HEX:
  962. case BT_NONASCII:
  963. case BT_LEAD2:
  964. case BT_LEAD3:
  965. case BT_LEAD4:
  966. *nextTokPtr = ptr - MINBPC(enc);
  967. return XML_TOK_INSTANCE_START;
  968. }
  969. *nextTokPtr = ptr;
  970. return XML_TOK_INVALID;
  971. }
  972. case BT_CR:
  973. if (ptr + MINBPC(enc) == end)
  974. return -XML_TOK_PROLOG_S;
  975. /* fall through */
  976. case BT_S: case BT_LF:
  977. for (;;) {
  978. ptr += MINBPC(enc);
  979. if (ptr == end)
  980. break;
  981. switch (BYTE_TYPE(enc, ptr)) {
  982. case BT_S: case BT_LF:
  983. break;
  984. case BT_CR:
  985. /* don't split CR/LF pair */
  986. if (ptr + MINBPC(enc) != end)
  987. break;
  988. /* fall through */
  989. default:
  990. *nextTokPtr = ptr;
  991. return XML_TOK_PROLOG_S;
  992. }
  993. }
  994. *nextTokPtr = ptr;
  995. return XML_TOK_PROLOG_S;
  996. case BT_PERCNT:
  997. return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  998. case BT_COMMA:
  999. *nextTokPtr = ptr + MINBPC(enc);
  1000. return XML_TOK_COMMA;
  1001. case BT_LSQB:
  1002. *nextTokPtr = ptr + MINBPC(enc);
  1003. return XML_TOK_OPEN_BRACKET;
  1004. case BT_RSQB:
  1005. ptr += MINBPC(enc);
  1006. if (ptr == end)
  1007. return -XML_TOK_CLOSE_BRACKET;
  1008. if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
  1009. if (ptr + MINBPC(enc) == end)
  1010. return XML_TOK_PARTIAL;
  1011. if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
  1012. *nextTokPtr = ptr + 2*MINBPC(enc);
  1013. return XML_TOK_COND_SECT_CLOSE;
  1014. }
  1015. }
  1016. *nextTokPtr = ptr;
  1017. return XML_TOK_CLOSE_BRACKET;
  1018. case BT_LPAR:
  1019. *nextTokPtr = ptr + MINBPC(enc);
  1020. return XML_TOK_OPEN_PAREN;
  1021. case BT_RPAR:
  1022. ptr += MINBPC(enc);
  1023. if (ptr == end)
  1024. return -XML_TOK_CLOSE_PAREN;
  1025. switch (BYTE_TYPE(enc, ptr)) {
  1026. case BT_AST:
  1027. *nextTokPtr = ptr + MINBPC(enc);
  1028. return XML_TOK_CLOSE_PAREN_ASTERISK;
  1029. case BT_QUEST:
  1030. *nextTokPtr = ptr + MINBPC(enc);
  1031. return XML_TOK_CLOSE_PAREN_QUESTION;
  1032. case BT_PLUS:
  1033. *nextTokPtr = ptr + MINBPC(enc);
  1034. return XML_TOK_CLOSE_PAREN_PLUS;
  1035. case BT_CR: case BT_LF: case BT_S:
  1036. case BT_GT: case BT_COMMA: case BT_VERBAR:
  1037. case BT_RPAR:
  1038. *nextTokPtr = ptr;
  1039. return XML_TOK_CLOSE_PAREN;
  1040. }
  1041. *nextTokPtr = ptr;
  1042. return XML_TOK_INVALID;
  1043. case BT_VERBAR:
  1044. *nextTokPtr = ptr + MINBPC(enc);
  1045. return XML_TOK_OR;
  1046. case BT_GT:
  1047. *nextTokPtr = ptr + MINBPC(enc);
  1048. return XML_TOK_DECL_CLOSE;
  1049. case BT_NUM:
  1050. return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1051. #define LEAD_CASE(n) \
  1052. case BT_LEAD ## n: \
  1053. if (end - ptr < n) \
  1054. return XML_TOK_PARTIAL_CHAR; \
  1055. if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
  1056. ptr += n; \
  1057. tok = XML_TOK_NAME; \
  1058. break; \
  1059. } \
  1060. if (IS_NAME_CHAR(enc, ptr, n)) { \
  1061. ptr += n; \
  1062. tok = XML_TOK_NMTOKEN; \
  1063. break; \
  1064. } \
  1065. *nextTokPtr = ptr; \
  1066. return XML_TOK_INVALID;
  1067. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1068. #undef LEAD_CASE
  1069. case BT_NMSTRT:
  1070. case BT_HEX:
  1071. tok = XML_TOK_NAME;
  1072. ptr += MINBPC(enc);
  1073. break;
  1074. case BT_DIGIT:
  1075. case BT_NAME:
  1076. case BT_MINUS:
  1077. #ifdef XML_NS
  1078. case BT_COLON:
  1079. #endif
  1080. tok = XML_TOK_NMTOKEN;
  1081. ptr += MINBPC(enc);
  1082. break;
  1083. case BT_NONASCII:
  1084. if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
  1085. ptr += MINBPC(enc);
  1086. tok = XML_TOK_NAME;
  1087. break;
  1088. }
  1089. if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
  1090. ptr += MINBPC(enc);
  1091. tok = XML_TOK_NMTOKEN;
  1092. break;
  1093. }
  1094. /* fall through */
  1095. default:
  1096. *nextTokPtr = ptr;
  1097. return XML_TOK_INVALID;
  1098. }
  1099. while (ptr != end) {
  1100. switch (BYTE_TYPE(enc, ptr)) {
  1101. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  1102. case BT_GT: case BT_RPAR: case BT_COMMA:
  1103. case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
  1104. case BT_S: case BT_CR: case BT_LF:
  1105. *nextTokPtr = ptr;
  1106. return tok;
  1107. #ifdef XML_NS
  1108. case BT_COLON:
  1109. ptr += MINBPC(enc);
  1110. switch (tok) {
  1111. case XML_TOK_NAME:
  1112. if (ptr == end)
  1113. return XML_TOK_PARTIAL;
  1114. tok = XML_TOK_PREFIXED_NAME;
  1115. switch (BYTE_TYPE(enc, ptr)) {
  1116. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  1117. default:
  1118. tok = XML_TOK_NMTOKEN;
  1119. break;
  1120. }
  1121. break;
  1122. case XML_TOK_PREFIXED_NAME:
  1123. tok = XML_TOK_NMTOKEN;
  1124. break;
  1125. }
  1126. break;
  1127. #endif
  1128. case BT_PLUS:
  1129. if (tok == XML_TOK_NMTOKEN) {
  1130. *nextTokPtr = ptr;
  1131. return XML_TOK_INVALID;
  1132. }
  1133. *nextTokPtr = ptr + MINBPC(enc);
  1134. return XML_TOK_NAME_PLUS;
  1135. case BT_AST:
  1136. if (tok == XML_TOK_NMTOKEN) {
  1137. *nextTokPtr = ptr;
  1138. return XML_TOK_INVALID;
  1139. }
  1140. *nextTokPtr = ptr + MINBPC(enc);
  1141. return XML_TOK_NAME_ASTERISK;
  1142. case BT_QUEST:
  1143. if (tok == XML_TOK_NMTOKEN) {
  1144. *nextTokPtr = ptr;
  1145. return XML_TOK_INVALID;
  1146. }
  1147. *nextTokPtr = ptr + MINBPC(enc);
  1148. return XML_TOK_NAME_QUESTION;
  1149. default:
  1150. *nextTokPtr = ptr;
  1151. return XML_TOK_INVALID;
  1152. }
  1153. }
  1154. return -tok;
  1155. }
  1156. static
  1157. int PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
  1158. const char **nextTokPtr)
  1159. {
  1160. const char *start;
  1161. if (ptr == end)
  1162. return XML_TOK_NONE;
  1163. start = ptr;
  1164. while (ptr != end) {
  1165. switch (BYTE_TYPE(enc, ptr)) {
  1166. #define LEAD_CASE(n) \
  1167. case BT_LEAD ## n: ptr += n; break;
  1168. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1169. #undef LEAD_CASE
  1170. case BT_AMP:
  1171. if (ptr == start)
  1172. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1173. *nextTokPtr = ptr;
  1174. return XML_TOK_DATA_CHARS;
  1175. case BT_LT:
  1176. /* this is for inside entity references */
  1177. *nextTokPtr = ptr;
  1178. return XML_TOK_INVALID;
  1179. case BT_LF:
  1180. if (ptr == start) {
  1181. *nextTokPtr = ptr + MINBPC(enc);
  1182. return XML_TOK_DATA_NEWLINE;
  1183. }
  1184. *nextTokPtr = ptr;
  1185. return XML_TOK_DATA_CHARS;
  1186. case BT_CR:
  1187. if (ptr == start) {
  1188. ptr += MINBPC(enc);
  1189. if (ptr == end)
  1190. return XML_TOK_TRAILING_CR;
  1191. if (BYTE_TYPE(enc, ptr) == BT_LF)
  1192. ptr += MINBPC(enc);
  1193. *nextTokPtr = ptr;
  1194. return XML_TOK_DATA_NEWLINE;
  1195. }
  1196. *nextTokPtr = ptr;
  1197. return XML_TOK_DATA_CHARS;
  1198. case BT_S:
  1199. if (ptr == start) {
  1200. *nextTokPtr = ptr + MINBPC(enc);
  1201. return XML_TOK_ATTRIBUTE_VALUE_S;
  1202. }
  1203. *nextTokPtr = ptr;
  1204. return XML_TOK_DATA_CHARS;
  1205. default:
  1206. ptr += MINBPC(enc);
  1207. break;
  1208. }
  1209. }
  1210. *nextTokPtr = ptr;
  1211. return XML_TOK_DATA_CHARS;
  1212. }
  1213. static
  1214. int PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
  1215. const char **nextTokPtr)
  1216. {
  1217. const char *start;
  1218. if (ptr == end)
  1219. return XML_TOK_NONE;
  1220. start = ptr;
  1221. while (ptr != end) {
  1222. switch (BYTE_TYPE(enc, ptr)) {
  1223. #define LEAD_CASE(n) \
  1224. case BT_LEAD ## n: ptr += n; break;
  1225. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1226. #undef LEAD_CASE
  1227. case BT_AMP:
  1228. if (ptr == start)
  1229. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1230. *nextTokPtr = ptr;
  1231. return XML_TOK_DATA_CHARS;
  1232. case BT_PERCNT:
  1233. if (ptr == start) {
  1234. int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
  1235. end, nextTokPtr);
  1236. return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
  1237. }
  1238. *nextTokPtr = ptr;
  1239. return XML_TOK_DATA_CHARS;
  1240. case BT_LF:
  1241. if (ptr == start) {
  1242. *nextTokPtr = ptr + MINBPC(enc);
  1243. return XML_TOK_DATA_NEWLINE;
  1244. }
  1245. *nextTokPtr = ptr;
  1246. return XML_TOK_DATA_CHARS;
  1247. case BT_CR:
  1248. if (ptr == start) {
  1249. ptr += MINBPC(enc);
  1250. if (ptr == end)
  1251. return XML_TOK_TRAILING_CR;
  1252. if (BYTE_TYPE(enc, ptr) == BT_LF)
  1253. ptr += MINBPC(enc);
  1254. *nextTokPtr = ptr;
  1255. return XML_TOK_DATA_NEWLINE;
  1256. }
  1257. *nextTokPtr = ptr;
  1258. return XML_TOK_DATA_CHARS;
  1259. default:
  1260. ptr += MINBPC(enc);
  1261. break;
  1262. }
  1263. }
  1264. *nextTokPtr = ptr;
  1265. return XML_TOK_DATA_CHARS;
  1266. }
  #ifdef XML_DTD
  /* Scans forward to the "]]>" that closes the current section, keeping
     track of nested "<![" ... "]]>" pairs.

     Returns:
       XML_TOK_IGNORE_SECT  when the matching "]]>" is found (*nextTokPtr is
                            set just past it);
       XML_TOK_PARTIAL      when the input ends first;
       whatever INVALID_CASES returns for characters it rejects. */
  static
  int PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
                               const char **nextTokPtr)
  {
    int depth = 0;

    /* For encodings with a minimum character size above one byte, only
       look at a whole number of minimum-size units. */
    if (MINBPC(enc) > 1) {
      size_t avail = end - ptr;
      if (avail & (MINBPC(enc) - 1)) {
        avail &= ~(MINBPC(enc) - 1);
        end = ptr + avail;
      }
    }

    while (ptr != end) {
      switch (BYTE_TYPE(enc, ptr)) {
      INVALID_CASES(ptr, nextTokPtr)
      case BT_LT:
        /* "<![" opens a nested section. */
        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
          ptr += MINBPC(enc);
          if (ptr == end)
            return XML_TOK_PARTIAL;
          if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
            ++depth;
            ptr += MINBPC(enc);
          }
        }
        break;
      case BT_RSQB:
        /* "]]>" closes the innermost open section. */
        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
          ptr += MINBPC(enc);
          if (ptr == end)
            return XML_TOK_PARTIAL;
          if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
            ptr += MINBPC(enc);
            if (depth == 0) {
              *nextTokPtr = ptr;
              return XML_TOK_IGNORE_SECT;
            }
            --depth;
          }
        }
        break;
      default:
        ptr += MINBPC(enc);
        break;
      }
    }
    return XML_TOK_PARTIAL;
  }
  #endif /* XML_DTD */
  1319. static
  1320. int PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
  1321. const char **badPtr)
  1322. {
  1323. ptr += MINBPC(enc);
  1324. end -= MINBPC(enc);
  1325. for (; ptr != end; ptr += MINBPC(enc)) {
  1326. switch (BYTE_TYPE(enc, ptr)) {
  1327. case BT_DIGIT:
  1328. case BT_HEX:
  1329. case BT_MINUS:
  1330. case BT_APOS:
  1331. case BT_LPAR:
  1332. case BT_RPAR:
  1333. case BT_PLUS:
  1334. case BT_COMMA:
  1335. case BT_SOL:
  1336. case BT_EQUALS:
  1337. case BT_QUEST:
  1338. case BT_CR:
  1339. case BT_LF:
  1340. case BT_SEMI:
  1341. case BT_EXCL:
  1342. case BT_AST:
  1343. case BT_PERCNT:
  1344. case BT_NUM:
  1345. #ifdef XML_NS
  1346. case BT_COLON:
  1347. #endif
  1348. break;
  1349. case BT_S:
  1350. if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
  1351. *badPtr = ptr;
  1352. return 0;
  1353. }
  1354. break;
  1355. case BT_NAME:
  1356. case BT_NMSTRT:
  1357. if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
  1358. break;
  1359. default:
  1360. switch (BYTE_TO_ASCII(enc, ptr)) {
  1361. case 0x24: /* $ */
  1362. case 0x40: /* @ */
  1363. break;
  1364. default:
  1365. *badPtr = ptr;
  1366. return 0;
  1367. }
  1368. break;
  1369. }
  1370. }
  1371. return 1;
  1372. }
  1373. /* This must only be called for a well-formed start-tag or empty element tag.
  1374. Returns the number of attributes. Pointers to the first attsMax attributes
  1375. are stored in atts. */
  1376. static
  1377. int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
  1378. int attsMax, ATTRIBUTE *atts)
  1379. {
  1380. enum { other, inName, inValue } state = inName;
  1381. int nAtts = 0;
  1382. int open = 0; /* defined when state == inValue;
  1383. initialization just to shut up compilers */
  1384. for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
  1385. switch (BYTE_TYPE(enc, ptr)) {
  1386. #define START_NAME \
  1387. if (state == other) { \
  1388. if (nAtts < attsMax) { \
  1389. atts[nAtts].name = ptr; \
  1390. atts[nAtts].normalized = 1; \
  1391. } \
  1392. state = inName; \
  1393. }
  1394. #define LEAD_CASE(n) \
  1395. case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
  1396. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1397. #undef LEAD_CASE
  1398. case BT_NONASCII:
  1399. case BT_NMSTRT:
  1400. case BT_HEX:
  1401. START_NAME
  1402. break;
  1403. #undef START_NAME
  1404. case BT_QUOT:
  1405. if (state != inValue) {
  1406. if (nAtts < attsMax)
  1407. atts[nAtts].valuePtr = ptr + MINBPC(enc);
  1408. state = inValue;
  1409. open = BT_QUOT;
  1410. }
  1411. else if (open == BT_QUOT) {
  1412. state = other;
  1413. if (nAtts < attsMax)
  1414. atts[nAtts].valueEnd = ptr;
  1415. nAtts++;
  1416. }
  1417. break;
  1418. case BT_APOS:
  1419. if (state != inValue) {
  1420. if (nAtts < attsMax)
  1421. atts[nAtts].valuePtr = ptr + MINBPC(enc);
  1422. state = inValue;
  1423. open = BT_APOS;
  1424. }
  1425. else if (open == BT_APOS) {
  1426. state = other;
  1427. if (nAtts < attsMax)
  1428. atts[nAtts].valueEnd = ptr;
  1429. nAtts++;
  1430. }
  1431. break;
  1432. case BT_AMP:
  1433. if (nAtts < attsMax)
  1434. atts[nAtts].normalized = 0;
  1435. break;
  1436. case BT_S:
  1437. if (state == inName)
  1438. state = other;
  1439. else if (state == inValue
  1440. && nAtts < attsMax
  1441. && atts[nAtts].normalized
  1442. && (ptr == atts[nAtts].valuePtr
  1443. || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
  1444. || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
  1445. || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
  1446. atts[nAtts].normalized = 0;
  1447. break;
  1448. case BT_CR: case BT_LF:
  1449. /* This case ensures that the first attribute name is counted
  1450. Apart from that we could just change state on the quote. */
  1451. if (state == inName)
  1452. state = other;
  1453. else if (state == inValue && nAtts < attsMax)
  1454. atts[nAtts].normalized = 0;
  1455. break;
  1456. case BT_GT:
  1457. case BT_SOL:
  1458. if (state != inValue)
  1459. return nAtts;
  1460. break;
  1461. default:
  1462. break;
  1463. }
  1464. }
  1465. /* not reached */
  1466. }
  1467. static
  1468. int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
  1469. {
  1470. int result = 0;
  1471. /* skip &# */
  1472. ptr += 2*MINBPC(enc);
  1473. if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
  1474. for (ptr += MINBPC(enc); !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
  1475. int c = BYTE_TO_ASCII(enc, ptr);
  1476. switch (c) {
  1477. case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
  1478. case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
  1479. result <<= 4;
  1480. result |= (c - ASCII_0);
  1481. break;
  1482. case ASCII_A: case ASCII_B: case ASCII_C: case ASCII_D: case ASCII_E: case ASCII_F:
  1483. result <<= 4;
  1484. result += 10 + (c - ASCII_A);
  1485. break;
  1486. case ASCII_a: case ASCII_b: case ASCII_c: case ASCII_d: case ASCII_e: case ASCII_f:
  1487. result <<= 4;
  1488. result += 10 + (c - ASCII_a);
  1489. break;
  1490. }
  1491. if (result >= 0x110000)
  1492. return -1;
  1493. }
  1494. }
  1495. else {
  1496. for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
  1497. int c = BYTE_TO_ASCII(enc, ptr);
  1498. result *= 10;
  1499. result += (c - ASCII_0);
  1500. if (result >= 0x110000)
  1501. return -1;
  1502. }
  1503. }
  1504. return checkCharRefNumber(result);
  1505. }
  1506. static
  1507. int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end)
  1508. {
  1509. switch ((end - ptr)/MINBPC(enc)) {
  1510. case 2:
  1511. if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
  1512. switch (BYTE_TO_ASCII(enc, ptr)) {
  1513. case ASCII_l:
  1514. return ASCII_LT;
  1515. case ASCII_g:
  1516. return ASCII_GT;
  1517. }
  1518. }
  1519. break;
  1520. case 3:
  1521. if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
  1522. ptr += MINBPC(enc);
  1523. if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
  1524. ptr += MINBPC(enc);
  1525. if (CHAR_MATCHES(enc, ptr, ASCII_p))
  1526. return ASCII_AMP;
  1527. }
  1528. }
  1529. break;
  1530. case 4:
  1531. switch (BYTE_TO_ASCII(enc, ptr)) {
  1532. case ASCII_q:
  1533. ptr += MINBPC(enc);
  1534. if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
  1535. ptr += MINBPC(enc);
  1536. if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
  1537. ptr += MINBPC(enc);
  1538. if (CHAR_MATCHES(enc, ptr, ASCII_t))
  1539. return ASCII_QUOT;
  1540. }
  1541. }
  1542. break;
  1543. case ASCII_a:
  1544. ptr += MINBPC(enc);
  1545. if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
  1546. ptr += MINBPC(enc);
  1547. if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
  1548. ptr += MINBPC(enc);
  1549. if (CHAR_MATCHES(enc, ptr, ASCII_s))
  1550. return ASCII_APOS;
  1551. }
  1552. }
  1553. break;
  1554. }
  1555. }
  1556. return 0;
  1557. }
  1558. static
  1559. int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
  1560. {
  1561. for (;;) {
  1562. switch (BYTE_TYPE(enc, ptr1)) {
  1563. #define LEAD_CASE(n) \
  1564. case BT_LEAD ## n: \
  1565. if (*ptr1++ != *ptr2++) \
  1566. return 0;
  1567. LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
  1568. #undef LEAD_CASE
  1569. /* fall through */
  1570. if (*ptr1++ != *ptr2++)
  1571. return 0;
  1572. break;
  1573. case BT_NONASCII:
  1574. case BT_NMSTRT:
  1575. #ifdef XML_NS
  1576. case BT_COLON:
  1577. #endif
  1578. case BT_HEX:
  1579. case BT_DIGIT:
  1580. case BT_NAME:
  1581. case BT_MINUS:
  1582. if (*ptr2++ != *ptr1++)
  1583. return 0;
  1584. if (MINBPC(enc) > 1) {
  1585. if (*ptr2++ != *ptr1++)
  1586. return 0;
  1587. if (MINBPC(enc) > 2) {
  1588. if (*ptr2++ != *ptr1++)
  1589. return 0;
  1590. if (MINBPC(enc) > 3) {
  1591. if (*ptr2++ != *ptr1++)
  1592. return 0;
  1593. }
  1594. }
  1595. }
  1596. break;
  1597. default:
  1598. if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
  1599. return 1;
  1600. switch (BYTE_TYPE(enc, ptr2)) {
  1601. case BT_LEAD2:
  1602. case BT_LEAD3:
  1603. case BT_LEAD4:
  1604. case BT_NONASCII:
  1605. case BT_NMSTRT:
  1606. #ifdef XML_NS
  1607. case BT_COLON:
  1608. #endif
  1609. case BT_HEX:
  1610. case BT_DIGIT:
  1611. case BT_NAME:
  1612. case BT_MINUS:
  1613. return 0;
  1614. default:
  1615. return 1;
  1616. }
  1617. }
  1618. }
  1619. /* not reached */
  1620. }
  1621. static
  1622. int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
  1623. const char *end1, const char *ptr2)
  1624. {
  1625. for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
  1626. if (ptr1 == end1)
  1627. return 0;
  1628. if (!CHAR_MATCHES(enc, ptr1, *ptr2))
  1629. return 0;
  1630. }
  1631. return ptr1 == end1;
  1632. }
  1633. static
  1634. int PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
  1635. {
  1636. const char *start = ptr;
  1637. for (;;) {
  1638. switch (BYTE_TYPE(enc, ptr)) {
  1639. #define LEAD_CASE(n) \
  1640. case BT_LEAD ## n: ptr += n; break;
  1641. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1642. #undef LEAD_CASE
  1643. case BT_NONASCII:
  1644. case BT_NMSTRT:
  1645. #ifdef XML_NS
  1646. case BT_COLON:
  1647. #endif
  1648. case BT_HEX:
  1649. case BT_DIGIT:
  1650. case BT_NAME:
  1651. case BT_MINUS:
  1652. ptr += MINBPC(enc);
  1653. break;
  1654. default:
  1655. return ptr - start;
  1656. }
  1657. }
  1658. }
  1659. static
  1660. const char *PREFIX(skipS)(const ENCODING *enc, const char *ptr)
  1661. {
  1662. for (;;) {
  1663. switch (BYTE_TYPE(enc, ptr)) {
  1664. case BT_LF:
  1665. case BT_CR:
  1666. case BT_S:
  1667. ptr += MINBPC(enc);
  1668. break;
  1669. default:
  1670. return ptr;
  1671. }
  1672. }
  1673. }
  1674. static
  1675. void PREFIX(updatePosition)(const ENCODING *enc,
  1676. const char *ptr,
  1677. const char *end,
  1678. POSITION *pos)
  1679. {
  1680. while (ptr != end) {
  1681. switch (BYTE_TYPE(enc, ptr)) {
  1682. #define LEAD_CASE(n) \
  1683. case BT_LEAD ## n: \
  1684. ptr += n; \
  1685. break;
  1686. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1687. #undef LEAD_CASE
  1688. case BT_LF:
  1689. pos->columnNumber = (unsigned)-1;
  1690. pos->lineNumber++;
  1691. ptr += MINBPC(enc);
  1692. break;
  1693. case BT_CR:
  1694. pos->lineNumber++;
  1695. ptr += MINBPC(enc);
  1696. if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
  1697. ptr += MINBPC(enc);
  1698. pos->columnNumber = (unsigned)-1;
  1699. break;
  1700. default:
  1701. ptr += MINBPC(enc);
  1702. break;
  1703. }
  1704. pos->columnNumber++;
  1705. }
  1706. }
  1707. #undef DO_LEAD_CASE
  1708. #undef MULTIBYTE_CASES
  1709. #undef INVALID_CASES
  1710. #undef CHECK_NAME_CASE
  1711. #undef CHECK_NAME_CASES
  1712. #undef CHECK_NMSTRT_CASE
  1713. #undef CHECK_NMSTRT_CASES