HTMLparser.c 209 KB


  1. /*
  2. * HTMLparser.c : an HTML 4.0 non-verifying parser
  3. *
  4. * See Copyright for the status of this software.
  5. *
  6. * daniel@veillard.com
  7. */
  8. #define IN_LIBXML
  9. #include "libxml.h"
  10. #ifdef LIBXML_HTML_ENABLED
  11. #include <string.h>
  12. #ifdef HAVE_CTYPE_H
  13. #include <ctype.h>
  14. #endif
  15. #ifdef HAVE_STDLIB_H
  16. #include <stdlib.h>
  17. #endif
  18. #ifdef HAVE_SYS_STAT_H
  19. #include <sys/stat.h>
  20. #endif
  21. #ifdef HAVE_FCNTL_H
  22. #include <fcntl.h>
  23. #endif
  24. #ifdef HAVE_UNISTD_H
  25. #include <unistd.h>
  26. #endif
  27. #ifdef LIBXML_ZLIB_ENABLED
  28. #include <zlib.h>
  29. #endif
  30. #include <libxml/xmlmemory.h>
  31. #include <libxml/tree.h>
  32. #include <libxml/parser.h>
  33. #include <libxml/parserInternals.h>
  34. #include <libxml/xmlerror.h>
  35. #include <libxml/HTMLparser.h>
  36. #include <libxml/HTMLtree.h>
  37. #include <libxml/entities.h>
  38. #include <libxml/encoding.h>
  39. #include <libxml/valid.h>
  40. #include <libxml/xmlIO.h>
  41. #include <libxml/globals.h>
  42. #include <libxml/uri.h>
  43. #include "buf.h"
  44. #include "enc.h"
  /*
   * Parser size constants. NOTE(review): the code that uses them lies outside
   * this excerpt, so the per-constant notes below are inferred from the names
   * only and should be confirmed against the users.
   */
  45. #define HTML_MAX_NAMELEN 1000 /* presumably a cap on name length -- confirm */
  46. #define HTML_PARSER_BIG_BUFFER_SIZE 1000 /* presumably a large scratch buffer size -- confirm */
  47. #define HTML_PARSER_BUFFER_SIZE 100 /* presumably a small scratch buffer size -- confirm */
  /* Debug switches, left disabled. */
  48. /* #define DEBUG */
  49. /* #define DEBUG_PUSH */
  /* File-local flag, initialised to 1; its readers are outside this excerpt. */
  50. static int htmlOmittedDefaultValue = 1;
  /* Forward declaration without `static`, i.e. with external linkage. */
  51. xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
  52. xmlChar end, xmlChar end2, xmlChar end3);
  /* Forward declaration of a file-local routine; its definition is outside this excerpt. */
  53. static void htmlParseComment(htmlParserCtxtPtr ctxt);
  54. /************************************************************************
  55. * *
  56. * Some factorized error routines *
  57. * *
  58. ************************************************************************/
  59. /**
  60. * htmlErrMemory:
  61. * @ctxt: an HTML parser context
  62. * @extra: extra information
  63. *
  64. * Handle a redefinition of attribute error
  65. */
  66. static void
  67. htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  68. {
  69. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  70. (ctxt->instate == XML_PARSER_EOF))
  71. return;
  72. if (ctxt != NULL) {
  73. ctxt->errNo = XML_ERR_NO_MEMORY;
  74. ctxt->instate = XML_PARSER_EOF;
  75. ctxt->disableSAX = 1;
  76. }
  77. if (extra)
  78. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  79. XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  80. NULL, NULL, 0, 0,
  81. "Memory allocation failed : %s\n", extra);
  82. else
  83. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  84. XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  85. NULL, NULL, 0, 0, "Memory allocation failed\n");
  86. }
  87. /**
  88. * htmlParseErr:
  89. * @ctxt: an HTML parser context
  90. * @error: the error number
  91. * @msg: the error message
  92. * @str1: string infor
  93. * @str2: string infor
  94. *
  95. * Handle a fatal parser error, i.e. violating Well-Formedness constraints
  96. */
  97. static void LIBXML_ATTR_FORMAT(3,0)
  98. htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
  99. const char *msg, const xmlChar *str1, const xmlChar *str2)
  100. {
  101. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  102. (ctxt->instate == XML_PARSER_EOF))
  103. return;
  104. if (ctxt != NULL)
  105. ctxt->errNo = error;
  106. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
  107. XML_ERR_ERROR, NULL, 0,
  108. (const char *) str1, (const char *) str2,
  109. NULL, 0, 0,
  110. msg, str1, str2);
  111. if (ctxt != NULL)
  112. ctxt->wellFormed = 0;
  113. }
  114. /**
  115. * htmlParseErrInt:
  116. * @ctxt: an HTML parser context
  117. * @error: the error number
  118. * @msg: the error message
  119. * @val: integer info
  120. *
  121. * Handle a fatal parser error, i.e. violating Well-Formedness constraints
  122. */
  123. static void LIBXML_ATTR_FORMAT(3,0)
  124. htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
  125. const char *msg, int val)
  126. {
  127. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  128. (ctxt->instate == XML_PARSER_EOF))
  129. return;
  130. if (ctxt != NULL)
  131. ctxt->errNo = error;
  132. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
  133. XML_ERR_ERROR, NULL, 0, NULL, NULL,
  134. NULL, val, 0, msg, val);
  135. if (ctxt != NULL)
  136. ctxt->wellFormed = 0;
  137. }
  138. /************************************************************************
  139. * *
  140. * Parser stacks related functions and macros *
  141. * *
  142. ************************************************************************/
  143. /**
  144. * htmlnamePush:
  145. * @ctxt: an HTML parser context
  146. * @value: the element name
  147. *
  148. * Pushes a new element name on top of the name stack
  149. *
  150. * Returns 0 in case of error, the index in the stack otherwise
  151. */
  152. static int
  153. htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
  154. {
  155. if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
  156. ctxt->html = 3;
  157. if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
  158. ctxt->html = 10;
  159. if (ctxt->nameNr >= ctxt->nameMax) {
  160. ctxt->nameMax *= 2;
  161. ctxt->nameTab = (const xmlChar * *)
  162. xmlRealloc((xmlChar * *)ctxt->nameTab,
  163. ctxt->nameMax *
  164. sizeof(ctxt->nameTab[0]));
  165. if (ctxt->nameTab == NULL) {
  166. htmlErrMemory(ctxt, NULL);
  167. return (0);
  168. }
  169. }
  170. ctxt->nameTab[ctxt->nameNr] = value;
  171. ctxt->name = value;
  172. return (ctxt->nameNr++);
  173. }
  174. /**
  175. * htmlnamePop:
  176. * @ctxt: an HTML parser context
  177. *
  178. * Pops the top element name from the name stack
  179. *
  180. * Returns the name just removed
  181. */
  182. static const xmlChar *
  183. htmlnamePop(htmlParserCtxtPtr ctxt)
  184. {
  185. const xmlChar *ret;
  186. if (ctxt->nameNr <= 0)
  187. return (NULL);
  188. ctxt->nameNr--;
  189. if (ctxt->nameNr < 0)
  190. return (NULL);
  191. if (ctxt->nameNr > 0)
  192. ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
  193. else
  194. ctxt->name = NULL;
  195. ret = ctxt->nameTab[ctxt->nameNr];
  196. ctxt->nameTab[ctxt->nameNr] = NULL;
  197. return (ret);
  198. }
  199. /**
  200. * htmlNodeInfoPush:
  201. * @ctxt: an HTML parser context
  202. * @value: the node info
  203. *
  204. * Pushes a new element name on top of the node info stack
  205. *
  206. * Returns 0 in case of error, the index in the stack otherwise
  207. */
  208. static int
  209. htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
  210. {
  211. if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
  212. if (ctxt->nodeInfoMax == 0)
  213. ctxt->nodeInfoMax = 5;
  214. ctxt->nodeInfoMax *= 2;
  215. ctxt->nodeInfoTab = (htmlParserNodeInfo *)
  216. xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
  217. ctxt->nodeInfoMax *
  218. sizeof(ctxt->nodeInfoTab[0]));
  219. if (ctxt->nodeInfoTab == NULL) {
  220. htmlErrMemory(ctxt, NULL);
  221. return (0);
  222. }
  223. }
  224. ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
  225. ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
  226. return (ctxt->nodeInfoNr++);
  227. }
  228. /**
  229. * htmlNodeInfoPop:
  230. * @ctxt: an HTML parser context
  231. *
  232. * Pops the top element name from the node info stack
  233. *
  234. * Returns 0 in case of error, the pointer to NodeInfo otherwise
  235. */
  236. static htmlParserNodeInfo *
  237. htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
  238. {
  239. if (ctxt->nodeInfoNr <= 0)
  240. return (NULL);
  241. ctxt->nodeInfoNr--;
  242. if (ctxt->nodeInfoNr < 0)
  243. return (NULL);
  244. if (ctxt->nodeInfoNr > 0)
  245. ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
  246. else
  247. ctxt->nodeInfo = NULL;
  248. return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
  249. }
  /*
   * Macros for accessing the content. Those should be used only by the parser,
   * and not exported. All of them expect a variable named `ctxt` (the parser
   * context) to be in scope at the point of use.
   *
   * Dirty macros, i.e. one needs to make assumptions on the context to use
   * them:
   *
   *   CUR_PTR   the current pointer to the xmlChar to be parsed.
   *   BASE_PTR  the base pointer of the current input.
   *   CUR       the current xmlChar value, as an int. It should be used only
   *             to compare against ASCII values.
   *   NXT(n)    the n'th next xmlChar. Same restriction as CUR.
   *   UPPER     the current xmlChar converted to uppercase.
   *   UPP(n)    the n'th next xmlChar converted to uppercase. Same
   *             restriction as CUR.
   *   SKIP(n)   skip n xmlChars and advance the column by n; must only be
   *             used to skip ASCII strings without newlines.
   *   SHRINK    shrink the input once more than 2 * INPUT_CHUNK bytes have
   *             been consumed and fewer than 2 * INPUT_CHUNK remain.
   *   GROW      outside progressive mode, grow the input when fewer than
   *             INPUT_CHUNK bytes remain.
   *
   * Clean macros, not dependent on an ASCII context:
   *
   *   CURRENT        the current xmlChar value, as an int.
   *   NEXT           skip to the next character (xmlNextChar).
   *   NEXTL(l)       skip the current character of l xmlChars, updating the
   *                  line/column counters and clearing ctxt->token.
   *   CUR_CHAR(l)    decode the current character, storing its length in l.
   *   CUR_SCHAR(s,l) decode the character at s, storing its length in l.
   *   COPY_BUF(l,b,i,v) append character v, of encoded length l, to buffer b
   *                  at index i and advance i.
   *
   * The statement-like macros are wrapped in do { } while (0) so that they
   * behave as a single statement, e.g. inside an unbraced if/else; they must
   * be followed by a semicolon.
   */
  #define UPPER (toupper(*ctxt->input->cur))
  #define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))
  #define NXT(val) ctxt->input->cur[(val)]
  #define UPP(val) (toupper(ctxt->input->cur[(val)]))
  #define CUR_PTR ctxt->input->cur
  #define BASE_PTR ctxt->input->base

  #define SHRINK do { \
      if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
          (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
          xmlParserInputShrink(ctxt->input); \
  } while (0)

  #define GROW do { \
      if ((ctxt->progressive == 0) && \
          (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
          xmlParserInputGrow(ctxt->input, INPUT_CHUNK); \
  } while (0)

  #define CURRENT ((int) (*ctxt->input->cur))

  #define SKIP_BLANKS htmlSkipBlankChars(ctxt)

  /* Imported from XML */
  #define CUR ((int) (*ctxt->input->cur))
  #define NEXT xmlNextChar(ctxt)

  #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

  #define NEXTL(l) do { \
      if (*(ctxt->input->cur) == '\n') { \
          ctxt->input->line++; ctxt->input->col = 1; \
      } else ctxt->input->col++; \
      ctxt->token = 0; ctxt->input->cur += (l); \
  } while (0)

  #define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
  #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

  #define COPY_BUF(l,b,i,v) do { \
      if ((l) == 1) (b)[(i)++] = (xmlChar) (v); \
      else (i) += xmlCopyChar((l), &(b)[(i)], (v)); \
  } while (0)
  313. /**
  314. * htmlFindEncoding:
  315. * @the HTML parser context
  316. *
  317. * Ty to find and encoding in the current data available in the input
  318. * buffer this is needed to try to switch to the proper encoding when
  319. * one face a character error.
  320. * That's an heuristic, since it's operating outside of parsing it could
  321. * try to use a meta which had been commented out, that's the reason it
  322. * should only be used in case of error, not as a default.
  323. *
  324. * Returns an encoding string or NULL if not found, the string need to
  325. * be freed
  326. */
  327. static xmlChar *
  328. htmlFindEncoding(xmlParserCtxtPtr ctxt) {
  329. const xmlChar *start, *cur, *end;
  330. if ((ctxt == NULL) || (ctxt->input == NULL) ||
  331. (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
  332. (ctxt->input->buf->encoder != NULL))
  333. return(NULL);
  334. if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
  335. return(NULL);
  336. start = ctxt->input->cur;
  337. end = ctxt->input->end;
  338. /* we also expect the input buffer to be zero terminated */
  339. if (*end != 0)
  340. return(NULL);
  341. cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
  342. if (cur == NULL)
  343. return(NULL);
  344. cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
  345. if (cur == NULL)
  346. return(NULL);
  347. cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
  348. if (cur == NULL)
  349. return(NULL);
  350. cur += 8;
  351. start = cur;
  352. while (((*cur >= 'A') && (*cur <= 'Z')) ||
  353. ((*cur >= 'a') && (*cur <= 'z')) ||
  354. ((*cur >= '0') && (*cur <= '9')) ||
  355. (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
  356. cur++;
  357. if (cur == start)
  358. return(NULL);
  359. return(xmlStrndup(start, cur - start));
  360. }
  361. /**
  362. * htmlCurrentChar:
  363. * @ctxt: the HTML parser context
  364. * @len: pointer to the length of the char read
  365. *
  366. * The current char value, if using UTF-8 this may actually span multiple
  367. * bytes in the input buffer. Implement the end of line normalization:
  368. * 2.11 End-of-Line Handling
  369. * If the encoding is unspecified, in the case we find an ISO-Latin-1
  370. * char, then the encoding converter is plugged in automatically.
  371. *
  372. * Returns the current char value and its length
  373. */
  374. static int
  375. htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
  376. const unsigned char *cur;
  377. unsigned char c;
  378. unsigned int val;
  379. if (ctxt->instate == XML_PARSER_EOF)
  380. return(0);
  381. if (ctxt->token != 0) {
  382. *len = 0;
  383. return(ctxt->token);
  384. }
  385. if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
  386. xmlChar * guess;
  387. xmlCharEncodingHandlerPtr handler;
  388. /*
  389. * Assume it's a fixed length encoding (1) with
  390. * a compatible encoding for the ASCII set, since
  391. * HTML constructs only use < 128 chars
  392. */
  393. if ((int) *ctxt->input->cur < 0x80) {
  394. *len = 1;
  395. if ((*ctxt->input->cur == 0) &&
  396. (ctxt->input->cur < ctxt->input->end)) {
  397. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  398. "Char 0x%X out of allowed range\n", 0);
  399. return(' ');
  400. }
  401. return((int) *ctxt->input->cur);
  402. }
  403. /*
  404. * Humm this is bad, do an automatic flow conversion
  405. */
  406. guess = htmlFindEncoding(ctxt);
  407. if (guess == NULL) {
  408. xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
  409. } else {
  410. if (ctxt->input->encoding != NULL)
  411. xmlFree((xmlChar *) ctxt->input->encoding);
  412. ctxt->input->encoding = guess;
  413. handler = xmlFindCharEncodingHandler((const char *) guess);
  414. if (handler != NULL) {
  415. /*
  416. * Don't use UTF-8 encoder which isn't required and
  417. * can produce invalid UTF-8.
  418. */
  419. if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
  420. xmlSwitchToEncoding(ctxt, handler);
  421. } else {
  422. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  423. "Unsupported encoding %s", guess, NULL);
  424. }
  425. }
  426. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  427. }
  428. /*
  429. * We are supposed to handle UTF8, check it's valid
  430. * From rfc2044: encoding of the Unicode values on UTF-8:
  431. *
  432. * UCS-4 range (hex.) UTF-8 octet sequence (binary)
  433. * 0000 0000-0000 007F 0xxxxxxx
  434. * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
  435. * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
  436. *
  437. * Check for the 0x110000 limit too
  438. */
  439. cur = ctxt->input->cur;
  440. c = *cur;
  441. if (c & 0x80) {
  442. if ((c & 0x40) == 0)
  443. goto encoding_error;
  444. if (cur[1] == 0) {
  445. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  446. cur = ctxt->input->cur;
  447. }
  448. if ((cur[1] & 0xc0) != 0x80)
  449. goto encoding_error;
  450. if ((c & 0xe0) == 0xe0) {
  451. if (cur[2] == 0) {
  452. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  453. cur = ctxt->input->cur;
  454. }
  455. if ((cur[2] & 0xc0) != 0x80)
  456. goto encoding_error;
  457. if ((c & 0xf0) == 0xf0) {
  458. if (cur[3] == 0) {
  459. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  460. cur = ctxt->input->cur;
  461. }
  462. if (((c & 0xf8) != 0xf0) ||
  463. ((cur[3] & 0xc0) != 0x80))
  464. goto encoding_error;
  465. /* 4-byte code */
  466. *len = 4;
  467. val = (cur[0] & 0x7) << 18;
  468. val |= (cur[1] & 0x3f) << 12;
  469. val |= (cur[2] & 0x3f) << 6;
  470. val |= cur[3] & 0x3f;
  471. if (val < 0x10000)
  472. goto encoding_error;
  473. } else {
  474. /* 3-byte code */
  475. *len = 3;
  476. val = (cur[0] & 0xf) << 12;
  477. val |= (cur[1] & 0x3f) << 6;
  478. val |= cur[2] & 0x3f;
  479. if (val < 0x800)
  480. goto encoding_error;
  481. }
  482. } else {
  483. /* 2-byte code */
  484. *len = 2;
  485. val = (cur[0] & 0x1f) << 6;
  486. val |= cur[1] & 0x3f;
  487. if (val < 0x80)
  488. goto encoding_error;
  489. }
  490. if (!IS_CHAR(val)) {
  491. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  492. "Char 0x%X out of allowed range\n", val);
  493. }
  494. return(val);
  495. } else {
  496. if ((*ctxt->input->cur == 0) &&
  497. (ctxt->input->cur < ctxt->input->end)) {
  498. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  499. "Char 0x%X out of allowed range\n", 0);
  500. *len = 1;
  501. return(' ');
  502. }
  503. /* 1-byte code */
  504. *len = 1;
  505. return((int) *ctxt->input->cur);
  506. }
  507. encoding_error:
  508. /*
  509. * If we detect an UTF8 error that probably mean that the
  510. * input encoding didn't get properly advertised in the
  511. * declaration header. Report the error and switch the encoding
  512. * to ISO-Latin-1 (if you don't like this policy, just declare the
  513. * encoding !)
  514. */
  515. {
  516. char buffer[150];
  517. if (ctxt->input->end - ctxt->input->cur >= 4) {
  518. snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
  519. ctxt->input->cur[0], ctxt->input->cur[1],
  520. ctxt->input->cur[2], ctxt->input->cur[3]);
  521. } else {
  522. snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
  523. }
  524. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  525. "Input is not proper UTF-8, indicate encoding !\n",
  526. BAD_CAST buffer, NULL);
  527. }
  528. /*
  529. * Don't switch encodings twice. Note that if there's an encoder, we
  530. * shouldn't receive invalid UTF-8 anyway.
  531. *
  532. * Note that if ctxt->input->buf == NULL, switching encodings is
  533. * impossible, see Gitlab issue #34.
  534. */
  535. if ((ctxt->input->buf != NULL) &&
  536. (ctxt->input->buf->encoder == NULL))
  537. xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
  538. *len = 1;
  539. return((int) *ctxt->input->cur);
  540. }
  541. /**
  542. * htmlSkipBlankChars:
  543. * @ctxt: the HTML parser context
  544. *
  545. * skip all blanks character found at that point in the input streams.
  546. *
  547. * Returns the number of space chars skipped
  548. */
  549. static int
  550. htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
  551. int res = 0;
  552. while (IS_BLANK_CH(*(ctxt->input->cur))) {
  553. if ((*ctxt->input->cur == 0) &&
  554. (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
  555. xmlPopInput(ctxt);
  556. } else {
  557. if (*(ctxt->input->cur) == '\n') {
  558. ctxt->input->line++; ctxt->input->col = 1;
  559. } else ctxt->input->col++;
  560. ctxt->input->cur++;
  561. if (*ctxt->input->cur == 0)
  562. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  563. }
  564. res++;
  565. }
  566. return(res);
  567. }
  568. /************************************************************************
  569. * *
  570. * The list of HTML elements and their properties *
  571. * *
  572. ************************************************************************/
  /*
   * Legend for the fields of the element description table:
   *
   * Start Tag: 1 means the start tag can be omitted
   * End Tag:   1 means the end tag can be omitted
   *            2 means it's forbidden (empty elements)
   *            3 means the tag is stylistic and should be closed easily
   * Depr:      this element is deprecated
   * DTD:       1 means that this element is valid only in the Loose DTD
   *            2 means that this element is valid only in the Frameset DTD
   *
   * Name, Start Tag, End Tag, Save End, Empty, Deprecated, DTD, inline,
   * Description, subElements, impliedsubelt, Attributes, userdata
   */

  /*
   * Definitions and a couple of vars for HTML Elements.
   *
   * Each group macro expands to a comma separated list of element names and
   * is paired with an NB_ macro giving the number of names in that list. The
   * composite NB_ macros are parenthesised so that they can be used safely
   * inside larger expressions.
   */
  #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
  #define NB_FONTSTYLE 8
  #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
  #define NB_PHRASE 10
  #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
  #define NB_SPECIAL 16
  #define FORMCTRL "input", "select", "textarea", "label", "button"
  #define NB_FORMCTRL 5
  #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
  #define NB_HEADING 6
  #define LIST "ul", "ol", "dir", "menu"
  #define NB_LIST 4

  /* Empty groups, kept as placeholders. */
  #define PCDATA
  #define NB_PCDATA 0
  #define MODIFIER
  #define NB_MODIFIER 0

  #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
  #define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
  #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
  #define NB_BLOCK (NB_HEADING + NB_LIST + 14)
  #define FLOW BLOCK,INLINE
  #define NB_FLOW (NB_BLOCK + NB_INLINE)

  /* Marks an element without any sub-element list. */
  #define EMPTY NULL

  /* NULL-terminated lists of element names. */
  static const char* const html_flow[] = { FLOW, NULL } ;
  static const char* const html_inline[] = { INLINE, NULL } ;

  /* placeholders: elts with content but no subelements */
  static const char* const html_pcdata[] = { NULL } ;
  #define html_cdata html_pcdata
  /*
   * ... and for HTML Attributes.
   *
   * Each group macro expands to a comma separated list of attribute names and
   * is paired with an NB_ macro giving the number of names in that list.
   */
  #define COREATTRS "id", "class", "style", "title"
  #define NB_COREATTRS 4
  #define I18N "lang", "dir"
  #define NB_I18N 2
  #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
  #define NB_EVENTS 9
  #define ATTRS COREATTRS,I18N,EVENTS
  /* Parenthesised so the sum stays intact inside larger expressions. */
  #define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
  #define CELLHALIGN "align", "char", "charoff"
  #define NB_CELLHALIGN 3
  #define CELLVALIGN "valign"
  #define NB_CELLVALIGN 1
  627. static const char* const html_attrs[] = { ATTRS, NULL } ;
  628. static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
  629. static const char* const core_attrs[] = { COREATTRS, NULL } ;
  630. static const char* const i18n_attrs[] = { I18N, NULL } ;
  631. /* Other declarations that should go inline ... */
  632. static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
  633. "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
  634. "tabindex", "onfocus", "onblur", NULL } ;
  635. static const char* const target_attr[] = { "target", NULL } ;
  636. static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
  637. static const char* const alt_attr[] = { "alt", NULL } ;
  638. static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
  639. static const char* const href_attrs[] = { "href", NULL } ;
  640. static const char* const clear_attrs[] = { "clear", NULL } ;
  641. static const char* const inline_p[] = { INLINE, "p", NULL } ;
  642. static const char* const flow_param[] = { FLOW, "param", NULL } ;
  643. static const char* const applet_attrs[] = { COREATTRS , "codebase",
  644. "archive", "alt", "name", "height", "width", "align",
  645. "hspace", "vspace", NULL } ;
  646. static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
  647. "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
  648. static const char* const basefont_attrs[] =
  649. { "id", "size", "color", "face", NULL } ;
  650. static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
  651. static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
  652. static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
  653. static const char* const body_depr[] = { "background", "bgcolor", "text",
  654. "link", "vlink", "alink", NULL } ;
  655. static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
  656. "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
  657. static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
  658. static const char* const col_elt[] = { "col", NULL } ;
  659. static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
  660. static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
  661. static const char* const dl_contents[] = { "dt", "dd", NULL } ;
  662. static const char* const compact_attr[] = { "compact", NULL } ;
  663. static const char* const label_attr[] = { "label", NULL } ;
  664. static const char* const fieldset_contents[] = { FLOW, "legend" } ;
  665. static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
  666. static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
  667. static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
  668. static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
  669. static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
  670. static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
  671. static const char* const head_attrs[] = { I18N, "profile", NULL } ;
  672. static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
  673. static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
  674. static const char* const version_attr[] = { "version", NULL } ;
  675. static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
  676. static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
  677. static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
  678. static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
  679. static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
  680. static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
  681. static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
  682. static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
  683. static const char* const align_attr[] = { "align", NULL } ;
  684. static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
  685. static const char* const map_contents[] = { BLOCK, "area", NULL } ;
  686. static const char* const name_attr[] = { "name", NULL } ;
  687. static const char* const action_attr[] = { "action", NULL } ;
  688. static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
  689. static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
  690. static const char* const content_attr[] = { "content", NULL } ;
  691. static const char* const type_attr[] = { "type", NULL } ;
  692. static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
  693. static const char* const object_contents[] = { FLOW, "param", NULL } ;
  694. static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
  695. static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
  696. static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
  697. static const char* const option_elt[] = { "option", NULL } ;
  698. static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
  699. static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
  700. static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
  701. static const char* const width_attr[] = { "width", NULL } ;
  702. static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
  703. static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
  704. static const char* const language_attr[] = { "language", NULL } ;
  705. static const char* const select_content[] = { "optgroup", "option", NULL } ;
  706. static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
  707. static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
  708. static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
  709. static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
  710. static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
  711. static const char* const tr_elt[] = { "tr", NULL } ;
  712. static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
  713. static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
  714. static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
  715. static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
  716. static const char* const tr_contents[] = { "th", "td", NULL } ;
  717. static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
  718. static const char* const li_elt[] = { "li", NULL } ;
  719. static const char* const ul_depr[] = { "type", "compact", NULL} ;
  720. static const char* const dir_attr[] = { "dir", NULL} ;
  721. #define DECL (const char**)
  722. static const htmlElemDesc
  723. html40ElementTable[] = {
  724. { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
  725. DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
  726. },
  727. { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
  728. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  729. },
  730. { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
  731. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  732. },
  733. { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
  734. DECL inline_p , NULL , DECL html_attrs, NULL, NULL
  735. },
  736. { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
  737. DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
  738. },
  739. { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
  740. EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
  741. },
  742. { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
  743. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  744. },
  745. { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
  746. EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
  747. },
  748. { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
  749. EMPTY , NULL , NULL, DECL basefont_attrs, NULL
  750. },
  751. { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
  752. DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
  753. },
  754. { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
  755. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  756. },
  757. { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
  758. DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
  759. },
  760. { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
  761. DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
  762. },
  763. { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
  764. EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
  765. },
  766. { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
  767. DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
  768. },
  769. { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
  770. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  771. },
  772. { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
  773. DECL html_flow , NULL , NULL, DECL html_attrs, NULL
  774. },
  775. { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
  776. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  777. },
  778. { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
  779. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  780. },
  781. { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
  782. EMPTY , NULL , DECL col_attrs , NULL, NULL
  783. },
  784. { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
  785. DECL col_elt , "col" , DECL col_attrs , NULL, NULL
  786. },
  787. { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
  788. DECL html_flow , NULL , DECL html_attrs, NULL, NULL
  789. },
  790. { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
  791. DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
  792. },
  793. { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
  794. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  795. },
  796. { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
  797. DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
  798. },
  799. { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
  800. DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
  801. },
  802. { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
  803. DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
  804. },
  805. { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
  806. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  807. },
  808. { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
  809. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  810. },
  811. { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
  812. EMPTY, NULL, DECL embed_attrs, NULL, NULL
  813. },
  814. { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
  815. DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
  816. },
  817. { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
  818. DECL html_inline, NULL, NULL, DECL font_attrs, NULL
  819. },
  820. { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
  821. DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
  822. },
  823. { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
  824. EMPTY, NULL, NULL, DECL frame_attrs, NULL
  825. },
  826. { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
  827. DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
  828. },
  829. { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
  830. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  831. },
  832. { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
  833. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  834. },
  835. { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
  836. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  837. },
  838. { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
  839. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  840. },
  841. { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
  842. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  843. },
  844. { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
  845. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  846. },
  847. { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
  848. DECL head_contents, NULL, DECL head_attrs, NULL, NULL
  849. },
  850. { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
  851. EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
  852. },
  853. { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
  854. DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
  855. },
  856. { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
  857. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  858. },
  859. { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
  860. DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
  861. },
  862. { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
  863. EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
  864. },
  865. { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
  866. EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
  867. },
  868. { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
  869. DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
  870. },
  871. { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
  872. EMPTY, NULL, NULL, DECL prompt_attrs, NULL
  873. },
  874. { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
  875. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  876. },
  877. { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
  878. DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
  879. },
  880. { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
  881. DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
  882. },
  883. { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
  884. DECL html_flow, NULL, DECL html_attrs, NULL, NULL
  885. },
  886. { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
  887. EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
  888. },
  889. { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
  890. DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
  891. },
  892. { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
  893. DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
  894. },
  895. { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
  896. EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
  897. },
  898. { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
  899. DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
  900. },
  901. { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
  902. DECL html_flow, "div", DECL html_attrs, NULL, NULL
  903. },
  904. { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
  905. DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
  906. },
  907. { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
  908. DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
  909. },
  910. { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
  911. DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
  912. },
  913. { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
  914. DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
  915. },
  916. { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
  917. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  918. },
  919. { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
  920. EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
  921. },
  922. { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
  923. DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
  924. },
  925. { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
  926. DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
  927. },
  928. { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
  929. DECL html_inline, NULL, NULL, DECL html_attrs, NULL
  930. },
  931. { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
  932. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  933. },
  934. { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
  935. DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
  936. },
  937. { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
  938. DECL select_content, NULL, DECL select_attrs, NULL, NULL
  939. },
  940. { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
  941. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  942. },
  943. { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
  944. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  945. },
  946. { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
  947. DECL html_inline, NULL, NULL, DECL html_attrs, NULL
  948. },
  949. { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
  950. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  951. },
  952. { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
  953. DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
  954. },
  955. { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
  956. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  957. },
  958. { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
  959. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  960. },
  961. { "table", 0, 0, 0, 0, 0, 0, 0, "",
  962. DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
  963. },
  964. { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
  965. DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
  966. },
  967. { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
  968. DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
  969. },
  970. { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
  971. DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
  972. },
  973. { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
  974. DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
  975. },
  976. { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
  977. DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
  978. },
  979. { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
  980. DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
  981. },
  982. { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
  983. DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
  984. },
  985. { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
  986. DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
  987. },
  988. { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
  989. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  990. },
  991. { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
  992. DECL html_inline, NULL, NULL, DECL html_attrs, NULL
  993. },
  994. { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
  995. DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
  996. },
  997. { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
  998. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  999. }
  1000. };
  1001. typedef struct {
  1002. const char *oldTag;
  1003. const char *newTag;
  1004. } htmlStartCloseEntry;
  1005. /*
  1006. * start tags that imply the end of current element
  1007. */
  1008. static const htmlStartCloseEntry htmlStartClose[] = {
  1009. { "a", "a" },
  1010. { "a", "fieldset" },
  1011. { "a", "table" },
  1012. { "a", "td" },
  1013. { "a", "th" },
  1014. { "address", "dd" },
  1015. { "address", "dl" },
  1016. { "address", "dt" },
  1017. { "address", "form" },
  1018. { "address", "li" },
  1019. { "address", "ul" },
  1020. { "b", "center" },
  1021. { "b", "p" },
  1022. { "b", "td" },
  1023. { "b", "th" },
  1024. { "big", "p" },
  1025. { "caption", "col" },
  1026. { "caption", "colgroup" },
  1027. { "caption", "tbody" },
  1028. { "caption", "tfoot" },
  1029. { "caption", "thead" },
  1030. { "caption", "tr" },
  1031. { "col", "col" },
  1032. { "col", "colgroup" },
  1033. { "col", "tbody" },
  1034. { "col", "tfoot" },
  1035. { "col", "thead" },
  1036. { "col", "tr" },
  1037. { "colgroup", "colgroup" },
  1038. { "colgroup", "tbody" },
  1039. { "colgroup", "tfoot" },
  1040. { "colgroup", "thead" },
  1041. { "colgroup", "tr" },
  1042. { "dd", "dt" },
  1043. { "dir", "dd" },
  1044. { "dir", "dl" },
  1045. { "dir", "dt" },
  1046. { "dir", "form" },
  1047. { "dir", "ul" },
  1048. { "dl", "form" },
  1049. { "dl", "li" },
  1050. { "dt", "dd" },
  1051. { "dt", "dl" },
  1052. { "font", "center" },
  1053. { "font", "td" },
  1054. { "font", "th" },
  1055. { "form", "form" },
  1056. { "h1", "fieldset" },
  1057. { "h1", "form" },
  1058. { "h1", "li" },
  1059. { "h1", "p" },
  1060. { "h1", "table" },
  1061. { "h2", "fieldset" },
  1062. { "h2", "form" },
  1063. { "h2", "li" },
  1064. { "h2", "p" },
  1065. { "h2", "table" },
  1066. { "h3", "fieldset" },
  1067. { "h3", "form" },
  1068. { "h3", "li" },
  1069. { "h3", "p" },
  1070. { "h3", "table" },
  1071. { "h4", "fieldset" },
  1072. { "h4", "form" },
  1073. { "h4", "li" },
  1074. { "h4", "p" },
  1075. { "h4", "table" },
  1076. { "h5", "fieldset" },
  1077. { "h5", "form" },
  1078. { "h5", "li" },
  1079. { "h5", "p" },
  1080. { "h5", "table" },
  1081. { "h6", "fieldset" },
  1082. { "h6", "form" },
  1083. { "h6", "li" },
  1084. { "h6", "p" },
  1085. { "h6", "table" },
  1086. { "head", "a" },
  1087. { "head", "abbr" },
  1088. { "head", "acronym" },
  1089. { "head", "address" },
  1090. { "head", "b" },
  1091. { "head", "bdo" },
  1092. { "head", "big" },
  1093. { "head", "blockquote" },
  1094. { "head", "body" },
  1095. { "head", "br" },
  1096. { "head", "center" },
  1097. { "head", "cite" },
  1098. { "head", "code" },
  1099. { "head", "dd" },
  1100. { "head", "dfn" },
  1101. { "head", "dir" },
  1102. { "head", "div" },
  1103. { "head", "dl" },
  1104. { "head", "dt" },
  1105. { "head", "em" },
  1106. { "head", "fieldset" },
  1107. { "head", "font" },
  1108. { "head", "form" },
  1109. { "head", "frameset" },
  1110. { "head", "h1" },
  1111. { "head", "h2" },
  1112. { "head", "h3" },
  1113. { "head", "h4" },
  1114. { "head", "h5" },
  1115. { "head", "h6" },
  1116. { "head", "hr" },
  1117. { "head", "i" },
  1118. { "head", "iframe" },
  1119. { "head", "img" },
  1120. { "head", "kbd" },
  1121. { "head", "li" },
  1122. { "head", "listing" },
  1123. { "head", "map" },
  1124. { "head", "menu" },
  1125. { "head", "ol" },
  1126. { "head", "p" },
  1127. { "head", "pre" },
  1128. { "head", "q" },
  1129. { "head", "s" },
  1130. { "head", "samp" },
  1131. { "head", "small" },
  1132. { "head", "span" },
  1133. { "head", "strike" },
  1134. { "head", "strong" },
  1135. { "head", "sub" },
  1136. { "head", "sup" },
  1137. { "head", "table" },
  1138. { "head", "tt" },
  1139. { "head", "u" },
  1140. { "head", "ul" },
  1141. { "head", "var" },
  1142. { "head", "xmp" },
  1143. { "hr", "form" },
  1144. { "i", "center" },
  1145. { "i", "p" },
  1146. { "i", "td" },
  1147. { "i", "th" },
  1148. { "legend", "fieldset" },
  1149. { "li", "li" },
  1150. { "link", "body" },
  1151. { "link", "frameset" },
  1152. { "listing", "dd" },
  1153. { "listing", "dl" },
  1154. { "listing", "dt" },
  1155. { "listing", "fieldset" },
  1156. { "listing", "form" },
  1157. { "listing", "li" },
  1158. { "listing", "table" },
  1159. { "listing", "ul" },
  1160. { "menu", "dd" },
  1161. { "menu", "dl" },
  1162. { "menu", "dt" },
  1163. { "menu", "form" },
  1164. { "menu", "ul" },
  1165. { "ol", "form" },
  1166. { "ol", "ul" },
  1167. { "option", "optgroup" },
  1168. { "option", "option" },
  1169. { "p", "address" },
  1170. { "p", "blockquote" },
  1171. { "p", "body" },
  1172. { "p", "caption" },
  1173. { "p", "center" },
  1174. { "p", "col" },
  1175. { "p", "colgroup" },
  1176. { "p", "dd" },
  1177. { "p", "dir" },
  1178. { "p", "div" },
  1179. { "p", "dl" },
  1180. { "p", "dt" },
  1181. { "p", "fieldset" },
  1182. { "p", "form" },
  1183. { "p", "frameset" },
  1184. { "p", "h1" },
  1185. { "p", "h2" },
  1186. { "p", "h3" },
  1187. { "p", "h4" },
  1188. { "p", "h5" },
  1189. { "p", "h6" },
  1190. { "p", "head" },
  1191. { "p", "hr" },
  1192. { "p", "li" },
  1193. { "p", "listing" },
  1194. { "p", "menu" },
  1195. { "p", "ol" },
  1196. { "p", "p" },
  1197. { "p", "pre" },
  1198. { "p", "table" },
  1199. { "p", "tbody" },
  1200. { "p", "td" },
  1201. { "p", "tfoot" },
  1202. { "p", "th" },
  1203. { "p", "title" },
  1204. { "p", "tr" },
  1205. { "p", "ul" },
  1206. { "p", "xmp" },
  1207. { "pre", "dd" },
  1208. { "pre", "dl" },
  1209. { "pre", "dt" },
  1210. { "pre", "fieldset" },
  1211. { "pre", "form" },
  1212. { "pre", "li" },
  1213. { "pre", "table" },
  1214. { "pre", "ul" },
  1215. { "s", "p" },
  1216. { "script", "noscript" },
  1217. { "small", "p" },
  1218. { "span", "td" },
  1219. { "span", "th" },
  1220. { "strike", "p" },
  1221. { "style", "body" },
  1222. { "style", "frameset" },
  1223. { "tbody", "tbody" },
  1224. { "tbody", "tfoot" },
  1225. { "td", "tbody" },
  1226. { "td", "td" },
  1227. { "td", "tfoot" },
  1228. { "td", "th" },
  1229. { "td", "tr" },
  1230. { "tfoot", "tbody" },
  1231. { "th", "tbody" },
  1232. { "th", "td" },
  1233. { "th", "tfoot" },
  1234. { "th", "th" },
  1235. { "th", "tr" },
  1236. { "thead", "tbody" },
  1237. { "thead", "tfoot" },
  1238. { "title", "body" },
  1239. { "title", "frameset" },
  1240. { "tr", "tbody" },
  1241. { "tr", "tfoot" },
  1242. { "tr", "tr" },
  1243. { "tt", "p" },
  1244. { "u", "p" },
  1245. { "u", "td" },
  1246. { "u", "th" },
  1247. { "ul", "address" },
  1248. { "ul", "form" },
  1249. { "ul", "menu" },
  1250. { "ul", "ol" },
  1251. { "ul", "pre" },
  1252. { "xmp", "dd" },
  1253. { "xmp", "dl" },
  1254. { "xmp", "dt" },
  1255. { "xmp", "fieldset" },
  1256. { "xmp", "form" },
  1257. { "xmp", "li" },
  1258. { "xmp", "table" },
  1259. { "xmp", "ul" }
  1260. };
  1261. /*
  1262. * The list of HTML elements which are supposed not to have
  1263. * CDATA content and where a p element will be implied
  1264. *
  1265. * TODO: extend that list by reading the HTML SGML DTD on
  1266. * implied paragraph
  1267. */
  1268. static const char *const htmlNoContentElements[] = {
  1269. "html",
  1270. "head",
  1271. NULL
  1272. };
  1273. /*
  1274. * The list of HTML attributes which are of content %Script;
  1275. * NOTE: when adding ones, check htmlIsScriptAttribute() since
  1276. * it assumes the name starts with 'on'
  1277. */
  1278. static const char *const htmlScriptAttributes[] = {
  1279. "onclick",
  1280. "ondblclick",
  1281. "onmousedown",
  1282. "onmouseup",
  1283. "onmouseover",
  1284. "onmousemove",
  1285. "onmouseout",
  1286. "onkeypress",
  1287. "onkeydown",
  1288. "onkeyup",
  1289. "onload",
  1290. "onunload",
  1291. "onfocus",
  1292. "onblur",
  1293. "onsubmit",
  1294. "onreset",
  1295. "onchange",
  1296. "onselect"
  1297. };
  1298. /*
  1299. * This table is used by the htmlparser to know what to do with
  1300. * broken html pages. By assigning different priorities to different
  1301. * elements the parser can decide how to handle extra endtags.
  1302. * Endtags are only allowed to close elements with lower or equal
  1303. * priority.
  1304. */
  1305. typedef struct {
  1306. const char *name;
  1307. int priority;
  1308. } elementPriority;
  1309. static const elementPriority htmlEndPriority[] = {
  1310. {"div", 150},
  1311. {"td", 160},
  1312. {"th", 160},
  1313. {"tr", 170},
  1314. {"thead", 180},
  1315. {"tbody", 180},
  1316. {"tfoot", 180},
  1317. {"table", 190},
  1318. {"head", 200},
  1319. {"body", 200},
  1320. {"html", 220},
  1321. {NULL, 100} /* Default priority */
  1322. };
  1323. /************************************************************************
  1324. * *
  1325. * functions to handle HTML specific data *
  1326. * *
  1327. ************************************************************************/
  1328. /**
  1329. * htmlInitAutoClose:
  1330. *
  1331. * This is a no-op now.
  1332. */
  1333. void
  1334. htmlInitAutoClose(void) {
  1335. }
  1336. static int __cdecl
  1337. htmlCompareTags(const void *key, const void *member) {
  1338. const xmlChar *tag = (const xmlChar *) key;
  1339. const htmlElemDesc *desc = (const htmlElemDesc *) member;
  1340. return(xmlStrcasecmp(tag, BAD_CAST desc->name));
  1341. }
  1342. /**
  1343. * htmlTagLookup:
  1344. * @tag: The tag name in lowercase
  1345. *
  1346. * Lookup the HTML tag in the ElementTable
  1347. *
  1348. * Returns the related htmlElemDescPtr or NULL if not found.
  1349. */
  1350. const htmlElemDesc *
  1351. htmlTagLookup(const xmlChar *tag) {
  1352. if (tag == NULL)
  1353. return(NULL);
  1354. return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
  1355. sizeof(html40ElementTable) / sizeof(htmlElemDesc),
  1356. sizeof(htmlElemDesc), htmlCompareTags));
  1357. }
  1358. /**
  1359. * htmlGetEndPriority:
  1360. * @name: The name of the element to look up the priority for.
  1361. *
  1362. * Return value: The "endtag" priority.
  1363. **/
  1364. static int
  1365. htmlGetEndPriority (const xmlChar *name) {
  1366. int i = 0;
  1367. while ((htmlEndPriority[i].name != NULL) &&
  1368. (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
  1369. i++;
  1370. return(htmlEndPriority[i].priority);
  1371. }
  1372. static int __cdecl
  1373. htmlCompareStartClose(const void *vkey, const void *member) {
  1374. const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
  1375. const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
  1376. int ret;
  1377. ret = strcmp(key->oldTag, entry->oldTag);
  1378. if (ret == 0)
  1379. ret = strcmp(key->newTag, entry->newTag);
  1380. return(ret);
  1381. }
  1382. /**
  1383. * htmlCheckAutoClose:
  1384. * @newtag: The new tag name
  1385. * @oldtag: The old tag name
  1386. *
  1387. * Checks whether the new tag is one of the registered valid tags for
  1388. * closing old.
  1389. *
  1390. * Returns 0 if no, 1 if yes.
  1391. */
  1392. static int
  1393. htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
  1394. {
  1395. htmlStartCloseEntry key;
  1396. void *res;
  1397. key.oldTag = (const char *) oldtag;
  1398. key.newTag = (const char *) newtag;
  1399. res = bsearch(&key, htmlStartClose,
  1400. sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
  1401. sizeof(htmlStartCloseEntry), htmlCompareStartClose);
  1402. return(res != NULL);
  1403. }
  1404. /**
  1405. * htmlAutoCloseOnClose:
  1406. * @ctxt: an HTML parser context
  1407. * @newtag: The new tag name
  1408. * @force: force the tag closure
  1409. *
  1410. * The HTML DTD allows an ending tag to implicitly close other tags.
  1411. */
  1412. static void
  1413. htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
  1414. {
  1415. const htmlElemDesc *info;
  1416. int i, priority;
  1417. priority = htmlGetEndPriority(newtag);
  1418. for (i = (ctxt->nameNr - 1); i >= 0; i--) {
  1419. if (xmlStrEqual(newtag, ctxt->nameTab[i]))
  1420. break;
  1421. /*
  1422. * A misplaced endtag can only close elements with lower
  1423. * or equal priority, so if we find an element with higher
  1424. * priority before we find an element with
  1425. * matching name, we just ignore this endtag
  1426. */
  1427. if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
  1428. return;
  1429. }
  1430. if (i < 0)
  1431. return;
  1432. while (!xmlStrEqual(newtag, ctxt->name)) {
  1433. info = htmlTagLookup(ctxt->name);
  1434. if ((info != NULL) && (info->endTag == 3)) {
  1435. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  1436. "Opening and ending tag mismatch: %s and %s\n",
  1437. newtag, ctxt->name);
  1438. }
  1439. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1440. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1441. htmlnamePop(ctxt);
  1442. }
  1443. }
  1444. /**
  1445. * htmlAutoCloseOnEnd:
  1446. * @ctxt: an HTML parser context
  1447. *
  1448. * Close all remaining tags at the end of the stream
  1449. */
  1450. static void
  1451. htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
  1452. {
  1453. int i;
  1454. if (ctxt->nameNr == 0)
  1455. return;
  1456. for (i = (ctxt->nameNr - 1); i >= 0; i--) {
  1457. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1458. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1459. htmlnamePop(ctxt);
  1460. }
  1461. }
  1462. /**
  1463. * htmlAutoClose:
  1464. * @ctxt: an HTML parser context
  1465. * @newtag: The new tag name or NULL
  1466. *
  1467. * The HTML DTD allows a tag to implicitly close other tags.
  1468. * The list is kept in htmlStartClose array. This function is
  1469. * called when a new tag has been detected and generates the
  1470. * appropriates closes if possible/needed.
  1471. * If newtag is NULL this mean we are at the end of the resource
  1472. * and we should check
  1473. */
  1474. static void
  1475. htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
  1476. {
  1477. while ((newtag != NULL) && (ctxt->name != NULL) &&
  1478. (htmlCheckAutoClose(newtag, ctxt->name))) {
  1479. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1480. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1481. htmlnamePop(ctxt);
  1482. }
  1483. if (newtag == NULL) {
  1484. htmlAutoCloseOnEnd(ctxt);
  1485. return;
  1486. }
  1487. while ((newtag == NULL) && (ctxt->name != NULL) &&
  1488. ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
  1489. (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
  1490. (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
  1491. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1492. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1493. htmlnamePop(ctxt);
  1494. }
  1495. }
  1496. /**
  1497. * htmlAutoCloseTag:
  1498. * @doc: the HTML document
  1499. * @name: The tag name
  1500. * @elem: the HTML element
  1501. *
  1502. * The HTML DTD allows a tag to implicitly close other tags.
  1503. * The list is kept in htmlStartClose array. This function checks
  1504. * if the element or one of it's children would autoclose the
  1505. * given tag.
  1506. *
  1507. * Returns 1 if autoclose, 0 otherwise
  1508. */
  1509. int
  1510. htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
  1511. htmlNodePtr child;
  1512. if (elem == NULL) return(1);
  1513. if (xmlStrEqual(name, elem->name)) return(0);
  1514. if (htmlCheckAutoClose(elem->name, name)) return(1);
  1515. child = elem->children;
  1516. while (child != NULL) {
  1517. if (htmlAutoCloseTag(doc, name, child)) return(1);
  1518. child = child->next;
  1519. }
  1520. return(0);
  1521. }
  1522. /**
  1523. * htmlIsAutoClosed:
  1524. * @doc: the HTML document
  1525. * @elem: the HTML element
  1526. *
  1527. * The HTML DTD allows a tag to implicitly close other tags.
  1528. * The list is kept in htmlStartClose array. This function checks
  1529. * if a tag is autoclosed by one of it's child
  1530. *
  1531. * Returns 1 if autoclosed, 0 otherwise
  1532. */
  1533. int
  1534. htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
  1535. htmlNodePtr child;
  1536. if (elem == NULL) return(1);
  1537. child = elem->children;
  1538. while (child != NULL) {
  1539. if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
  1540. child = child->next;
  1541. }
  1542. return(0);
  1543. }
  1544. /**
  1545. * htmlCheckImplied:
  1546. * @ctxt: an HTML parser context
  1547. * @newtag: The new tag name
  1548. *
  1549. * The HTML DTD allows a tag to exists only implicitly
  1550. * called when a new tag has been detected and generates the
  1551. * appropriates implicit tags if missing
  1552. */
  1553. static void
  1554. htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
  1555. int i;
  1556. if (ctxt->options & HTML_PARSE_NOIMPLIED)
  1557. return;
  1558. if (!htmlOmittedDefaultValue)
  1559. return;
  1560. if (xmlStrEqual(newtag, BAD_CAST"html"))
  1561. return;
  1562. if (ctxt->nameNr <= 0) {
  1563. htmlnamePush(ctxt, BAD_CAST"html");
  1564. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1565. ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
  1566. }
  1567. if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
  1568. return;
  1569. if ((ctxt->nameNr <= 1) &&
  1570. ((xmlStrEqual(newtag, BAD_CAST"script")) ||
  1571. (xmlStrEqual(newtag, BAD_CAST"style")) ||
  1572. (xmlStrEqual(newtag, BAD_CAST"meta")) ||
  1573. (xmlStrEqual(newtag, BAD_CAST"link")) ||
  1574. (xmlStrEqual(newtag, BAD_CAST"title")) ||
  1575. (xmlStrEqual(newtag, BAD_CAST"base")))) {
  1576. if (ctxt->html >= 3) {
  1577. /* we already saw or generated an <head> before */
  1578. return;
  1579. }
  1580. /*
  1581. * dropped OBJECT ... i you put it first BODY will be
  1582. * assumed !
  1583. */
  1584. htmlnamePush(ctxt, BAD_CAST"head");
  1585. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1586. ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
  1587. } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
  1588. (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
  1589. (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
  1590. if (ctxt->html >= 10) {
  1591. /* we already saw or generated a <body> before */
  1592. return;
  1593. }
  1594. for (i = 0;i < ctxt->nameNr;i++) {
  1595. if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
  1596. return;
  1597. }
  1598. if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
  1599. return;
  1600. }
  1601. }
  1602. htmlnamePush(ctxt, BAD_CAST"body");
  1603. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1604. ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
  1605. }
  1606. }
  1607. /**
  1608. * htmlCheckParagraph
  1609. * @ctxt: an HTML parser context
  1610. *
  1611. * Check whether a p element need to be implied before inserting
  1612. * characters in the current element.
  1613. *
  1614. * Returns 1 if a paragraph has been inserted, 0 if not and -1
  1615. * in case of error.
  1616. */
  1617. static int
  1618. htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
  1619. const xmlChar *tag;
  1620. int i;
  1621. if (ctxt == NULL)
  1622. return(-1);
  1623. tag = ctxt->name;
  1624. if (tag == NULL) {
  1625. htmlAutoClose(ctxt, BAD_CAST"p");
  1626. htmlCheckImplied(ctxt, BAD_CAST"p");
  1627. htmlnamePush(ctxt, BAD_CAST"p");
  1628. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1629. ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
  1630. return(1);
  1631. }
  1632. if (!htmlOmittedDefaultValue)
  1633. return(0);
  1634. for (i = 0; htmlNoContentElements[i] != NULL; i++) {
  1635. if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
  1636. htmlAutoClose(ctxt, BAD_CAST"p");
  1637. htmlCheckImplied(ctxt, BAD_CAST"p");
  1638. htmlnamePush(ctxt, BAD_CAST"p");
  1639. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1640. ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
  1641. return(1);
  1642. }
  1643. }
  1644. return(0);
  1645. }
  1646. /**
  1647. * htmlIsScriptAttribute:
  1648. * @name: an attribute name
  1649. *
  1650. * Check if an attribute is of content type Script
  1651. *
  1652. * Returns 1 is the attribute is a script 0 otherwise
  1653. */
  1654. int
  1655. htmlIsScriptAttribute(const xmlChar *name) {
  1656. unsigned int i;
  1657. if (name == NULL)
  1658. return(0);
  1659. /*
  1660. * all script attributes start with 'on'
  1661. */
  1662. if ((name[0] != 'o') || (name[1] != 'n'))
  1663. return(0);
  1664. for (i = 0;
  1665. i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
  1666. i++) {
  1667. if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
  1668. return(1);
  1669. }
  1670. return(0);
  1671. }
  1672. /************************************************************************
  1673. * *
  1674. * The list of HTML predefined entities *
  1675. * *
  1676. ************************************************************************/
  1677. static const htmlEntityDesc html40EntitiesTable[] = {
  1678. /*
  1679. * the 4 absolute ones, plus apostrophe.
  1680. */
  1681. { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
  1682. { 38, "amp", "ampersand, U+0026 ISOnum" },
  1683. { 39, "apos", "single quote" },
  1684. { 60, "lt", "less-than sign, U+003C ISOnum" },
  1685. { 62, "gt", "greater-than sign, U+003E ISOnum" },
  1686. /*
  1687. * A bunch still in the 128-255 range
  1688. * Replacing them depend really on the charset used.
  1689. */
  1690. { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
  1691. { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
  1692. { 162, "cent", "cent sign, U+00A2 ISOnum" },
  1693. { 163, "pound","pound sign, U+00A3 ISOnum" },
  1694. { 164, "curren","currency sign, U+00A4 ISOnum" },
  1695. { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
  1696. { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
  1697. { 167, "sect", "section sign, U+00A7 ISOnum" },
  1698. { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
  1699. { 169, "copy", "copyright sign, U+00A9 ISOnum" },
  1700. { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
  1701. { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
  1702. { 172, "not", "not sign, U+00AC ISOnum" },
  1703. { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
  1704. { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
  1705. { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
  1706. { 176, "deg", "degree sign, U+00B0 ISOnum" },
  1707. { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
  1708. { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
  1709. { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
  1710. { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
  1711. { 181, "micro","micro sign, U+00B5 ISOnum" },
  1712. { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
  1713. { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
  1714. { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
  1715. { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
  1716. { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
  1717. { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
  1718. { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
  1719. { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
  1720. { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
  1721. { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
  1722. { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
  1723. { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
  1724. { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
  1725. { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
  1726. { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
  1727. { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
  1728. { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
  1729. { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
  1730. { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
  1731. { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
  1732. { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
  1733. { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
  1734. { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
  1735. { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
  1736. { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
  1737. { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
  1738. { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
  1739. { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
  1740. { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
  1741. { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
  1742. { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
  1743. { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
  1744. { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
  1745. { 215, "times","multiplication sign, U+00D7 ISOnum" },
  1746. { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
  1747. { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
  1748. { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
  1749. { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
  1750. { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
  1751. { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
  1752. { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
  1753. { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
  1754. { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
  1755. { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
  1756. { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
  1757. { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
  1758. { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
  1759. { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
  1760. { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
  1761. { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
  1762. { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
  1763. { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
  1764. { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
  1765. { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
  1766. { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
  1767. { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
  1768. { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
  1769. { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
  1770. { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
  1771. { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
  1772. { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
  1773. { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
  1774. { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
  1775. { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
  1776. { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
  1777. { 247, "divide","division sign, U+00F7 ISOnum" },
  1778. { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
  1779. { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
  1780. { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
  1781. { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
  1782. { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
  1783. { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
  1784. { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
  1785. { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
  1786. { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
  1787. { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
  1788. { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
  1789. { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
  1790. { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
  1791. /*
  1792. * Anything below should really be kept as entities references
  1793. */
  1794. { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
  1795. { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
  1796. { 732, "tilde","small tilde, U+02DC ISOdia" },
  1797. { 913, "Alpha","greek capital letter alpha, U+0391" },
  1798. { 914, "Beta", "greek capital letter beta, U+0392" },
  1799. { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
  1800. { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
  1801. { 917, "Epsilon","greek capital letter epsilon, U+0395" },
  1802. { 918, "Zeta", "greek capital letter zeta, U+0396" },
  1803. { 919, "Eta", "greek capital letter eta, U+0397" },
  1804. { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
  1805. { 921, "Iota", "greek capital letter iota, U+0399" },
  1806. { 922, "Kappa","greek capital letter kappa, U+039A" },
  1807. { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
  1808. { 924, "Mu", "greek capital letter mu, U+039C" },
  1809. { 925, "Nu", "greek capital letter nu, U+039D" },
  1810. { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
  1811. { 927, "Omicron","greek capital letter omicron, U+039F" },
  1812. { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
  1813. { 929, "Rho", "greek capital letter rho, U+03A1" },
  1814. { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
  1815. { 932, "Tau", "greek capital letter tau, U+03A4" },
  1816. { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
  1817. { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
  1818. { 935, "Chi", "greek capital letter chi, U+03A7" },
  1819. { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
  1820. { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
  1821. { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
  1822. { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
  1823. { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
  1824. { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
  1825. { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
  1826. { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
  1827. { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
  1828. { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
  1829. { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
  1830. { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
  1831. { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
  1832. { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
  1833. { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
  1834. { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
  1835. { 959, "omicron","greek small letter omicron, U+03BF NEW" },
  1836. { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
  1837. { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
  1838. { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
  1839. { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
  1840. { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
  1841. { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
  1842. { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
  1843. { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
  1844. { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
  1845. { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
  1846. { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
  1847. { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
  1848. { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
  1849. { 8194, "ensp", "en space, U+2002 ISOpub" },
  1850. { 8195, "emsp", "em space, U+2003 ISOpub" },
  1851. { 8201, "thinsp","thin space, U+2009 ISOpub" },
  1852. { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
  1853. { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
  1854. { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
  1855. { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
  1856. { 8211, "ndash","en dash, U+2013 ISOpub" },
  1857. { 8212, "mdash","em dash, U+2014 ISOpub" },
  1858. { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
  1859. { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
  1860. { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
  1861. { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
  1862. { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
  1863. { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
  1864. { 8224, "dagger","dagger, U+2020 ISOpub" },
  1865. { 8225, "Dagger","double dagger, U+2021 ISOpub" },
  1866. { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
  1867. { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
  1868. { 8240, "permil","per mille sign, U+2030 ISOtech" },
  1869. { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
  1870. { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
  1871. { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
  1872. { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
  1873. { 8254, "oline","overline = spacing overscore, U+203E NEW" },
  1874. { 8260, "frasl","fraction slash, U+2044 NEW" },
  1875. { 8364, "euro", "euro sign, U+20AC NEW" },
  1876. { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
  1877. { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
  1878. { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
  1879. { 8482, "trade","trade mark sign, U+2122 ISOnum" },
  1880. { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
  1881. { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
  1882. { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
  1883. { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
  1884. { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
  1885. { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
  1886. { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
  1887. { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
  1888. { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
  1889. { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
  1890. { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
  1891. { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
  1892. { 8704, "forall","for all, U+2200 ISOtech" },
  1893. { 8706, "part", "partial differential, U+2202 ISOtech" },
  1894. { 8707, "exist","there exists, U+2203 ISOtech" },
  1895. { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
  1896. { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
  1897. { 8712, "isin", "element of, U+2208 ISOtech" },
  1898. { 8713, "notin","not an element of, U+2209 ISOtech" },
  1899. { 8715, "ni", "contains as member, U+220B ISOtech" },
  1900. { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
  1901. { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
  1902. { 8722, "minus","minus sign, U+2212 ISOtech" },
  1903. { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
  1904. { 8730, "radic","square root = radical sign, U+221A ISOtech" },
  1905. { 8733, "prop", "proportional to, U+221D ISOtech" },
  1906. { 8734, "infin","infinity, U+221E ISOtech" },
  1907. { 8736, "ang", "angle, U+2220 ISOamso" },
  1908. { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
  1909. { 8744, "or", "logical or = vee, U+2228 ISOtech" },
  1910. { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
  1911. { 8746, "cup", "union = cup, U+222A ISOtech" },
  1912. { 8747, "int", "integral, U+222B ISOtech" },
  1913. { 8756, "there4","therefore, U+2234 ISOtech" },
  1914. { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
  1915. { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
  1916. { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
  1917. { 8800, "ne", "not equal to, U+2260 ISOtech" },
  1918. { 8801, "equiv","identical to, U+2261 ISOtech" },
  1919. { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
  1920. { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
  1921. { 8834, "sub", "subset of, U+2282 ISOtech" },
  1922. { 8835, "sup", "superset of, U+2283 ISOtech" },
  1923. { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
  1924. { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
  1925. { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
  1926. { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
  1927. { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
  1928. { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
  1929. { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
  1930. { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
  1931. { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
  1932. { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
  1933. { 8971, "rfloor","right floor, U+230B ISOamsc" },
  1934. { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
  1935. { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
  1936. { 9674, "loz", "lozenge, U+25CA ISOpub" },
  1937. { 9824, "spades","black spade suit, U+2660 ISOpub" },
  1938. { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
  1939. { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
  1940. { 9830, "diams","black diamond suit, U+2666 ISOpub" },
  1941. };
  1942. /************************************************************************
  1943. * *
  1944. * Commodity functions to handle entities *
  1945. * *
  1946. ************************************************************************/
  /*
   * growBuffer:
   *
   * Double the size of the heap buffer @buffer; its current size is kept
   * in the companion variable named <buffer>_size.
   *
   * The expansion needs a parser context called `ctxt` in scope.  When the
   * reallocation fails, the error is reported through htmlErrMemory(), the
   * previous buffer is freed and the enclosing function returns NULL.
   */
  #define growBuffer(buffer) { \
      xmlChar *tmp; \
      buffer##_size = buffer##_size * 2; \
      tmp = (xmlChar *) xmlRealloc(buffer, \
                                   sizeof(xmlChar) * buffer##_size); \
      if (tmp != NULL) { \
          buffer = tmp; \
      } else { \
          htmlErrMemory(ctxt, "growing buffer\n"); \
          xmlFree(buffer); \
          return(NULL); \
      } \
  }
  1961. /**
  1962. * htmlEntityLookup:
  1963. * @name: the entity name
  1964. *
  1965. * Lookup the given entity in EntitiesTable
  1966. *
  1967. * TODO: the linear scan is really ugly, an hash table is really needed.
  1968. *
  1969. * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
  1970. */
  1971. const htmlEntityDesc *
  1972. htmlEntityLookup(const xmlChar *name) {
  1973. unsigned int i;
  1974. for (i = 0;i < (sizeof(html40EntitiesTable)/
  1975. sizeof(html40EntitiesTable[0]));i++) {
  1976. if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
  1977. return((htmlEntityDescPtr) &html40EntitiesTable[i]);
  1978. }
  1979. }
  1980. return(NULL);
  1981. }
  1982. /**
  1983. * htmlEntityValueLookup:
  1984. * @value: the entity's unicode value
  1985. *
  1986. * Lookup the given entity in EntitiesTable
  1987. *
  1988. * TODO: the linear scan is really ugly, an hash table is really needed.
  1989. *
  1990. * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
  1991. */
  1992. const htmlEntityDesc *
  1993. htmlEntityValueLookup(unsigned int value) {
  1994. unsigned int i;
  1995. for (i = 0;i < (sizeof(html40EntitiesTable)/
  1996. sizeof(html40EntitiesTable[0]));i++) {
  1997. if (html40EntitiesTable[i].value >= value) {
  1998. if (html40EntitiesTable[i].value > value)
  1999. break;
  2000. return((htmlEntityDescPtr) &html40EntitiesTable[i]);
  2001. }
  2002. }
  2003. return(NULL);
  2004. }
  2005. /**
  2006. * UTF8ToHtml:
  2007. * @out: a pointer to an array of bytes to store the result
  2008. * @outlen: the length of @out
  2009. * @in: a pointer to an array of UTF-8 chars
  2010. * @inlen: the length of @in
  2011. *
  2012. * Take a block of UTF-8 chars in and try to convert it to an ASCII
  2013. * plus HTML entities block of chars out.
  2014. *
  2015. * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
  2016. * The value of @inlen after return is the number of octets consumed
  2017. * as the return value is positive, else unpredictable.
  2018. * The value of @outlen after return is the number of octets consumed.
  2019. */
  2020. int
  2021. UTF8ToHtml(unsigned char* out, int *outlen,
  2022. const unsigned char* in, int *inlen) {
  2023. const unsigned char* processed = in;
  2024. const unsigned char* outend;
  2025. const unsigned char* outstart = out;
  2026. const unsigned char* instart = in;
  2027. const unsigned char* inend;
  2028. unsigned int c, d;
  2029. int trailing;
  2030. if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
  2031. if (in == NULL) {
  2032. /*
  2033. * initialization nothing to do
  2034. */
  2035. *outlen = 0;
  2036. *inlen = 0;
  2037. return(0);
  2038. }
  2039. inend = in + (*inlen);
  2040. outend = out + (*outlen);
  2041. while (in < inend) {
  2042. d = *in++;
  2043. if (d < 0x80) { c= d; trailing= 0; }
  2044. else if (d < 0xC0) {
  2045. /* trailing byte in leading position */
  2046. *outlen = out - outstart;
  2047. *inlen = processed - instart;
  2048. return(-2);
  2049. } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
  2050. else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
  2051. else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
  2052. else {
  2053. /* no chance for this in Ascii */
  2054. *outlen = out - outstart;
  2055. *inlen = processed - instart;
  2056. return(-2);
  2057. }
  2058. if (inend - in < trailing) {
  2059. break;
  2060. }
  2061. for ( ; trailing; trailing--) {
  2062. if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
  2063. break;
  2064. c <<= 6;
  2065. c |= d & 0x3F;
  2066. }
  2067. /* assertion: c is a single UTF-4 value */
  2068. if (c < 0x80) {
  2069. if (out + 1 >= outend)
  2070. break;
  2071. *out++ = c;
  2072. } else {
  2073. int len;
  2074. const htmlEntityDesc * ent;
  2075. const char *cp;
  2076. char nbuf[16];
  2077. /*
  2078. * Try to lookup a predefined HTML entity for it
  2079. */
  2080. ent = htmlEntityValueLookup(c);
  2081. if (ent == NULL) {
  2082. snprintf(nbuf, sizeof(nbuf), "#%u", c);
  2083. cp = nbuf;
  2084. }
  2085. else
  2086. cp = ent->name;
  2087. len = strlen(cp);
  2088. if (out + 2 + len >= outend)
  2089. break;
  2090. *out++ = '&';
  2091. memcpy(out, cp, len);
  2092. out += len;
  2093. *out++ = ';';
  2094. }
  2095. processed = in;
  2096. }
  2097. *outlen = out - outstart;
  2098. *inlen = processed - instart;
  2099. return(0);
  2100. }
  2101. /**
  2102. * htmlEncodeEntities:
  2103. * @out: a pointer to an array of bytes to store the result
  2104. * @outlen: the length of @out
  2105. * @in: a pointer to an array of UTF-8 chars
  2106. * @inlen: the length of @in
  2107. * @quoteChar: the quote character to escape (' or ") or zero.
  2108. *
  2109. * Take a block of UTF-8 chars in and try to convert it to an ASCII
  2110. * plus HTML entities block of chars out.
  2111. *
  2112. * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
  2113. * The value of @inlen after return is the number of octets consumed
  2114. * as the return value is positive, else unpredictable.
  2115. * The value of @outlen after return is the number of octets consumed.
  2116. */
  2117. int
  2118. htmlEncodeEntities(unsigned char* out, int *outlen,
  2119. const unsigned char* in, int *inlen, int quoteChar) {
  2120. const unsigned char* processed = in;
  2121. const unsigned char* outend;
  2122. const unsigned char* outstart = out;
  2123. const unsigned char* instart = in;
  2124. const unsigned char* inend;
  2125. unsigned int c, d;
  2126. int trailing;
  2127. if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
  2128. return(-1);
  2129. outend = out + (*outlen);
  2130. inend = in + (*inlen);
  2131. while (in < inend) {
  2132. d = *in++;
  2133. if (d < 0x80) { c= d; trailing= 0; }
  2134. else if (d < 0xC0) {
  2135. /* trailing byte in leading position */
  2136. *outlen = out - outstart;
  2137. *inlen = processed - instart;
  2138. return(-2);
  2139. } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
  2140. else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
  2141. else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
  2142. else {
  2143. /* no chance for this in Ascii */
  2144. *outlen = out - outstart;
  2145. *inlen = processed - instart;
  2146. return(-2);
  2147. }
  2148. if (inend - in < trailing)
  2149. break;
  2150. while (trailing--) {
  2151. if (((d= *in++) & 0xC0) != 0x80) {
  2152. *outlen = out - outstart;
  2153. *inlen = processed - instart;
  2154. return(-2);
  2155. }
  2156. c <<= 6;
  2157. c |= d & 0x3F;
  2158. }
  2159. /* assertion: c is a single UTF-4 value */
  2160. if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
  2161. (c != '&') && (c != '<') && (c != '>')) {
  2162. if (out >= outend)
  2163. break;
  2164. *out++ = c;
  2165. } else {
  2166. const htmlEntityDesc * ent;
  2167. const char *cp;
  2168. char nbuf[16];
  2169. int len;
  2170. /*
  2171. * Try to lookup a predefined HTML entity for it
  2172. */
  2173. ent = htmlEntityValueLookup(c);
  2174. if (ent == NULL) {
  2175. snprintf(nbuf, sizeof(nbuf), "#%u", c);
  2176. cp = nbuf;
  2177. }
  2178. else
  2179. cp = ent->name;
  2180. len = strlen(cp);
  2181. if (out + 2 + len > outend)
  2182. break;
  2183. *out++ = '&';
  2184. memcpy(out, cp, len);
  2185. out += len;
  2186. *out++ = ';';
  2187. }
  2188. processed = in;
  2189. }
  2190. *outlen = out - outstart;
  2191. *inlen = processed - instart;
  2192. return(0);
  2193. }
  2194. /************************************************************************
  2195. * *
  2196. * Commodity functions to handle streams *
  2197. * *
  2198. ************************************************************************/
  2199. #ifdef LIBXML_PUSH_ENABLED
  2200. /**
  2201. * htmlNewInputStream:
  2202. * @ctxt: an HTML parser context
  2203. *
  2204. * Create a new input stream structure
  2205. * Returns the new input stream or NULL
  2206. */
  2207. static htmlParserInputPtr
  2208. htmlNewInputStream(htmlParserCtxtPtr ctxt) {
  2209. htmlParserInputPtr input;
  2210. input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
  2211. if (input == NULL) {
  2212. htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
  2213. return(NULL);
  2214. }
  2215. memset(input, 0, sizeof(htmlParserInput));
  2216. input->filename = NULL;
  2217. input->directory = NULL;
  2218. input->base = NULL;
  2219. input->cur = NULL;
  2220. input->buf = NULL;
  2221. input->line = 1;
  2222. input->col = 1;
  2223. input->buf = NULL;
  2224. input->free = NULL;
  2225. input->version = NULL;
  2226. input->consumed = 0;
  2227. input->length = 0;
  2228. return(input);
  2229. }
  2230. #endif
  2231. /************************************************************************
  2232. * *
  2233. * Commodity functions, cleanup needed ? *
  2234. * *
  2235. ************************************************************************/
  /*
   * Names of all the elements allowing PCDATA content according to the
   * HTML 4.01 loose DTD, one line per initial letter.
   *
   * NOTE: it might be more appropriate to integrate this information
   *       into the html40ElementTable array but that would risk a
   *       binary incompatibility.
   */
  static const char *allowPCData[] = {
      "a", "abbr", "acronym", "address", "applet",
      "b", "bdo", "big", "blockquote", "body", "button",
      "caption", "center", "cite", "code",
      "dd", "del", "dfn", "div", "dt",
      "em",
      "font", "form",
      "h1", "h2", "h3", "h4", "h5", "h6",
      "i", "iframe", "ins",
      "kbd",
      "label", "legend", "li",
      "noframes", "noscript",
      "object",
      "p", "pre",
      "q",
      "s", "samp", "small", "span", "strike", "strong",
      "td", "th", "tt",
      "u",
      "var"
  };
  2250. /**
  2251. * areBlanks:
  2252. * @ctxt: an HTML parser context
  2253. * @str: a xmlChar *
  2254. * @len: the size of @str
  2255. *
  2256. * Is this a sequence of blank chars that one can ignore ?
  2257. *
  2258. * Returns 1 if ignorable 0 otherwise.
  2259. */
  2260. static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
  2261. unsigned int i;
  2262. int j;
  2263. xmlNodePtr lastChild;
  2264. xmlDtdPtr dtd;
  2265. for (j = 0;j < len;j++)
  2266. if (!(IS_BLANK_CH(str[j]))) return(0);
  2267. if (CUR == 0) return(1);
  2268. if (CUR != '<') return(0);
  2269. if (ctxt->name == NULL)
  2270. return(1);
  2271. if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
  2272. return(1);
  2273. if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
  2274. return(1);
  2275. /* Only strip CDATA children of the body tag for strict HTML DTDs */
  2276. if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
  2277. dtd = xmlGetIntSubset(ctxt->myDoc);
  2278. if (dtd != NULL && dtd->ExternalID != NULL) {
  2279. if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
  2280. !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
  2281. return(1);
  2282. }
  2283. }
  2284. if (ctxt->node == NULL) return(0);
  2285. lastChild = xmlGetLastChild(ctxt->node);
  2286. while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
  2287. lastChild = lastChild->prev;
  2288. if (lastChild == NULL) {
  2289. if ((ctxt->node->type != XML_ELEMENT_NODE) &&
  2290. (ctxt->node->content != NULL)) return(0);
  2291. /* keep ws in constructs like ...<b> </b>...
  2292. for all tags "b" allowing PCDATA */
  2293. for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
  2294. if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
  2295. return(0);
  2296. }
  2297. }
  2298. } else if (xmlNodeIsText(lastChild)) {
  2299. return(0);
  2300. } else {
  2301. /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
  2302. for all tags "p" allowing PCDATA */
  2303. for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
  2304. if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
  2305. return(0);
  2306. }
  2307. }
  2308. }
  2309. return(1);
  2310. }
  2311. /**
  2312. * htmlNewDocNoDtD:
  2313. * @URI: URI for the dtd, or NULL
  2314. * @ExternalID: the external ID of the DTD, or NULL
  2315. *
  2316. * Creates a new HTML document without a DTD node if @URI and @ExternalID
  2317. * are NULL
  2318. *
  2319. * Returns a new document, do not initialize the DTD if not provided
  2320. */
  2321. htmlDocPtr
  2322. htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
  2323. xmlDocPtr cur;
  2324. /*
  2325. * Allocate a new document and fill the fields.
  2326. */
  2327. cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
  2328. if (cur == NULL) {
  2329. htmlErrMemory(NULL, "HTML document creation failed\n");
  2330. return(NULL);
  2331. }
  2332. memset(cur, 0, sizeof(xmlDoc));
  2333. cur->type = XML_HTML_DOCUMENT_NODE;
  2334. cur->version = NULL;
  2335. cur->intSubset = NULL;
  2336. cur->doc = cur;
  2337. cur->name = NULL;
  2338. cur->children = NULL;
  2339. cur->extSubset = NULL;
  2340. cur->oldNs = NULL;
  2341. cur->encoding = NULL;
  2342. cur->standalone = 1;
  2343. cur->compression = 0;
  2344. cur->ids = NULL;
  2345. cur->refs = NULL;
  2346. cur->_private = NULL;
  2347. cur->charset = XML_CHAR_ENCODING_UTF8;
  2348. cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
  2349. if ((ExternalID != NULL) ||
  2350. (URI != NULL))
  2351. xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
  2352. return(cur);
  2353. }
  2354. /**
  2355. * htmlNewDoc:
  2356. * @URI: URI for the dtd, or NULL
  2357. * @ExternalID: the external ID of the DTD, or NULL
  2358. *
  2359. * Creates a new HTML document
  2360. *
  2361. * Returns a new document
  2362. */
  2363. htmlDocPtr
  2364. htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
  2365. if ((URI == NULL) && (ExternalID == NULL))
  2366. return(htmlNewDocNoDtD(
  2367. BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
  2368. BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
  2369. return(htmlNewDocNoDtD(URI, ExternalID));
  2370. }
/************************************************************************
 *									*
 *			The parser itself				*
 *	Relates to http://www.w3.org/TR/html40				*
 *									*
 ************************************************************************/
  2382. static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
  2383. /**
  2384. * htmlParseHTMLName:
  2385. * @ctxt: an HTML parser context
  2386. *
  2387. * parse an HTML tag or attribute name, note that we convert it to lowercase
  2388. * since HTML names are not case-sensitive.
  2389. *
  2390. * Returns the Tag Name parsed or NULL
  2391. */
  2392. static const xmlChar *
  2393. htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
  2394. int i = 0;
  2395. xmlChar loc[HTML_PARSER_BUFFER_SIZE];
  2396. if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
  2397. (CUR != ':') && (CUR != '.')) return(NULL);
  2398. while ((i < HTML_PARSER_BUFFER_SIZE) &&
  2399. ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
  2400. (CUR == ':') || (CUR == '-') || (CUR == '_') ||
  2401. (CUR == '.'))) {
  2402. if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
  2403. else loc[i] = CUR;
  2404. i++;
  2405. NEXT;
  2406. }
  2407. return(xmlDictLookup(ctxt->dict, loc, i));
  2408. }
  2409. /**
  2410. * htmlParseHTMLName_nonInvasive:
  2411. * @ctxt: an HTML parser context
  2412. *
  2413. * parse an HTML tag or attribute name, note that we convert it to lowercase
  2414. * since HTML names are not case-sensitive, this doesn't consume the data
  2415. * from the stream, it's a look-ahead
  2416. *
  2417. * Returns the Tag Name parsed or NULL
  2418. */
  2419. static const xmlChar *
  2420. htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
  2421. int i = 0;
  2422. xmlChar loc[HTML_PARSER_BUFFER_SIZE];
  2423. if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
  2424. (NXT(1) != ':')) return(NULL);
  2425. while ((i < HTML_PARSER_BUFFER_SIZE) &&
  2426. ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
  2427. (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
  2428. if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
  2429. else loc[i] = NXT(1+i);
  2430. i++;
  2431. }
  2432. return(xmlDictLookup(ctxt->dict, loc, i));
  2433. }
  2434. /**
  2435. * htmlParseName:
  2436. * @ctxt: an HTML parser context
  2437. *
  2438. * parse an HTML name, this routine is case sensitive.
  2439. *
  2440. * Returns the Name parsed or NULL
  2441. */
  2442. static const xmlChar *
  2443. htmlParseName(htmlParserCtxtPtr ctxt) {
  2444. const xmlChar *in;
  2445. const xmlChar *ret;
  2446. int count = 0;
  2447. GROW;
  2448. /*
  2449. * Accelerator for simple ASCII names
  2450. */
  2451. in = ctxt->input->cur;
  2452. if (((*in >= 0x61) && (*in <= 0x7A)) ||
  2453. ((*in >= 0x41) && (*in <= 0x5A)) ||
  2454. (*in == '_') || (*in == ':')) {
  2455. in++;
  2456. while (((*in >= 0x61) && (*in <= 0x7A)) ||
  2457. ((*in >= 0x41) && (*in <= 0x5A)) ||
  2458. ((*in >= 0x30) && (*in <= 0x39)) ||
  2459. (*in == '_') || (*in == '-') ||
  2460. (*in == ':') || (*in == '.'))
  2461. in++;
  2462. if (in == ctxt->input->end)
  2463. return(NULL);
  2464. if ((*in > 0) && (*in < 0x80)) {
  2465. count = in - ctxt->input->cur;
  2466. ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
  2467. ctxt->input->cur = in;
  2468. ctxt->input->col += count;
  2469. return(ret);
  2470. }
  2471. }
  2472. return(htmlParseNameComplex(ctxt));
  2473. }
  2474. static const xmlChar *
  2475. htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
  2476. int len = 0, l;
  2477. int c;
  2478. int count = 0;
  2479. const xmlChar *base = ctxt->input->base;
  2480. /*
  2481. * Handler for more complex cases
  2482. */
  2483. GROW;
  2484. c = CUR_CHAR(l);
  2485. if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
  2486. (!IS_LETTER(c) && (c != '_') &&
  2487. (c != ':'))) {
  2488. return(NULL);
  2489. }
  2490. while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
  2491. ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
  2492. (c == '.') || (c == '-') ||
  2493. (c == '_') || (c == ':') ||
  2494. (IS_COMBINING(c)) ||
  2495. (IS_EXTENDER(c)))) {
  2496. if (count++ > 100) {
  2497. count = 0;
  2498. GROW;
  2499. }
  2500. len += l;
  2501. NEXTL(l);
  2502. c = CUR_CHAR(l);
  2503. if (ctxt->input->base != base) {
  2504. /*
  2505. * We changed encoding from an unknown encoding
  2506. * Input buffer changed location, so we better start again
  2507. */
  2508. return(htmlParseNameComplex(ctxt));
  2509. }
  2510. }
  2511. if (ctxt->input->cur - ctxt->input->base < len) {
  2512. /* Sanity check */
  2513. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  2514. "unexpected change of input buffer", NULL, NULL);
  2515. return (NULL);
  2516. }
  2517. return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
  2518. }
  2519. /**
  2520. * htmlParseHTMLAttribute:
  2521. * @ctxt: an HTML parser context
  2522. * @stop: a char stop value
  2523. *
  2524. * parse an HTML attribute value till the stop (quote), if
  2525. * stop is 0 then it stops at the first space
  2526. *
  2527. * Returns the attribute parsed or NULL
  2528. */
  2529. static xmlChar *
  2530. htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
  2531. xmlChar *buffer = NULL;
  2532. int buffer_size = 0;
  2533. xmlChar *out = NULL;
  2534. const xmlChar *name = NULL;
  2535. const xmlChar *cur = NULL;
  2536. const htmlEntityDesc * ent;
  2537. /*
  2538. * allocate a translation buffer.
  2539. */
  2540. buffer_size = HTML_PARSER_BUFFER_SIZE;
  2541. buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
  2542. if (buffer == NULL) {
  2543. htmlErrMemory(ctxt, "buffer allocation failed\n");
  2544. return(NULL);
  2545. }
  2546. out = buffer;
  2547. /*
  2548. * Ok loop until we reach one of the ending chars
  2549. */
  2550. while ((CUR != 0) && (CUR != stop)) {
  2551. if ((stop == 0) && (CUR == '>')) break;
  2552. if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
  2553. if (CUR == '&') {
  2554. if (NXT(1) == '#') {
  2555. unsigned int c;
  2556. int bits;
  2557. c = htmlParseCharRef(ctxt);
  2558. if (c < 0x80)
  2559. { *out++ = c; bits= -6; }
  2560. else if (c < 0x800)
  2561. { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  2562. else if (c < 0x10000)
  2563. { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  2564. else
  2565. { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
  2566. for ( ; bits >= 0; bits-= 6) {
  2567. *out++ = ((c >> bits) & 0x3F) | 0x80;
  2568. }
  2569. if (out - buffer > buffer_size - 100) {
  2570. int indx = out - buffer;
  2571. growBuffer(buffer);
  2572. out = &buffer[indx];
  2573. }
  2574. } else {
  2575. ent = htmlParseEntityRef(ctxt, &name);
  2576. if (name == NULL) {
  2577. *out++ = '&';
  2578. if (out - buffer > buffer_size - 100) {
  2579. int indx = out - buffer;
  2580. growBuffer(buffer);
  2581. out = &buffer[indx];
  2582. }
  2583. } else if (ent == NULL) {
  2584. *out++ = '&';
  2585. cur = name;
  2586. while (*cur != 0) {
  2587. if (out - buffer > buffer_size - 100) {
  2588. int indx = out - buffer;
  2589. growBuffer(buffer);
  2590. out = &buffer[indx];
  2591. }
  2592. *out++ = *cur++;
  2593. }
  2594. } else {
  2595. unsigned int c;
  2596. int bits;
  2597. if (out - buffer > buffer_size - 100) {
  2598. int indx = out - buffer;
  2599. growBuffer(buffer);
  2600. out = &buffer[indx];
  2601. }
  2602. c = ent->value;
  2603. if (c < 0x80)
  2604. { *out++ = c; bits= -6; }
  2605. else if (c < 0x800)
  2606. { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  2607. else if (c < 0x10000)
  2608. { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  2609. else
  2610. { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
  2611. for ( ; bits >= 0; bits-= 6) {
  2612. *out++ = ((c >> bits) & 0x3F) | 0x80;
  2613. }
  2614. }
  2615. }
  2616. } else {
  2617. unsigned int c;
  2618. int bits, l;
  2619. if (out - buffer > buffer_size - 100) {
  2620. int indx = out - buffer;
  2621. growBuffer(buffer);
  2622. out = &buffer[indx];
  2623. }
  2624. c = CUR_CHAR(l);
  2625. if (c < 0x80)
  2626. { *out++ = c; bits= -6; }
  2627. else if (c < 0x800)
  2628. { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  2629. else if (c < 0x10000)
  2630. { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  2631. else
  2632. { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
  2633. for ( ; bits >= 0; bits-= 6) {
  2634. *out++ = ((c >> bits) & 0x3F) | 0x80;
  2635. }
  2636. NEXT;
  2637. }
  2638. }
  2639. *out = 0;
  2640. return(buffer);
  2641. }
  2642. /**
  2643. * htmlParseEntityRef:
  2644. * @ctxt: an HTML parser context
  2645. * @str: location to store the entity name
  2646. *
  2647. * parse an HTML ENTITY references
  2648. *
  2649. * [68] EntityRef ::= '&' Name ';'
  2650. *
  2651. * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
  2652. * if non-NULL *str will have to be freed by the caller.
  2653. */
  2654. const htmlEntityDesc *
  2655. htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
  2656. const xmlChar *name;
  2657. const htmlEntityDesc * ent = NULL;
  2658. if (str != NULL) *str = NULL;
  2659. if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
  2660. if (CUR == '&') {
  2661. NEXT;
  2662. name = htmlParseName(ctxt);
  2663. if (name == NULL) {
  2664. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  2665. "htmlParseEntityRef: no name\n", NULL, NULL);
  2666. } else {
  2667. GROW;
  2668. if (CUR == ';') {
  2669. if (str != NULL)
  2670. *str = name;
  2671. /*
  2672. * Lookup the entity in the table.
  2673. */
  2674. ent = htmlEntityLookup(name);
  2675. if (ent != NULL) /* OK that's ugly !!! */
  2676. NEXT;
  2677. } else {
  2678. htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
  2679. "htmlParseEntityRef: expecting ';'\n",
  2680. NULL, NULL);
  2681. if (str != NULL)
  2682. *str = name;
  2683. }
  2684. }
  2685. }
  2686. return(ent);
  2687. }
  2688. /**
  2689. * htmlParseAttValue:
  2690. * @ctxt: an HTML parser context
  2691. *
  2692. * parse a value for an attribute
  2693. * Note: the parser won't do substitution of entities here, this
  2694. * will be handled later in xmlStringGetNodeList, unless it was
  2695. * asked for ctxt->replaceEntities != 0
  2696. *
  2697. * Returns the AttValue parsed or NULL.
  2698. */
  2699. static xmlChar *
  2700. htmlParseAttValue(htmlParserCtxtPtr ctxt) {
  2701. xmlChar *ret = NULL;
  2702. if (CUR == '"') {
  2703. NEXT;
  2704. ret = htmlParseHTMLAttribute(ctxt, '"');
  2705. if (CUR != '"') {
  2706. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
  2707. "AttValue: \" expected\n", NULL, NULL);
  2708. } else
  2709. NEXT;
  2710. } else if (CUR == '\'') {
  2711. NEXT;
  2712. ret = htmlParseHTMLAttribute(ctxt, '\'');
  2713. if (CUR != '\'') {
  2714. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
  2715. "AttValue: ' expected\n", NULL, NULL);
  2716. } else
  2717. NEXT;
  2718. } else {
  2719. /*
  2720. * That's an HTMLism, the attribute value may not be quoted
  2721. */
  2722. ret = htmlParseHTMLAttribute(ctxt, 0);
  2723. if (ret == NULL) {
  2724. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
  2725. "AttValue: no value found\n", NULL, NULL);
  2726. }
  2727. }
  2728. return(ret);
  2729. }
  2730. /**
  2731. * htmlParseSystemLiteral:
  2732. * @ctxt: an HTML parser context
  2733. *
  2734. * parse an HTML Literal
  2735. *
  2736. * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
  2737. *
  2738. * Returns the SystemLiteral parsed or NULL
  2739. */
  2740. static xmlChar *
  2741. htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
  2742. size_t len = 0, startPosition = 0;
  2743. int err = 0;
  2744. int quote;
  2745. xmlChar *ret = NULL;
  2746. if ((CUR != '"') && (CUR != '\'')) {
  2747. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
  2748. "SystemLiteral \" or ' expected\n", NULL, NULL);
  2749. return(NULL);
  2750. }
  2751. quote = CUR;
  2752. NEXT;
  2753. if (CUR_PTR < BASE_PTR)
  2754. return(ret);
  2755. startPosition = CUR_PTR - BASE_PTR;
  2756. while ((CUR != 0) && (CUR != quote)) {
  2757. /* TODO: Handle UTF-8 */
  2758. if (!IS_CHAR_CH(CUR)) {
  2759. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  2760. "Invalid char in SystemLiteral 0x%X\n", CUR);
  2761. err = 1;
  2762. }
  2763. NEXT;
  2764. len++;
  2765. }
  2766. if (CUR != quote) {
  2767. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
  2768. "Unfinished SystemLiteral\n", NULL, NULL);
  2769. } else {
  2770. NEXT;
  2771. if (err == 0)
  2772. ret = xmlStrndup((BASE_PTR+startPosition), len);
  2773. }
  2774. return(ret);
  2775. }
  2776. /**
  2777. * htmlParsePubidLiteral:
  2778. * @ctxt: an HTML parser context
  2779. *
  2780. * parse an HTML public literal
  2781. *
  2782. * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
  2783. *
  2784. * Returns the PubidLiteral parsed or NULL.
  2785. */
  2786. static xmlChar *
  2787. htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
  2788. size_t len = 0, startPosition = 0;
  2789. int err = 0;
  2790. int quote;
  2791. xmlChar *ret = NULL;
  2792. if ((CUR != '"') && (CUR != '\'')) {
  2793. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
  2794. "PubidLiteral \" or ' expected\n", NULL, NULL);
  2795. return(NULL);
  2796. }
  2797. quote = CUR;
  2798. NEXT;
  2799. /*
  2800. * Name ::= (Letter | '_') (NameChar)*
  2801. */
  2802. if (CUR_PTR < BASE_PTR)
  2803. return(ret);
  2804. startPosition = CUR_PTR - BASE_PTR;
  2805. while ((CUR != 0) && (CUR != quote)) {
  2806. if (!IS_PUBIDCHAR_CH(CUR)) {
  2807. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  2808. "Invalid char in PubidLiteral 0x%X\n", CUR);
  2809. err = 1;
  2810. }
  2811. len++;
  2812. NEXT;
  2813. }
  2814. if (CUR != '"') {
  2815. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
  2816. "Unfinished PubidLiteral\n", NULL, NULL);
  2817. } else {
  2818. NEXT;
  2819. if (err == 0)
  2820. ret = xmlStrndup((BASE_PTR + startPosition), len);
  2821. }
  2822. return(ret);
  2823. }
  2824. /**
  2825. * htmlParseScript:
  2826. * @ctxt: an HTML parser context
  2827. *
  2828. * parse the content of an HTML SCRIPT or STYLE element
  2829. * http://www.w3.org/TR/html4/sgml/dtd.html#Script
  2830. * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
  2831. * http://www.w3.org/TR/html4/types.html#type-script
  2832. * http://www.w3.org/TR/html4/types.html#h-6.15
  2833. * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
  2834. *
  2835. * Script data ( %Script; in the DTD) can be the content of the SCRIPT
  2836. * element and the value of intrinsic event attributes. User agents must
  2837. * not evaluate script data as HTML markup but instead must pass it on as
  2838. * data to a script engine.
  2839. * NOTES:
  2840. * - The content is passed like CDATA
  2841. * - the attributes for style and scripting "onXXX" are also described
  2842. * as CDATA but SGML allows entities references in attributes so their
  2843. * processing is identical as other attributes
  2844. */
  2845. static void
  2846. htmlParseScript(htmlParserCtxtPtr ctxt) {
  2847. xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
  2848. int nbchar = 0;
  2849. int cur,l;
  2850. SHRINK;
  2851. cur = CUR_CHAR(l);
  2852. while (cur != 0) {
  2853. if ((cur == '<') && (NXT(1) == '/')) {
  2854. /*
  2855. * One should break here, the specification is clear:
  2856. * Authors should therefore escape "</" within the content.
  2857. * Escape mechanisms are specific to each scripting or
  2858. * style sheet language.
  2859. *
  2860. * In recovery mode, only break if end tag match the
  2861. * current tag, effectively ignoring all tags inside the
  2862. * script/style block and treating the entire block as
  2863. * CDATA.
  2864. */
  2865. if (ctxt->recovery) {
  2866. if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
  2867. xmlStrlen(ctxt->name)) == 0)
  2868. {
  2869. break; /* while */
  2870. } else {
  2871. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  2872. "Element %s embeds close tag\n",
  2873. ctxt->name, NULL);
  2874. }
  2875. } else {
  2876. if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
  2877. ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
  2878. {
  2879. break; /* while */
  2880. }
  2881. }
  2882. }
  2883. if (IS_CHAR(cur)) {
  2884. COPY_BUF(l,buf,nbchar,cur);
  2885. } else {
  2886. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  2887. "Invalid char in CDATA 0x%X\n", cur);
  2888. }
  2889. if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
  2890. buf[nbchar] = 0;
  2891. if (ctxt->sax->cdataBlock!= NULL) {
  2892. /*
  2893. * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
  2894. */
  2895. ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
  2896. } else if (ctxt->sax->characters != NULL) {
  2897. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2898. }
  2899. nbchar = 0;
  2900. }
  2901. GROW;
  2902. NEXTL(l);
  2903. cur = CUR_CHAR(l);
  2904. }
  2905. if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
  2906. buf[nbchar] = 0;
  2907. if (ctxt->sax->cdataBlock!= NULL) {
  2908. /*
  2909. * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
  2910. */
  2911. ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
  2912. } else if (ctxt->sax->characters != NULL) {
  2913. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2914. }
  2915. }
  2916. }
  2917. /**
  2918. * htmlParseCharDataInternal:
  2919. * @ctxt: an HTML parser context
  2920. * @readahead: optional read ahead character in ascii range
  2921. *
  2922. * parse a CharData section.
  2923. * if we are within a CDATA section ']]>' marks an end of section.
  2924. *
  2925. * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
  2926. */
  2927. static void
  2928. htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
  2929. xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
  2930. int nbchar = 0;
  2931. int cur, l;
  2932. int chunk = 0;
  2933. if (readahead)
  2934. buf[nbchar++] = readahead;
  2935. SHRINK;
  2936. cur = CUR_CHAR(l);
  2937. while (((cur != '<') || (ctxt->token == '<')) &&
  2938. ((cur != '&') || (ctxt->token == '&')) &&
  2939. (cur != 0)) {
  2940. if (!(IS_CHAR(cur))) {
  2941. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  2942. "Invalid char in CDATA 0x%X\n", cur);
  2943. } else {
  2944. COPY_BUF(l,buf,nbchar,cur);
  2945. }
  2946. if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
  2947. buf[nbchar] = 0;
  2948. /*
  2949. * Ok the segment is to be consumed as chars.
  2950. */
  2951. if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
  2952. if (areBlanks(ctxt, buf, nbchar)) {
  2953. if (ctxt->keepBlanks) {
  2954. if (ctxt->sax->characters != NULL)
  2955. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2956. } else {
  2957. if (ctxt->sax->ignorableWhitespace != NULL)
  2958. ctxt->sax->ignorableWhitespace(ctxt->userData,
  2959. buf, nbchar);
  2960. }
  2961. } else {
  2962. htmlCheckParagraph(ctxt);
  2963. if (ctxt->sax->characters != NULL)
  2964. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2965. }
  2966. }
  2967. nbchar = 0;
  2968. }
  2969. NEXTL(l);
  2970. chunk++;
  2971. if (chunk > HTML_PARSER_BUFFER_SIZE) {
  2972. chunk = 0;
  2973. SHRINK;
  2974. GROW;
  2975. }
  2976. cur = CUR_CHAR(l);
  2977. if (cur == 0) {
  2978. SHRINK;
  2979. GROW;
  2980. cur = CUR_CHAR(l);
  2981. }
  2982. }
  2983. if (nbchar != 0) {
  2984. buf[nbchar] = 0;
  2985. /*
  2986. * Ok the segment is to be consumed as chars.
  2987. */
  2988. if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
  2989. if (areBlanks(ctxt, buf, nbchar)) {
  2990. if (ctxt->keepBlanks) {
  2991. if (ctxt->sax->characters != NULL)
  2992. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2993. } else {
  2994. if (ctxt->sax->ignorableWhitespace != NULL)
  2995. ctxt->sax->ignorableWhitespace(ctxt->userData,
  2996. buf, nbchar);
  2997. }
  2998. } else {
  2999. htmlCheckParagraph(ctxt);
  3000. if (ctxt->sax->characters != NULL)
  3001. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  3002. }
  3003. }
  3004. } else {
  3005. /*
  3006. * Loop detection
  3007. */
  3008. if (cur == 0)
  3009. ctxt->instate = XML_PARSER_EOF;
  3010. }
  3011. }
  3012. /**
  3013. * htmlParseCharData:
  3014. * @ctxt: an HTML parser context
  3015. *
  3016. * parse a CharData section.
  3017. * if we are within a CDATA section ']]>' marks an end of section.
  3018. *
  3019. * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
  3020. */
  3021. static void
  3022. htmlParseCharData(htmlParserCtxtPtr ctxt) {
  3023. htmlParseCharDataInternal(ctxt, 0);
  3024. }
  3025. /**
  3026. * htmlParseExternalID:
  3027. * @ctxt: an HTML parser context
  3028. * @publicID: a xmlChar** receiving PubidLiteral
  3029. *
  3030. * Parse an External ID or a Public ID
  3031. *
  3032. * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
  3033. * | 'PUBLIC' S PubidLiteral S SystemLiteral
  3034. *
  3035. * [83] PublicID ::= 'PUBLIC' S PubidLiteral
  3036. *
  3037. * Returns the function returns SystemLiteral and in the second
  3038. * case publicID receives PubidLiteral, is strict is off
  3039. * it is possible to return NULL and have publicID set.
  3040. */
  3041. static xmlChar *
  3042. htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
  3043. xmlChar *URI = NULL;
  3044. if ((UPPER == 'S') && (UPP(1) == 'Y') &&
  3045. (UPP(2) == 'S') && (UPP(3) == 'T') &&
  3046. (UPP(4) == 'E') && (UPP(5) == 'M')) {
  3047. SKIP(6);
  3048. if (!IS_BLANK_CH(CUR)) {
  3049. htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
  3050. "Space required after 'SYSTEM'\n", NULL, NULL);
  3051. }
  3052. SKIP_BLANKS;
  3053. URI = htmlParseSystemLiteral(ctxt);
  3054. if (URI == NULL) {
  3055. htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
  3056. "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
  3057. }
  3058. } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
  3059. (UPP(2) == 'B') && (UPP(3) == 'L') &&
  3060. (UPP(4) == 'I') && (UPP(5) == 'C')) {
  3061. SKIP(6);
  3062. if (!IS_BLANK_CH(CUR)) {
  3063. htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
  3064. "Space required after 'PUBLIC'\n", NULL, NULL);
  3065. }
  3066. SKIP_BLANKS;
  3067. *publicID = htmlParsePubidLiteral(ctxt);
  3068. if (*publicID == NULL) {
  3069. htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
  3070. "htmlParseExternalID: PUBLIC, no Public Identifier\n",
  3071. NULL, NULL);
  3072. }
  3073. SKIP_BLANKS;
  3074. if ((CUR == '"') || (CUR == '\'')) {
  3075. URI = htmlParseSystemLiteral(ctxt);
  3076. }
  3077. }
  3078. return(URI);
  3079. }
  3080. /**
  3081. * xmlParsePI:
  3082. * @ctxt: an XML parser context
  3083. *
  3084. * parse an XML Processing Instruction.
  3085. *
  3086. * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
  3087. */
  3088. static void
  3089. htmlParsePI(htmlParserCtxtPtr ctxt) {
  3090. xmlChar *buf = NULL;
  3091. int len = 0;
  3092. int size = HTML_PARSER_BUFFER_SIZE;
  3093. int cur, l;
  3094. const xmlChar *target;
  3095. xmlParserInputState state;
  3096. int count = 0;
  3097. if ((RAW == '<') && (NXT(1) == '?')) {
  3098. state = ctxt->instate;
  3099. ctxt->instate = XML_PARSER_PI;
  3100. /*
  3101. * this is a Processing Instruction.
  3102. */
  3103. SKIP(2);
  3104. SHRINK;
  3105. /*
  3106. * Parse the target name and check for special support like
  3107. * namespace.
  3108. */
  3109. target = htmlParseName(ctxt);
  3110. if (target != NULL) {
  3111. if (RAW == '>') {
  3112. SKIP(1);
  3113. /*
  3114. * SAX: PI detected.
  3115. */
  3116. if ((ctxt->sax) && (!ctxt->disableSAX) &&
  3117. (ctxt->sax->processingInstruction != NULL))
  3118. ctxt->sax->processingInstruction(ctxt->userData,
  3119. target, NULL);
  3120. ctxt->instate = state;
  3121. return;
  3122. }
  3123. buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
  3124. if (buf == NULL) {
  3125. htmlErrMemory(ctxt, NULL);
  3126. ctxt->instate = state;
  3127. return;
  3128. }
  3129. cur = CUR;
  3130. if (!IS_BLANK(cur)) {
  3131. htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
  3132. "ParsePI: PI %s space expected\n", target, NULL);
  3133. }
  3134. SKIP_BLANKS;
  3135. cur = CUR_CHAR(l);
  3136. while ((cur != 0) && (cur != '>')) {
  3137. if (len + 5 >= size) {
  3138. xmlChar *tmp;
  3139. size *= 2;
  3140. tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
  3141. if (tmp == NULL) {
  3142. htmlErrMemory(ctxt, NULL);
  3143. xmlFree(buf);
  3144. ctxt->instate = state;
  3145. return;
  3146. }
  3147. buf = tmp;
  3148. }
  3149. count++;
  3150. if (count > 50) {
  3151. GROW;
  3152. count = 0;
  3153. }
  3154. if (IS_CHAR(cur)) {
  3155. COPY_BUF(l,buf,len,cur);
  3156. } else {
  3157. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  3158. "Invalid char in processing instruction "
  3159. "0x%X\n", cur);
  3160. }
  3161. NEXTL(l);
  3162. cur = CUR_CHAR(l);
  3163. if (cur == 0) {
  3164. SHRINK;
  3165. GROW;
  3166. cur = CUR_CHAR(l);
  3167. }
  3168. }
  3169. buf[len] = 0;
  3170. if (cur != '>') {
  3171. htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
  3172. "ParsePI: PI %s never end ...\n", target, NULL);
  3173. } else {
  3174. SKIP(1);
  3175. /*
  3176. * SAX: PI detected.
  3177. */
  3178. if ((ctxt->sax) && (!ctxt->disableSAX) &&
  3179. (ctxt->sax->processingInstruction != NULL))
  3180. ctxt->sax->processingInstruction(ctxt->userData,
  3181. target, buf);
  3182. }
  3183. xmlFree(buf);
  3184. } else {
  3185. htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
  3186. "PI is not started correctly", NULL, NULL);
  3187. }
  3188. ctxt->instate = state;
  3189. }
  3190. }
  3191. /**
  3192. * htmlParseComment:
  3193. * @ctxt: an HTML parser context
  3194. *
  3195. * Parse an XML (SGML) comment <!-- .... -->
  3196. *
  3197. * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
  3198. */
  3199. static void
  3200. htmlParseComment(htmlParserCtxtPtr ctxt) {
  3201. xmlChar *buf = NULL;
  3202. int len;
  3203. int size = HTML_PARSER_BUFFER_SIZE;
  3204. int q, ql;
  3205. int r, rl;
  3206. int cur, l;
  3207. int next, nl;
  3208. xmlParserInputState state;
  3209. /*
  3210. * Check that there is a comment right here.
  3211. */
  3212. if ((RAW != '<') || (NXT(1) != '!') ||
  3213. (NXT(2) != '-') || (NXT(3) != '-')) return;
  3214. state = ctxt->instate;
  3215. ctxt->instate = XML_PARSER_COMMENT;
  3216. SHRINK;
  3217. SKIP(4);
  3218. buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
  3219. if (buf == NULL) {
  3220. htmlErrMemory(ctxt, "buffer allocation failed\n");
  3221. ctxt->instate = state;
  3222. return;
  3223. }
  3224. len = 0;
  3225. buf[len] = 0;
  3226. q = CUR_CHAR(ql);
  3227. if (q == 0)
  3228. goto unfinished;
  3229. NEXTL(ql);
  3230. r = CUR_CHAR(rl);
  3231. if (r == 0)
  3232. goto unfinished;
  3233. NEXTL(rl);
  3234. cur = CUR_CHAR(l);
  3235. while ((cur != 0) &&
  3236. ((cur != '>') ||
  3237. (r != '-') || (q != '-'))) {
  3238. NEXTL(l);
  3239. next = CUR_CHAR(nl);
  3240. if (next == 0) {
  3241. SHRINK;
  3242. GROW;
  3243. next = CUR_CHAR(nl);
  3244. }
  3245. if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
  3246. htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
  3247. "Comment incorrectly closed by '--!>'", NULL, NULL);
  3248. cur = '>';
  3249. break;
  3250. }
  3251. if (len + 5 >= size) {
  3252. xmlChar *tmp;
  3253. size *= 2;
  3254. tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
  3255. if (tmp == NULL) {
  3256. xmlFree(buf);
  3257. htmlErrMemory(ctxt, "growing buffer failed\n");
  3258. ctxt->instate = state;
  3259. return;
  3260. }
  3261. buf = tmp;
  3262. }
  3263. if (IS_CHAR(q)) {
  3264. COPY_BUF(ql,buf,len,q);
  3265. } else {
  3266. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  3267. "Invalid char in comment 0x%X\n", q);
  3268. }
  3269. q = r;
  3270. ql = rl;
  3271. r = cur;
  3272. rl = l;
  3273. cur = next;
  3274. l = nl;
  3275. }
  3276. buf[len] = 0;
  3277. if (cur == '>') {
  3278. NEXT;
  3279. if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
  3280. (!ctxt->disableSAX))
  3281. ctxt->sax->comment(ctxt->userData, buf);
  3282. xmlFree(buf);
  3283. ctxt->instate = state;
  3284. return;
  3285. }
  3286. unfinished:
  3287. htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
  3288. "Comment not terminated \n<!--%.50s\n", buf, NULL);
  3289. xmlFree(buf);
  3290. }
  3291. /**
  3292. * htmlParseCharRef:
  3293. * @ctxt: an HTML parser context
  3294. *
  3295. * parse Reference declarations
  3296. *
  3297. * [66] CharRef ::= '&#' [0-9]+ ';' |
  3298. * '&#x' [0-9a-fA-F]+ ';'
  3299. *
  3300. * Returns the value parsed (as an int)
  3301. */
  3302. int
  3303. htmlParseCharRef(htmlParserCtxtPtr ctxt) {
  3304. int val = 0;
  3305. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  3306. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3307. "htmlParseCharRef: context error\n",
  3308. NULL, NULL);
  3309. return(0);
  3310. }
  3311. if ((CUR == '&') && (NXT(1) == '#') &&
  3312. ((NXT(2) == 'x') || NXT(2) == 'X')) {
  3313. SKIP(3);
  3314. while (CUR != ';') {
  3315. if ((CUR >= '0') && (CUR <= '9')) {
  3316. if (val < 0x110000)
  3317. val = val * 16 + (CUR - '0');
  3318. } else if ((CUR >= 'a') && (CUR <= 'f')) {
  3319. if (val < 0x110000)
  3320. val = val * 16 + (CUR - 'a') + 10;
  3321. } else if ((CUR >= 'A') && (CUR <= 'F')) {
  3322. if (val < 0x110000)
  3323. val = val * 16 + (CUR - 'A') + 10;
  3324. } else {
  3325. htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
  3326. "htmlParseCharRef: missing semicolon\n",
  3327. NULL, NULL);
  3328. break;
  3329. }
  3330. NEXT;
  3331. }
  3332. if (CUR == ';')
  3333. NEXT;
  3334. } else if ((CUR == '&') && (NXT(1) == '#')) {
  3335. SKIP(2);
  3336. while (CUR != ';') {
  3337. if ((CUR >= '0') && (CUR <= '9')) {
  3338. if (val < 0x110000)
  3339. val = val * 10 + (CUR - '0');
  3340. } else {
  3341. htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
  3342. "htmlParseCharRef: missing semicolon\n",
  3343. NULL, NULL);
  3344. break;
  3345. }
  3346. NEXT;
  3347. }
  3348. if (CUR == ';')
  3349. NEXT;
  3350. } else {
  3351. htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
  3352. "htmlParseCharRef: invalid value\n", NULL, NULL);
  3353. }
  3354. /*
  3355. * Check the value IS_CHAR ...
  3356. */
  3357. if (IS_CHAR(val)) {
  3358. return(val);
  3359. } else if (val >= 0x110000) {
  3360. htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
  3361. "htmlParseCharRef: value too large\n", NULL, NULL);
  3362. } else {
  3363. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  3364. "htmlParseCharRef: invalid xmlChar value %d\n",
  3365. val);
  3366. }
  3367. return(0);
  3368. }
  3369. /**
  3370. * htmlParseDocTypeDecl:
  3371. * @ctxt: an HTML parser context
  3372. *
  3373. * parse a DOCTYPE declaration
  3374. *
  3375. * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
  3376. * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
  3377. */
  3378. static void
  3379. htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
  3380. const xmlChar *name;
  3381. xmlChar *ExternalID = NULL;
  3382. xmlChar *URI = NULL;
  3383. /*
  3384. * We know that '<!DOCTYPE' has been detected.
  3385. */
  3386. SKIP(9);
  3387. SKIP_BLANKS;
  3388. /*
  3389. * Parse the DOCTYPE name.
  3390. */
  3391. name = htmlParseName(ctxt);
  3392. if (name == NULL) {
  3393. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  3394. "htmlParseDocTypeDecl : no DOCTYPE name !\n",
  3395. NULL, NULL);
  3396. }
  3397. /*
  3398. * Check that upper(name) == "HTML" !!!!!!!!!!!!!
  3399. */
  3400. SKIP_BLANKS;
  3401. /*
  3402. * Check for SystemID and ExternalID
  3403. */
  3404. URI = htmlParseExternalID(ctxt, &ExternalID);
  3405. SKIP_BLANKS;
  3406. /*
  3407. * We should be at the end of the DOCTYPE declaration.
  3408. */
  3409. if (CUR != '>') {
  3410. htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
  3411. "DOCTYPE improperly terminated\n", NULL, NULL);
  3412. /* Ignore bogus content */
  3413. while ((CUR != 0) && (CUR != '>'))
  3414. NEXT;
  3415. }
  3416. if (CUR == '>')
  3417. NEXT;
  3418. /*
  3419. * Create or update the document accordingly to the DOCTYPE
  3420. */
  3421. if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
  3422. (!ctxt->disableSAX))
  3423. ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
  3424. /*
  3425. * Cleanup, since we don't use all those identifiers
  3426. */
  3427. if (URI != NULL) xmlFree(URI);
  3428. if (ExternalID != NULL) xmlFree(ExternalID);
  3429. }
  3430. /**
  3431. * htmlParseAttribute:
  3432. * @ctxt: an HTML parser context
  3433. * @value: a xmlChar ** used to store the value of the attribute
  3434. *
  3435. * parse an attribute
  3436. *
  3437. * [41] Attribute ::= Name Eq AttValue
  3438. *
  3439. * [25] Eq ::= S? '=' S?
  3440. *
  3441. * With namespace:
  3442. *
  3443. * [NS 11] Attribute ::= QName Eq AttValue
  3444. *
  3445. * Also the case QName == xmlns:??? is handled independently as a namespace
  3446. * definition.
  3447. *
  3448. * Returns the attribute name, and the value in *value.
  3449. */
  3450. static const xmlChar *
  3451. htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
  3452. const xmlChar *name;
  3453. xmlChar *val = NULL;
  3454. *value = NULL;
  3455. name = htmlParseHTMLName(ctxt);
  3456. if (name == NULL) {
  3457. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  3458. "error parsing attribute name\n", NULL, NULL);
  3459. return(NULL);
  3460. }
  3461. /*
  3462. * read the value
  3463. */
  3464. SKIP_BLANKS;
  3465. if (CUR == '=') {
  3466. NEXT;
  3467. SKIP_BLANKS;
  3468. val = htmlParseAttValue(ctxt);
  3469. }
  3470. *value = val;
  3471. return(name);
  3472. }
  3473. /**
  3474. * htmlCheckEncodingDirect:
  3475. * @ctxt: an HTML parser context
  3476. * @attvalue: the attribute value
  3477. *
  3478. * Checks an attribute value to detect
  3479. * the encoding
  3480. * If a new encoding is detected the parser is switched to decode
  3481. * it and pass UTF8
  3482. */
  3483. static void
  3484. htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
  3485. if ((ctxt == NULL) || (encoding == NULL) ||
  3486. (ctxt->options & HTML_PARSE_IGNORE_ENC))
  3487. return;
  3488. /* do not change encoding */
  3489. if (ctxt->input->encoding != NULL)
  3490. return;
  3491. if (encoding != NULL) {
  3492. xmlCharEncoding enc;
  3493. xmlCharEncodingHandlerPtr handler;
  3494. while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
  3495. if (ctxt->input->encoding != NULL)
  3496. xmlFree((xmlChar *) ctxt->input->encoding);
  3497. ctxt->input->encoding = xmlStrdup(encoding);
  3498. enc = xmlParseCharEncoding((const char *) encoding);
  3499. /*
  3500. * registered set of known encodings
  3501. */
  3502. if (enc != XML_CHAR_ENCODING_ERROR) {
  3503. if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
  3504. (enc == XML_CHAR_ENCODING_UTF16BE) ||
  3505. (enc == XML_CHAR_ENCODING_UCS4LE) ||
  3506. (enc == XML_CHAR_ENCODING_UCS4BE)) &&
  3507. (ctxt->input->buf != NULL) &&
  3508. (ctxt->input->buf->encoder == NULL)) {
  3509. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  3510. "htmlCheckEncoding: wrong encoding meta\n",
  3511. NULL, NULL);
  3512. } else {
  3513. xmlSwitchEncoding(ctxt, enc);
  3514. }
  3515. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  3516. } else {
  3517. /*
  3518. * fallback for unknown encodings
  3519. */
  3520. handler = xmlFindCharEncodingHandler((const char *) encoding);
  3521. if (handler != NULL) {
  3522. xmlSwitchToEncoding(ctxt, handler);
  3523. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  3524. } else {
  3525. htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  3526. "htmlCheckEncoding: unknown encoding %s\n",
  3527. encoding, NULL);
  3528. }
  3529. }
  3530. if ((ctxt->input->buf != NULL) &&
  3531. (ctxt->input->buf->encoder != NULL) &&
  3532. (ctxt->input->buf->raw != NULL) &&
  3533. (ctxt->input->buf->buffer != NULL)) {
  3534. int nbchars;
  3535. int processed;
  3536. /*
  3537. * convert as much as possible to the parser reading buffer.
  3538. */
  3539. processed = ctxt->input->cur - ctxt->input->base;
  3540. xmlBufShrink(ctxt->input->buf->buffer, processed);
  3541. nbchars = xmlCharEncInput(ctxt->input->buf, 1);
  3542. xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
  3543. if (nbchars < 0) {
  3544. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  3545. "htmlCheckEncoding: encoder error\n",
  3546. NULL, NULL);
  3547. }
  3548. }
  3549. }
  3550. }
  3551. /**
  3552. * htmlCheckEncoding:
  3553. * @ctxt: an HTML parser context
  3554. * @attvalue: the attribute value
  3555. *
  3556. * Checks an http-equiv attribute from a Meta tag to detect
  3557. * the encoding
  3558. * If a new encoding is detected the parser is switched to decode
  3559. * it and pass UTF8
  3560. */
  3561. static void
  3562. htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
  3563. const xmlChar *encoding;
  3564. if (!attvalue)
  3565. return;
  3566. encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
  3567. if (encoding != NULL) {
  3568. encoding += 7;
  3569. }
  3570. /*
  3571. * skip blank
  3572. */
  3573. if (encoding && IS_BLANK_CH(*encoding))
  3574. encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
  3575. if (encoding && *encoding == '=') {
  3576. encoding ++;
  3577. htmlCheckEncodingDirect(ctxt, encoding);
  3578. }
  3579. }
  3580. /**
  3581. * htmlCheckMeta:
  3582. * @ctxt: an HTML parser context
  3583. * @atts: the attributes values
  3584. *
  3585. * Checks an attributes from a Meta tag
  3586. */
  3587. static void
  3588. htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
  3589. int i;
  3590. const xmlChar *att, *value;
  3591. int http = 0;
  3592. const xmlChar *content = NULL;
  3593. if ((ctxt == NULL) || (atts == NULL))
  3594. return;
  3595. i = 0;
  3596. att = atts[i++];
  3597. while (att != NULL) {
  3598. value = atts[i++];
  3599. if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
  3600. && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
  3601. http = 1;
  3602. else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
  3603. htmlCheckEncodingDirect(ctxt, value);
  3604. else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
  3605. content = value;
  3606. att = atts[i++];
  3607. }
  3608. if ((http) && (content != NULL))
  3609. htmlCheckEncoding(ctxt, content);
  3610. }
  3611. /**
  3612. * htmlParseStartTag:
  3613. * @ctxt: an HTML parser context
  3614. *
  3615. * parse a start of tag either for rule element or
  3616. * EmptyElement. In both case we don't parse the tag closing chars.
  3617. *
  3618. * [40] STag ::= '<' Name (S Attribute)* S? '>'
  3619. *
  3620. * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
  3621. *
  3622. * With namespace:
  3623. *
  3624. * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
  3625. *
  3626. * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
  3627. *
  3628. * Returns 0 in case of success, -1 in case of error and 1 if discarded
  3629. */
  3630. static int
  3631. htmlParseStartTag(htmlParserCtxtPtr ctxt) {
  3632. const xmlChar *name;
  3633. const xmlChar *attname;
  3634. xmlChar *attvalue;
  3635. const xmlChar **atts;
  3636. int nbatts = 0;
  3637. int maxatts;
  3638. int meta = 0;
  3639. int i;
  3640. int discardtag = 0;
  3641. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  3642. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3643. "htmlParseStartTag: context error\n", NULL, NULL);
  3644. return -1;
  3645. }
  3646. if (ctxt->instate == XML_PARSER_EOF)
  3647. return(-1);
  3648. if (CUR != '<') return -1;
  3649. NEXT;
  3650. atts = ctxt->atts;
  3651. maxatts = ctxt->maxatts;
  3652. GROW;
  3653. name = htmlParseHTMLName(ctxt);
  3654. if (name == NULL) {
  3655. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  3656. "htmlParseStartTag: invalid element name\n",
  3657. NULL, NULL);
  3658. /* if recover preserve text on classic misconstructs */
  3659. if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
  3660. (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
  3661. htmlParseCharDataInternal(ctxt, '<');
  3662. return(-1);
  3663. }
  3664. /* Dump the bogus tag like browsers do */
  3665. while ((CUR != 0) && (CUR != '>') &&
  3666. (ctxt->instate != XML_PARSER_EOF))
  3667. NEXT;
  3668. return -1;
  3669. }
  3670. if (xmlStrEqual(name, BAD_CAST"meta"))
  3671. meta = 1;
  3672. /*
  3673. * Check for auto-closure of HTML elements.
  3674. */
  3675. htmlAutoClose(ctxt, name);
  3676. /*
  3677. * Check for implied HTML elements.
  3678. */
  3679. htmlCheckImplied(ctxt, name);
  3680. /*
  3681. * Avoid html at any level > 0, head at any level != 1
  3682. * or any attempt to recurse body
  3683. */
  3684. if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
  3685. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  3686. "htmlParseStartTag: misplaced <html> tag\n",
  3687. name, NULL);
  3688. discardtag = 1;
  3689. ctxt->depth++;
  3690. }
  3691. if ((ctxt->nameNr != 1) &&
  3692. (xmlStrEqual(name, BAD_CAST"head"))) {
  3693. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  3694. "htmlParseStartTag: misplaced <head> tag\n",
  3695. name, NULL);
  3696. discardtag = 1;
  3697. ctxt->depth++;
  3698. }
  3699. if (xmlStrEqual(name, BAD_CAST"body")) {
  3700. int indx;
  3701. for (indx = 0;indx < ctxt->nameNr;indx++) {
  3702. if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
  3703. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  3704. "htmlParseStartTag: misplaced <body> tag\n",
  3705. name, NULL);
  3706. discardtag = 1;
  3707. ctxt->depth++;
  3708. }
  3709. }
  3710. }
  3711. /*
  3712. * Now parse the attributes, it ends up with the ending
  3713. *
  3714. * (S Attribute)* S?
  3715. */
  3716. SKIP_BLANKS;
  3717. while ((CUR != 0) &&
  3718. (CUR != '>') &&
  3719. ((CUR != '/') || (NXT(1) != '>'))) {
  3720. GROW;
  3721. attname = htmlParseAttribute(ctxt, &attvalue);
  3722. if (attname != NULL) {
  3723. /*
  3724. * Well formedness requires at most one declaration of an attribute
  3725. */
  3726. for (i = 0; i < nbatts;i += 2) {
  3727. if (xmlStrEqual(atts[i], attname)) {
  3728. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
  3729. "Attribute %s redefined\n", attname, NULL);
  3730. if (attvalue != NULL)
  3731. xmlFree(attvalue);
  3732. goto failed;
  3733. }
  3734. }
  3735. /*
  3736. * Add the pair to atts
  3737. */
  3738. if (atts == NULL) {
  3739. maxatts = 22; /* allow for 10 attrs by default */
  3740. atts = (const xmlChar **)
  3741. xmlMalloc(maxatts * sizeof(xmlChar *));
  3742. if (atts == NULL) {
  3743. htmlErrMemory(ctxt, NULL);
  3744. if (attvalue != NULL)
  3745. xmlFree(attvalue);
  3746. goto failed;
  3747. }
  3748. ctxt->atts = atts;
  3749. ctxt->maxatts = maxatts;
  3750. } else if (nbatts + 4 > maxatts) {
  3751. const xmlChar **n;
  3752. maxatts *= 2;
  3753. n = (const xmlChar **) xmlRealloc((void *) atts,
  3754. maxatts * sizeof(const xmlChar *));
  3755. if (n == NULL) {
  3756. htmlErrMemory(ctxt, NULL);
  3757. if (attvalue != NULL)
  3758. xmlFree(attvalue);
  3759. goto failed;
  3760. }
  3761. atts = n;
  3762. ctxt->atts = atts;
  3763. ctxt->maxatts = maxatts;
  3764. }
  3765. atts[nbatts++] = attname;
  3766. atts[nbatts++] = attvalue;
  3767. atts[nbatts] = NULL;
  3768. atts[nbatts + 1] = NULL;
  3769. }
  3770. else {
  3771. if (attvalue != NULL)
  3772. xmlFree(attvalue);
  3773. /* Dump the bogus attribute string up to the next blank or
  3774. * the end of the tag. */
  3775. while ((CUR != 0) &&
  3776. !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
  3777. ((CUR != '/') || (NXT(1) != '>')))
  3778. NEXT;
  3779. }
  3780. failed:
  3781. SKIP_BLANKS;
  3782. }
  3783. /*
  3784. * Handle specific association to the META tag
  3785. */
  3786. if (meta && (nbatts != 0))
  3787. htmlCheckMeta(ctxt, atts);
  3788. /*
  3789. * SAX: Start of Element !
  3790. */
  3791. if (!discardtag) {
  3792. htmlnamePush(ctxt, name);
  3793. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
  3794. if (nbatts != 0)
  3795. ctxt->sax->startElement(ctxt->userData, name, atts);
  3796. else
  3797. ctxt->sax->startElement(ctxt->userData, name, NULL);
  3798. }
  3799. }
  3800. if (atts != NULL) {
  3801. for (i = 1;i < nbatts;i += 2) {
  3802. if (atts[i] != NULL)
  3803. xmlFree((xmlChar *) atts[i]);
  3804. }
  3805. }
  3806. return(discardtag);
  3807. }
  3808. /**
  3809. * htmlParseEndTag:
  3810. * @ctxt: an HTML parser context
  3811. *
  3812. * parse an end of tag
  3813. *
  3814. * [42] ETag ::= '</' Name S? '>'
  3815. *
  3816. * With namespace
  3817. *
  3818. * [NS 9] ETag ::= '</' QName S? '>'
  3819. *
  3820. * Returns 1 if the current level should be closed.
  3821. */
  3822. static int
  3823. htmlParseEndTag(htmlParserCtxtPtr ctxt)
  3824. {
  3825. const xmlChar *name;
  3826. const xmlChar *oldname;
  3827. int i, ret;
  3828. if ((CUR != '<') || (NXT(1) != '/')) {
  3829. htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
  3830. "htmlParseEndTag: '</' not found\n", NULL, NULL);
  3831. return (0);
  3832. }
  3833. SKIP(2);
  3834. name = htmlParseHTMLName(ctxt);
  3835. if (name == NULL)
  3836. return (0);
  3837. /*
  3838. * We should definitely be at the ending "S? '>'" part
  3839. */
  3840. SKIP_BLANKS;
  3841. if (CUR != '>') {
  3842. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  3843. "End tag : expected '>'\n", NULL, NULL);
  3844. /* Skip to next '>' */
  3845. while ((CUR != 0) && (CUR != '>'))
  3846. NEXT;
  3847. }
  3848. if (CUR == '>')
  3849. NEXT;
  3850. /*
  3851. * if we ignored misplaced tags in htmlParseStartTag don't pop them
  3852. * out now.
  3853. */
  3854. if ((ctxt->depth > 0) &&
  3855. (xmlStrEqual(name, BAD_CAST "html") ||
  3856. xmlStrEqual(name, BAD_CAST "body") ||
  3857. xmlStrEqual(name, BAD_CAST "head"))) {
  3858. ctxt->depth--;
  3859. return (0);
  3860. }
  3861. /*
  3862. * If the name read is not one of the element in the parsing stack
  3863. * then return, it's just an error.
  3864. */
  3865. for (i = (ctxt->nameNr - 1); i >= 0; i--) {
  3866. if (xmlStrEqual(name, ctxt->nameTab[i]))
  3867. break;
  3868. }
  3869. if (i < 0) {
  3870. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  3871. "Unexpected end tag : %s\n", name, NULL);
  3872. return (0);
  3873. }
  3874. /*
  3875. * Check for auto-closure of HTML elements.
  3876. */
  3877. htmlAutoCloseOnClose(ctxt, name);
  3878. /*
  3879. * Well formedness constraints, opening and closing must match.
  3880. * With the exception that the autoclose may have popped stuff out
  3881. * of the stack.
  3882. */
  3883. if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
  3884. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  3885. "Opening and ending tag mismatch: %s and %s\n",
  3886. name, ctxt->name);
  3887. }
  3888. /*
  3889. * SAX: End of Tag
  3890. */
  3891. oldname = ctxt->name;
  3892. if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
  3893. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  3894. ctxt->sax->endElement(ctxt->userData, name);
  3895. htmlNodeInfoPop(ctxt);
  3896. htmlnamePop(ctxt);
  3897. ret = 1;
  3898. } else {
  3899. ret = 0;
  3900. }
  3901. return (ret);
  3902. }
  3903. /**
  3904. * htmlParseReference:
  3905. * @ctxt: an HTML parser context
  3906. *
  3907. * parse and handle entity references in content,
  3908. * this will end-up in a call to character() since this is either a
  3909. * CharRef, or a predefined entity.
  3910. */
  3911. static void
  3912. htmlParseReference(htmlParserCtxtPtr ctxt) {
  3913. const htmlEntityDesc * ent;
  3914. xmlChar out[6];
  3915. const xmlChar *name;
  3916. if (CUR != '&') return;
  3917. if (NXT(1) == '#') {
  3918. unsigned int c;
  3919. int bits, i = 0;
  3920. c = htmlParseCharRef(ctxt);
  3921. if (c == 0)
  3922. return;
  3923. if (c < 0x80) { out[i++]= c; bits= -6; }
  3924. else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  3925. else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  3926. else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
  3927. for ( ; bits >= 0; bits-= 6) {
  3928. out[i++]= ((c >> bits) & 0x3F) | 0x80;
  3929. }
  3930. out[i] = 0;
  3931. htmlCheckParagraph(ctxt);
  3932. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  3933. ctxt->sax->characters(ctxt->userData, out, i);
  3934. } else {
  3935. ent = htmlParseEntityRef(ctxt, &name);
  3936. if (name == NULL) {
  3937. htmlCheckParagraph(ctxt);
  3938. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  3939. ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
  3940. return;
  3941. }
  3942. if ((ent == NULL) || !(ent->value > 0)) {
  3943. htmlCheckParagraph(ctxt);
  3944. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
  3945. ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
  3946. ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
  3947. /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
  3948. }
  3949. } else {
  3950. unsigned int c;
  3951. int bits, i = 0;
  3952. c = ent->value;
  3953. if (c < 0x80)
  3954. { out[i++]= c; bits= -6; }
  3955. else if (c < 0x800)
  3956. { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  3957. else if (c < 0x10000)
  3958. { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  3959. else
  3960. { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
  3961. for ( ; bits >= 0; bits-= 6) {
  3962. out[i++]= ((c >> bits) & 0x3F) | 0x80;
  3963. }
  3964. out[i] = 0;
  3965. htmlCheckParagraph(ctxt);
  3966. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  3967. ctxt->sax->characters(ctxt->userData, out, i);
  3968. }
  3969. }
  3970. }
  3971. /**
  3972. * htmlParseContent:
  3973. * @ctxt: an HTML parser context
  3974. *
  3975. * Parse a content: comment, sub-element, reference or text.
  3976. * Kept for compatibility with old code
  3977. */
  3978. static void
  3979. htmlParseContent(htmlParserCtxtPtr ctxt) {
  3980. xmlChar *currentNode;
  3981. int depth;
  3982. const xmlChar *name;
  3983. currentNode = xmlStrdup(ctxt->name);
  3984. depth = ctxt->nameNr;
  3985. while (1) {
  3986. GROW;
  3987. if (ctxt->instate == XML_PARSER_EOF)
  3988. break;
  3989. /*
  3990. * Our tag or one of it's parent or children is ending.
  3991. */
  3992. if ((CUR == '<') && (NXT(1) == '/')) {
  3993. if (htmlParseEndTag(ctxt) &&
  3994. ((currentNode != NULL) || (ctxt->nameNr == 0))) {
  3995. if (currentNode != NULL)
  3996. xmlFree(currentNode);
  3997. return;
  3998. }
  3999. continue; /* while */
  4000. }
  4001. else if ((CUR == '<') &&
  4002. ((IS_ASCII_LETTER(NXT(1))) ||
  4003. (NXT(1) == '_') || (NXT(1) == ':'))) {
  4004. name = htmlParseHTMLName_nonInvasive(ctxt);
  4005. if (name == NULL) {
  4006. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  4007. "htmlParseStartTag: invalid element name\n",
  4008. NULL, NULL);
  4009. /* Dump the bogus tag like browsers do */
  4010. while ((CUR != 0) && (CUR != '>'))
  4011. NEXT;
  4012. if (currentNode != NULL)
  4013. xmlFree(currentNode);
  4014. return;
  4015. }
  4016. if (ctxt->name != NULL) {
  4017. if (htmlCheckAutoClose(name, ctxt->name) == 1) {
  4018. htmlAutoClose(ctxt, name);
  4019. continue;
  4020. }
  4021. }
  4022. }
  4023. /*
  4024. * Has this node been popped out during parsing of
  4025. * the next element
  4026. */
  4027. if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
  4028. (!xmlStrEqual(currentNode, ctxt->name)))
  4029. {
  4030. if (currentNode != NULL) xmlFree(currentNode);
  4031. return;
  4032. }
  4033. if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
  4034. (xmlStrEqual(currentNode, BAD_CAST"style")))) {
  4035. /*
  4036. * Handle SCRIPT/STYLE separately
  4037. */
  4038. htmlParseScript(ctxt);
  4039. } else {
  4040. /*
  4041. * Sometimes DOCTYPE arrives in the middle of the document
  4042. */
  4043. if ((CUR == '<') && (NXT(1) == '!') &&
  4044. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  4045. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  4046. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  4047. (UPP(8) == 'E')) {
  4048. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  4049. "Misplaced DOCTYPE declaration\n",
  4050. BAD_CAST "DOCTYPE" , NULL);
  4051. htmlParseDocTypeDecl(ctxt);
  4052. }
  4053. /*
  4054. * First case : a comment
  4055. */
  4056. if ((CUR == '<') && (NXT(1) == '!') &&
  4057. (NXT(2) == '-') && (NXT(3) == '-')) {
  4058. htmlParseComment(ctxt);
  4059. }
  4060. /*
  4061. * Second case : a Processing Instruction.
  4062. */
  4063. else if ((CUR == '<') && (NXT(1) == '?')) {
  4064. htmlParsePI(ctxt);
  4065. }
  4066. /*
  4067. * Third case : a sub-element.
  4068. */
  4069. else if (CUR == '<') {
  4070. htmlParseElement(ctxt);
  4071. }
  4072. /*
  4073. * Fourth case : a reference. If if has not been resolved,
  4074. * parsing returns it's Name, create the node
  4075. */
  4076. else if (CUR == '&') {
  4077. htmlParseReference(ctxt);
  4078. }
  4079. /*
  4080. * Fifth case : end of the resource
  4081. */
  4082. else if (CUR == 0) {
  4083. htmlAutoCloseOnEnd(ctxt);
  4084. break;
  4085. }
  4086. /*
  4087. * Last case, text. Note that References are handled directly.
  4088. */
  4089. else {
  4090. htmlParseCharData(ctxt);
  4091. }
  4092. }
  4093. GROW;
  4094. }
  4095. if (currentNode != NULL) xmlFree(currentNode);
  4096. }
  4097. /**
  4098. * htmlParseElement:
  4099. * @ctxt: an HTML parser context
  4100. *
  4101. * parse an HTML element, this is highly recursive
  4102. * this is kept for compatibility with previous code versions
  4103. *
  4104. * [39] element ::= EmptyElemTag | STag content ETag
  4105. *
  4106. * [41] Attribute ::= Name Eq AttValue
  4107. */
  4108. void
  4109. htmlParseElement(htmlParserCtxtPtr ctxt) {
  4110. const xmlChar *name;
  4111. xmlChar *currentNode = NULL;
  4112. const htmlElemDesc * info;
  4113. htmlParserNodeInfo node_info;
  4114. int failed;
  4115. int depth;
  4116. const xmlChar *oldptr;
  4117. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  4118. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  4119. "htmlParseElement: context error\n", NULL, NULL);
  4120. return;
  4121. }
  4122. if (ctxt->instate == XML_PARSER_EOF)
  4123. return;
  4124. /* Capture start position */
  4125. if (ctxt->record_info) {
  4126. node_info.begin_pos = ctxt->input->consumed +
  4127. (CUR_PTR - ctxt->input->base);
  4128. node_info.begin_line = ctxt->input->line;
  4129. }
  4130. failed = htmlParseStartTag(ctxt);
  4131. name = ctxt->name;
  4132. if ((failed == -1) || (name == NULL)) {
  4133. if (CUR == '>')
  4134. NEXT;
  4135. return;
  4136. }
  4137. /*
  4138. * Lookup the info for that element.
  4139. */
  4140. info = htmlTagLookup(name);
  4141. if (info == NULL) {
  4142. htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
  4143. "Tag %s invalid\n", name, NULL);
  4144. }
  4145. /*
  4146. * Check for an Empty Element labeled the XML/SGML way
  4147. */
  4148. if ((CUR == '/') && (NXT(1) == '>')) {
  4149. SKIP(2);
  4150. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  4151. ctxt->sax->endElement(ctxt->userData, name);
  4152. htmlnamePop(ctxt);
  4153. return;
  4154. }
  4155. if (CUR == '>') {
  4156. NEXT;
  4157. } else {
  4158. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  4159. "Couldn't find end of Start Tag %s\n", name, NULL);
  4160. /*
  4161. * end of parsing of this node.
  4162. */
  4163. if (xmlStrEqual(name, ctxt->name)) {
  4164. nodePop(ctxt);
  4165. htmlnamePop(ctxt);
  4166. }
  4167. /*
  4168. * Capture end position and add node
  4169. */
  4170. if (ctxt->record_info) {
  4171. node_info.end_pos = ctxt->input->consumed +
  4172. (CUR_PTR - ctxt->input->base);
  4173. node_info.end_line = ctxt->input->line;
  4174. node_info.node = ctxt->node;
  4175. xmlParserAddNodeInfo(ctxt, &node_info);
  4176. }
  4177. return;
  4178. }
  4179. /*
  4180. * Check for an Empty Element from DTD definition
  4181. */
  4182. if ((info != NULL) && (info->empty)) {
  4183. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  4184. ctxt->sax->endElement(ctxt->userData, name);
  4185. htmlnamePop(ctxt);
  4186. return;
  4187. }
  4188. /*
  4189. * Parse the content of the element:
  4190. */
  4191. currentNode = xmlStrdup(ctxt->name);
  4192. depth = ctxt->nameNr;
  4193. while (CUR != 0) {
  4194. oldptr = ctxt->input->cur;
  4195. htmlParseContent(ctxt);
  4196. if (oldptr==ctxt->input->cur) break;
  4197. if (ctxt->nameNr < depth) break;
  4198. }
  4199. /*
  4200. * Capture end position and add node
  4201. */
  4202. if ( currentNode != NULL && ctxt->record_info ) {
  4203. node_info.end_pos = ctxt->input->consumed +
  4204. (CUR_PTR - ctxt->input->base);
  4205. node_info.end_line = ctxt->input->line;
  4206. node_info.node = ctxt->node;
  4207. xmlParserAddNodeInfo(ctxt, &node_info);
  4208. }
  4209. if (CUR == 0) {
  4210. htmlAutoCloseOnEnd(ctxt);
  4211. }
  4212. if (currentNode != NULL)
  4213. xmlFree(currentNode);
  4214. }
  4215. static void
  4216. htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
  4217. /*
  4218. * Capture end position and add node
  4219. */
  4220. if ( ctxt->node != NULL && ctxt->record_info ) {
  4221. ctxt->nodeInfo->end_pos = ctxt->input->consumed +
  4222. (CUR_PTR - ctxt->input->base);
  4223. ctxt->nodeInfo->end_line = ctxt->input->line;
  4224. ctxt->nodeInfo->node = ctxt->node;
  4225. xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
  4226. htmlNodeInfoPop(ctxt);
  4227. }
  4228. if (CUR == 0) {
  4229. htmlAutoCloseOnEnd(ctxt);
  4230. }
  4231. }
  4232. /**
  4233. * htmlParseElementInternal:
  4234. * @ctxt: an HTML parser context
  4235. *
  4236. * parse an HTML element, new version, non recursive
  4237. *
  4238. * [39] element ::= EmptyElemTag | STag content ETag
  4239. *
  4240. * [41] Attribute ::= Name Eq AttValue
  4241. */
  4242. static void
  4243. htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
  4244. const xmlChar *name;
  4245. const htmlElemDesc * info;
  4246. htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
  4247. int failed;
  4248. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  4249. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  4250. "htmlParseElementInternal: context error\n", NULL, NULL);
  4251. return;
  4252. }
  4253. if (ctxt->instate == XML_PARSER_EOF)
  4254. return;
  4255. /* Capture start position */
  4256. if (ctxt->record_info) {
  4257. node_info.begin_pos = ctxt->input->consumed +
  4258. (CUR_PTR - ctxt->input->base);
  4259. node_info.begin_line = ctxt->input->line;
  4260. }
  4261. failed = htmlParseStartTag(ctxt);
  4262. name = ctxt->name;
  4263. if ((failed == -1) || (name == NULL)) {
  4264. if (CUR == '>')
  4265. NEXT;
  4266. return;
  4267. }
  4268. /*
  4269. * Lookup the info for that element.
  4270. */
  4271. info = htmlTagLookup(name);
  4272. if (info == NULL) {
  4273. htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
  4274. "Tag %s invalid\n", name, NULL);
  4275. }
  4276. /*
  4277. * Check for an Empty Element labeled the XML/SGML way
  4278. */
  4279. if ((CUR == '/') && (NXT(1) == '>')) {
  4280. SKIP(2);
  4281. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  4282. ctxt->sax->endElement(ctxt->userData, name);
  4283. htmlnamePop(ctxt);
  4284. return;
  4285. }
  4286. if (CUR == '>') {
  4287. NEXT;
  4288. } else {
  4289. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  4290. "Couldn't find end of Start Tag %s\n", name, NULL);
  4291. /*
  4292. * end of parsing of this node.
  4293. */
  4294. if (xmlStrEqual(name, ctxt->name)) {
  4295. nodePop(ctxt);
  4296. htmlnamePop(ctxt);
  4297. }
  4298. if (ctxt->record_info)
  4299. htmlNodeInfoPush(ctxt, &node_info);
  4300. htmlParserFinishElementParsing(ctxt);
  4301. return;
  4302. }
  4303. /*
  4304. * Check for an Empty Element from DTD definition
  4305. */
  4306. if ((info != NULL) && (info->empty)) {
  4307. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  4308. ctxt->sax->endElement(ctxt->userData, name);
  4309. htmlnamePop(ctxt);
  4310. return;
  4311. }
  4312. if (ctxt->record_info)
  4313. htmlNodeInfoPush(ctxt, &node_info);
  4314. }
  4315. /**
  4316. * htmlParseContentInternal:
  4317. * @ctxt: an HTML parser context
  4318. *
  4319. * Parse a content: comment, sub-element, reference or text.
  4320. * New version for non recursive htmlParseElementInternal
  4321. */
  4322. static void
  4323. htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
  4324. xmlChar *currentNode;
  4325. int depth;
  4326. const xmlChar *name;
  4327. currentNode = xmlStrdup(ctxt->name);
  4328. depth = ctxt->nameNr;
  4329. while (1) {
  4330. GROW;
  4331. if (ctxt->instate == XML_PARSER_EOF)
  4332. break;
  4333. /*
  4334. * Our tag or one of it's parent or children is ending.
  4335. */
  4336. if ((CUR == '<') && (NXT(1) == '/')) {
  4337. if (htmlParseEndTag(ctxt) &&
  4338. ((currentNode != NULL) || (ctxt->nameNr == 0))) {
  4339. if (currentNode != NULL)
  4340. xmlFree(currentNode);
  4341. currentNode = xmlStrdup(ctxt->name);
  4342. depth = ctxt->nameNr;
  4343. }
  4344. continue; /* while */
  4345. }
  4346. else if ((CUR == '<') &&
  4347. ((IS_ASCII_LETTER(NXT(1))) ||
  4348. (NXT(1) == '_') || (NXT(1) == ':'))) {
  4349. name = htmlParseHTMLName_nonInvasive(ctxt);
  4350. if (name == NULL) {
  4351. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  4352. "htmlParseStartTag: invalid element name\n",
  4353. NULL, NULL);
  4354. /* Dump the bogus tag like browsers do */
  4355. while ((CUR == 0) && (CUR != '>'))
  4356. NEXT;
  4357. htmlParserFinishElementParsing(ctxt);
  4358. if (currentNode != NULL)
  4359. xmlFree(currentNode);
  4360. currentNode = xmlStrdup(ctxt->name);
  4361. depth = ctxt->nameNr;
  4362. continue;
  4363. }
  4364. if (ctxt->name != NULL) {
  4365. if (htmlCheckAutoClose(name, ctxt->name) == 1) {
  4366. htmlAutoClose(ctxt, name);
  4367. continue;
  4368. }
  4369. }
  4370. }
  4371. /*
  4372. * Has this node been popped out during parsing of
  4373. * the next element
  4374. */
  4375. if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
  4376. (!xmlStrEqual(currentNode, ctxt->name)))
  4377. {
  4378. htmlParserFinishElementParsing(ctxt);
  4379. if (currentNode != NULL) xmlFree(currentNode);
  4380. currentNode = xmlStrdup(ctxt->name);
  4381. depth = ctxt->nameNr;
  4382. continue;
  4383. }
  4384. if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
  4385. (xmlStrEqual(currentNode, BAD_CAST"style")))) {
  4386. /*
  4387. * Handle SCRIPT/STYLE separately
  4388. */
  4389. htmlParseScript(ctxt);
  4390. } else {
  4391. /*
  4392. * Sometimes DOCTYPE arrives in the middle of the document
  4393. */
  4394. if ((CUR == '<') && (NXT(1) == '!') &&
  4395. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  4396. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  4397. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  4398. (UPP(8) == 'E')) {
  4399. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  4400. "Misplaced DOCTYPE declaration\n",
  4401. BAD_CAST "DOCTYPE" , NULL);
  4402. htmlParseDocTypeDecl(ctxt);
  4403. }
  4404. /*
  4405. * First case : a comment
  4406. */
  4407. if ((CUR == '<') && (NXT(1) == '!') &&
  4408. (NXT(2) == '-') && (NXT(3) == '-')) {
  4409. htmlParseComment(ctxt);
  4410. }
  4411. /*
  4412. * Second case : a Processing Instruction.
  4413. */
  4414. else if ((CUR == '<') && (NXT(1) == '?')) {
  4415. htmlParsePI(ctxt);
  4416. }
  4417. /*
  4418. * Third case : a sub-element.
  4419. */
  4420. else if (CUR == '<') {
  4421. htmlParseElementInternal(ctxt);
  4422. if (currentNode != NULL) xmlFree(currentNode);
  4423. currentNode = xmlStrdup(ctxt->name);
  4424. depth = ctxt->nameNr;
  4425. }
  4426. /*
  4427. * Fourth case : a reference. If if has not been resolved,
  4428. * parsing returns it's Name, create the node
  4429. */
  4430. else if (CUR == '&') {
  4431. htmlParseReference(ctxt);
  4432. }
  4433. /*
  4434. * Fifth case : end of the resource
  4435. */
  4436. else if (CUR == 0) {
  4437. htmlAutoCloseOnEnd(ctxt);
  4438. break;
  4439. }
  4440. /*
  4441. * Last case, text. Note that References are handled directly.
  4442. */
  4443. else {
  4444. htmlParseCharData(ctxt);
  4445. }
  4446. }
  4447. GROW;
  4448. }
  4449. if (currentNode != NULL) xmlFree(currentNode);
  4450. }
  4451. /**
  4452. * htmlParseContent:
  4453. * @ctxt: an HTML parser context
  4454. *
  4455. * Parse a content: comment, sub-element, reference or text.
  4456. * This is the entry point when called from parser.c
  4457. */
  4458. void
  4459. __htmlParseContent(void *ctxt) {
  4460. if (ctxt != NULL)
  4461. htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
  4462. }
  4463. /**
  4464. * htmlParseDocument:
  4465. * @ctxt: an HTML parser context
  4466. *
  4467. * parse an HTML document (and build a tree if using the standard SAX
  4468. * interface).
  4469. *
  4470. * Returns 0, -1 in case of error. the parser context is augmented
  4471. * as a result of the parsing.
  4472. */
  4473. int
  4474. htmlParseDocument(htmlParserCtxtPtr ctxt) {
  4475. xmlChar start[4];
  4476. xmlCharEncoding enc;
  4477. xmlDtdPtr dtd;
  4478. xmlInitParser();
  4479. htmlDefaultSAXHandlerInit();
  4480. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  4481. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  4482. "htmlParseDocument: context error\n", NULL, NULL);
  4483. return(XML_ERR_INTERNAL_ERROR);
  4484. }
  4485. ctxt->html = 1;
  4486. ctxt->linenumbers = 1;
  4487. GROW;
  4488. /*
  4489. * SAX: beginning of the document processing.
  4490. */
  4491. if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
  4492. ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
  4493. if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
  4494. ((ctxt->input->end - ctxt->input->cur) >= 4)) {
  4495. /*
  4496. * Get the 4 first bytes and decode the charset
  4497. * if enc != XML_CHAR_ENCODING_NONE
  4498. * plug some encoding conversion routines.
  4499. */
  4500. start[0] = RAW;
  4501. start[1] = NXT(1);
  4502. start[2] = NXT(2);
  4503. start[3] = NXT(3);
  4504. enc = xmlDetectCharEncoding(&start[0], 4);
  4505. if (enc != XML_CHAR_ENCODING_NONE) {
  4506. xmlSwitchEncoding(ctxt, enc);
  4507. }
  4508. }
  4509. /*
  4510. * Wipe out everything which is before the first '<'
  4511. */
  4512. SKIP_BLANKS;
  4513. if (CUR == 0) {
  4514. htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
  4515. "Document is empty\n", NULL, NULL);
  4516. }
  4517. if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
  4518. ctxt->sax->startDocument(ctxt->userData);
  4519. /*
  4520. * Parse possible comments and PIs before any content
  4521. */
  4522. while (((CUR == '<') && (NXT(1) == '!') &&
  4523. (NXT(2) == '-') && (NXT(3) == '-')) ||
  4524. ((CUR == '<') && (NXT(1) == '?'))) {
  4525. htmlParseComment(ctxt);
  4526. htmlParsePI(ctxt);
  4527. SKIP_BLANKS;
  4528. }
  4529. /*
  4530. * Then possibly doc type declaration(s) and more Misc
  4531. * (doctypedecl Misc*)?
  4532. */
  4533. if ((CUR == '<') && (NXT(1) == '!') &&
  4534. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  4535. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  4536. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  4537. (UPP(8) == 'E')) {
  4538. htmlParseDocTypeDecl(ctxt);
  4539. }
  4540. SKIP_BLANKS;
  4541. /*
  4542. * Parse possible comments and PIs before any content
  4543. */
  4544. while (((CUR == '<') && (NXT(1) == '!') &&
  4545. (NXT(2) == '-') && (NXT(3) == '-')) ||
  4546. ((CUR == '<') && (NXT(1) == '?'))) {
  4547. htmlParseComment(ctxt);
  4548. htmlParsePI(ctxt);
  4549. SKIP_BLANKS;
  4550. }
  4551. /*
  4552. * Time to start parsing the tree itself
  4553. */
  4554. htmlParseContentInternal(ctxt);
  4555. /*
  4556. * autoclose
  4557. */
  4558. if (CUR == 0)
  4559. htmlAutoCloseOnEnd(ctxt);
  4560. /*
  4561. * SAX: end of the document processing.
  4562. */
  4563. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  4564. ctxt->sax->endDocument(ctxt->userData);
  4565. if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
  4566. dtd = xmlGetIntSubset(ctxt->myDoc);
  4567. if (dtd == NULL)
  4568. ctxt->myDoc->intSubset =
  4569. xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
  4570. BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
  4571. BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
  4572. }
  4573. if (! ctxt->wellFormed) return(-1);
  4574. return(0);
  4575. }
  4576. /************************************************************************
  4577. * *
  4578. * Parser contexts handling *
  4579. * *
  4580. ************************************************************************/
  4581. /**
  4582. * htmlInitParserCtxt:
  4583. * @ctxt: an HTML parser context
  4584. *
  4585. * Initialize a parser context
  4586. *
  4587. * Returns 0 in case of success and -1 in case of error
  4588. */
  4589. static int
  4590. htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
  4591. {
  4592. htmlSAXHandler *sax;
  4593. if (ctxt == NULL) return(-1);
  4594. memset(ctxt, 0, sizeof(htmlParserCtxt));
  4595. ctxt->dict = xmlDictCreate();
  4596. if (ctxt->dict == NULL) {
  4597. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4598. return(-1);
  4599. }
  4600. sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
  4601. if (sax == NULL) {
  4602. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4603. return(-1);
  4604. }
  4605. else
  4606. memset(sax, 0, sizeof(htmlSAXHandler));
  4607. /* Allocate the Input stack */
  4608. ctxt->inputTab = (htmlParserInputPtr *)
  4609. xmlMalloc(5 * sizeof(htmlParserInputPtr));
  4610. if (ctxt->inputTab == NULL) {
  4611. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4612. ctxt->inputNr = 0;
  4613. ctxt->inputMax = 0;
  4614. ctxt->input = NULL;
  4615. return(-1);
  4616. }
  4617. ctxt->inputNr = 0;
  4618. ctxt->inputMax = 5;
  4619. ctxt->input = NULL;
  4620. ctxt->version = NULL;
  4621. ctxt->encoding = NULL;
  4622. ctxt->standalone = -1;
  4623. ctxt->instate = XML_PARSER_START;
  4624. /* Allocate the Node stack */
  4625. ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
  4626. if (ctxt->nodeTab == NULL) {
  4627. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4628. ctxt->nodeNr = 0;
  4629. ctxt->nodeMax = 0;
  4630. ctxt->node = NULL;
  4631. ctxt->inputNr = 0;
  4632. ctxt->inputMax = 0;
  4633. ctxt->input = NULL;
  4634. return(-1);
  4635. }
  4636. ctxt->nodeNr = 0;
  4637. ctxt->nodeMax = 10;
  4638. ctxt->node = NULL;
  4639. /* Allocate the Name stack */
  4640. ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
  4641. if (ctxt->nameTab == NULL) {
  4642. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4643. ctxt->nameNr = 0;
  4644. ctxt->nameMax = 0;
  4645. ctxt->name = NULL;
  4646. ctxt->nodeNr = 0;
  4647. ctxt->nodeMax = 0;
  4648. ctxt->node = NULL;
  4649. ctxt->inputNr = 0;
  4650. ctxt->inputMax = 0;
  4651. ctxt->input = NULL;
  4652. return(-1);
  4653. }
  4654. ctxt->nameNr = 0;
  4655. ctxt->nameMax = 10;
  4656. ctxt->name = NULL;
  4657. ctxt->nodeInfoTab = NULL;
  4658. ctxt->nodeInfoNr = 0;
  4659. ctxt->nodeInfoMax = 0;
  4660. if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
  4661. else {
  4662. ctxt->sax = sax;
  4663. memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
  4664. }
  4665. ctxt->userData = ctxt;
  4666. ctxt->myDoc = NULL;
  4667. ctxt->wellFormed = 1;
  4668. ctxt->replaceEntities = 0;
  4669. ctxt->linenumbers = xmlLineNumbersDefaultValue;
  4670. ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
  4671. ctxt->html = 1;
  4672. ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
  4673. ctxt->vctxt.userData = ctxt;
  4674. ctxt->vctxt.error = xmlParserValidityError;
  4675. ctxt->vctxt.warning = xmlParserValidityWarning;
  4676. ctxt->record_info = 0;
  4677. ctxt->validate = 0;
  4678. ctxt->checkIndex = 0;
  4679. ctxt->catalogs = NULL;
  4680. xmlInitNodeInfoSeq(&ctxt->node_seq);
  4681. return(0);
  4682. }
  4683. /**
  4684. * htmlFreeParserCtxt:
  4685. * @ctxt: an HTML parser context
  4686. *
  4687. * Free all the memory used by a parser context. However the parsed
  4688. * document in ctxt->myDoc is not freed.
  4689. */
  4690. void
  4691. htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
  4692. {
  4693. xmlFreeParserCtxt(ctxt);
  4694. }
  4695. /**
  4696. * htmlNewParserCtxt:
  4697. *
  4698. * Allocate and initialize a new parser context.
  4699. *
  4700. * Returns the htmlParserCtxtPtr or NULL in case of allocation error
  4701. */
  4702. htmlParserCtxtPtr
  4703. htmlNewParserCtxt(void)
  4704. {
  4705. xmlParserCtxtPtr ctxt;
  4706. ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
  4707. if (ctxt == NULL) {
  4708. htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
  4709. return(NULL);
  4710. }
  4711. memset(ctxt, 0, sizeof(xmlParserCtxt));
  4712. if (htmlInitParserCtxt(ctxt) < 0) {
  4713. htmlFreeParserCtxt(ctxt);
  4714. return(NULL);
  4715. }
  4716. return(ctxt);
  4717. }
  4718. /**
  4719. * htmlCreateMemoryParserCtxt:
  4720. * @buffer: a pointer to a char array
  4721. * @size: the size of the array
  4722. *
  4723. * Create a parser context for an HTML in-memory document.
  4724. *
  4725. * Returns the new parser context or NULL
  4726. */
  4727. htmlParserCtxtPtr
  4728. htmlCreateMemoryParserCtxt(const char *buffer, int size) {
  4729. xmlParserCtxtPtr ctxt;
  4730. xmlParserInputPtr input;
  4731. xmlParserInputBufferPtr buf;
  4732. if (buffer == NULL)
  4733. return(NULL);
  4734. if (size <= 0)
  4735. return(NULL);
  4736. ctxt = htmlNewParserCtxt();
  4737. if (ctxt == NULL)
  4738. return(NULL);
  4739. buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
  4740. if (buf == NULL) return(NULL);
  4741. input = xmlNewInputStream(ctxt);
  4742. if (input == NULL) {
  4743. xmlFreeParserCtxt(ctxt);
  4744. return(NULL);
  4745. }
  4746. input->filename = NULL;
  4747. input->buf = buf;
  4748. xmlBufResetInput(buf->buffer, input);
  4749. inputPush(ctxt, input);
  4750. return(ctxt);
  4751. }
  4752. /**
  4753. * htmlCreateDocParserCtxt:
  4754. * @cur: a pointer to an array of xmlChar
  4755. * @encoding: a free form C string describing the HTML document encoding, or NULL
  4756. *
  4757. * Create a parser context for an HTML document.
  4758. *
  4759. * TODO: check the need to add encoding handling there
  4760. *
  4761. * Returns the new parser context or NULL
  4762. */
  4763. static htmlParserCtxtPtr
  4764. htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
  4765. int len;
  4766. htmlParserCtxtPtr ctxt;
  4767. if (cur == NULL)
  4768. return(NULL);
  4769. len = xmlStrlen(cur);
  4770. ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
  4771. if (ctxt == NULL)
  4772. return(NULL);
  4773. if (encoding != NULL) {
  4774. xmlCharEncoding enc;
  4775. xmlCharEncodingHandlerPtr handler;
  4776. if (ctxt->input->encoding != NULL)
  4777. xmlFree((xmlChar *) ctxt->input->encoding);
  4778. ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
  4779. enc = xmlParseCharEncoding(encoding);
  4780. /*
  4781. * registered set of known encodings
  4782. */
  4783. if (enc != XML_CHAR_ENCODING_ERROR) {
  4784. xmlSwitchEncoding(ctxt, enc);
  4785. if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
  4786. htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  4787. "Unsupported encoding %s\n",
  4788. (const xmlChar *) encoding, NULL);
  4789. }
  4790. } else {
  4791. /*
  4792. * fallback for unknown encodings
  4793. */
  4794. handler = xmlFindCharEncodingHandler((const char *) encoding);
  4795. if (handler != NULL) {
  4796. xmlSwitchToEncoding(ctxt, handler);
  4797. } else {
  4798. htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  4799. "Unsupported encoding %s\n",
  4800. (const xmlChar *) encoding, NULL);
  4801. }
  4802. }
  4803. }
  4804. return(ctxt);
  4805. }
  4806. #ifdef LIBXML_PUSH_ENABLED
  4807. /************************************************************************
  4808. * *
  4809. * Progressive parsing interfaces *
  4810. * *
  4811. ************************************************************************/
  4812. /**
  4813. * htmlParseLookupSequence:
  4814. * @ctxt: an HTML parser context
  4815. * @first: the first char to lookup
  4816. * @next: the next char to lookup or zero
  4817. * @third: the next char to lookup or zero
  4818. * @ignoreattrval: skip over attribute values
  4819. *
  4820. * Try to find if a sequence (first, next, third) or just (first next) or
  4821. * (first) is available in the input stream.
  4822. * This function has a side effect of (possibly) incrementing ctxt->checkIndex
  4823. * to avoid rescanning sequences of bytes, it DOES change the state of the
  4824. * parser, do not use liberally.
  4825. * This is basically similar to xmlParseLookupSequence()
  4826. *
  4827. * Returns the index to the current parsing point if the full sequence
  4828. * is available, -1 otherwise.
  4829. */
  4830. static int
  4831. htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
  4832. xmlChar next, xmlChar third, int ignoreattrval)
  4833. {
  4834. int base, len;
  4835. htmlParserInputPtr in;
  4836. const xmlChar *buf;
  4837. int invalue = 0;
  4838. char valdellim = 0x0;
  4839. in = ctxt->input;
  4840. if (in == NULL)
  4841. return (-1);
  4842. base = in->cur - in->base;
  4843. if (base < 0)
  4844. return (-1);
  4845. if (ctxt->checkIndex > base) {
  4846. base = ctxt->checkIndex;
  4847. /* Abuse hasPErefs member to restore current state. */
  4848. invalue = ctxt->hasPErefs & 1 ? 1 : 0;
  4849. }
  4850. if (in->buf == NULL) {
  4851. buf = in->base;
  4852. len = in->length;
  4853. } else {
  4854. buf = xmlBufContent(in->buf->buffer);
  4855. len = xmlBufUse(in->buf->buffer);
  4856. }
  4857. /* take into account the sequence length */
  4858. if (third)
  4859. len -= 2;
  4860. else if (next)
  4861. len--;
  4862. for (; base < len; base++) {
  4863. if (ignoreattrval) {
  4864. if (buf[base] == '"' || buf[base] == '\'') {
  4865. if (invalue) {
  4866. if (buf[base] == valdellim) {
  4867. invalue = 0;
  4868. continue;
  4869. }
  4870. } else {
  4871. valdellim = buf[base];
  4872. invalue = 1;
  4873. continue;
  4874. }
  4875. } else if (invalue) {
  4876. continue;
  4877. }
  4878. }
  4879. if (buf[base] == first) {
  4880. if (third != 0) {
  4881. if ((buf[base + 1] != next) || (buf[base + 2] != third))
  4882. continue;
  4883. } else if (next != 0) {
  4884. if (buf[base + 1] != next)
  4885. continue;
  4886. }
  4887. ctxt->checkIndex = 0;
  4888. #ifdef DEBUG_PUSH
  4889. if (next == 0)
  4890. xmlGenericError(xmlGenericErrorContext,
  4891. "HPP: lookup '%c' found at %d\n",
  4892. first, base);
  4893. else if (third == 0)
  4894. xmlGenericError(xmlGenericErrorContext,
  4895. "HPP: lookup '%c%c' found at %d\n",
  4896. first, next, base);
  4897. else
  4898. xmlGenericError(xmlGenericErrorContext,
  4899. "HPP: lookup '%c%c%c' found at %d\n",
  4900. first, next, third, base);
  4901. #endif
  4902. return (base - (in->cur - in->base));
  4903. }
  4904. }
  4905. ctxt->checkIndex = base;
  4906. /* Abuse hasPErefs member to track current state. */
  4907. if (invalue)
  4908. ctxt->hasPErefs |= 1;
  4909. else
  4910. ctxt->hasPErefs &= ~1;
  4911. #ifdef DEBUG_PUSH
  4912. if (next == 0)
  4913. xmlGenericError(xmlGenericErrorContext,
  4914. "HPP: lookup '%c' failed\n", first);
  4915. else if (third == 0)
  4916. xmlGenericError(xmlGenericErrorContext,
  4917. "HPP: lookup '%c%c' failed\n", first, next);
  4918. else
  4919. xmlGenericError(xmlGenericErrorContext,
  4920. "HPP: lookup '%c%c%c' failed\n", first, next,
  4921. third);
  4922. #endif
  4923. return (-1);
  4924. }
  4925. /**
  4926. * htmlParseLookupCommentEnd:
  4927. * @ctxt: an HTML parser context
  4928. *
  4929. * Try to find a comment end tag in the input stream
  4930. * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
  4931. * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
  4932. * This function has a side effect of (possibly) incrementing ctxt->checkIndex
  4933. * to avoid rescanning sequences of bytes, it DOES change the state of the
  4934. * parser, do not use liberally.
  4935. * This wraps to htmlParseLookupSequence()
  4936. *
  4937. * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
  4938. */
  4939. static int
  4940. htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
  4941. {
  4942. int mark = 0;
  4943. int cur = CUR_PTR - BASE_PTR;
  4944. while (mark >= 0) {
  4945. mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
  4946. if ((mark < 0) ||
  4947. (NXT(mark+2) == '>') ||
  4948. ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
  4949. return mark;
  4950. }
  4951. ctxt->checkIndex = cur + mark + 1;
  4952. }
  4953. return mark;
  4954. }
  4955. /**
  4956. * htmlParseTryOrFinish:
  4957. * @ctxt: an HTML parser context
  4958. * @terminate: last chunk indicator
  4959. *
  4960. * Try to progress on parsing
  4961. *
  4962. * Returns zero if no parsing was possible
  4963. */
  4964. static int
  4965. htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
  4966. int ret = 0;
  4967. htmlParserInputPtr in;
  4968. ptrdiff_t avail = 0;
  4969. xmlChar cur, next;
  4970. htmlParserNodeInfo node_info;
  4971. #ifdef DEBUG_PUSH
  4972. switch (ctxt->instate) {
  4973. case XML_PARSER_EOF:
  4974. xmlGenericError(xmlGenericErrorContext,
  4975. "HPP: try EOF\n"); break;
  4976. case XML_PARSER_START:
  4977. xmlGenericError(xmlGenericErrorContext,
  4978. "HPP: try START\n"); break;
  4979. case XML_PARSER_MISC:
  4980. xmlGenericError(xmlGenericErrorContext,
  4981. "HPP: try MISC\n");break;
  4982. case XML_PARSER_COMMENT:
  4983. xmlGenericError(xmlGenericErrorContext,
  4984. "HPP: try COMMENT\n");break;
  4985. case XML_PARSER_PROLOG:
  4986. xmlGenericError(xmlGenericErrorContext,
  4987. "HPP: try PROLOG\n");break;
  4988. case XML_PARSER_START_TAG:
  4989. xmlGenericError(xmlGenericErrorContext,
  4990. "HPP: try START_TAG\n");break;
  4991. case XML_PARSER_CONTENT:
  4992. xmlGenericError(xmlGenericErrorContext,
  4993. "HPP: try CONTENT\n");break;
  4994. case XML_PARSER_CDATA_SECTION:
  4995. xmlGenericError(xmlGenericErrorContext,
  4996. "HPP: try CDATA_SECTION\n");break;
  4997. case XML_PARSER_END_TAG:
  4998. xmlGenericError(xmlGenericErrorContext,
  4999. "HPP: try END_TAG\n");break;
  5000. case XML_PARSER_ENTITY_DECL:
  5001. xmlGenericError(xmlGenericErrorContext,
  5002. "HPP: try ENTITY_DECL\n");break;
  5003. case XML_PARSER_ENTITY_VALUE:
  5004. xmlGenericError(xmlGenericErrorContext,
  5005. "HPP: try ENTITY_VALUE\n");break;
  5006. case XML_PARSER_ATTRIBUTE_VALUE:
  5007. xmlGenericError(xmlGenericErrorContext,
  5008. "HPP: try ATTRIBUTE_VALUE\n");break;
  5009. case XML_PARSER_DTD:
  5010. xmlGenericError(xmlGenericErrorContext,
  5011. "HPP: try DTD\n");break;
  5012. case XML_PARSER_EPILOG:
  5013. xmlGenericError(xmlGenericErrorContext,
  5014. "HPP: try EPILOG\n");break;
  5015. case XML_PARSER_PI:
  5016. xmlGenericError(xmlGenericErrorContext,
  5017. "HPP: try PI\n");break;
  5018. case XML_PARSER_SYSTEM_LITERAL:
  5019. xmlGenericError(xmlGenericErrorContext,
  5020. "HPP: try SYSTEM_LITERAL\n");break;
  5021. }
  5022. #endif
  5023. while (1) {
  5024. in = ctxt->input;
  5025. if (in == NULL) break;
  5026. if (in->buf == NULL)
  5027. avail = in->length - (in->cur - in->base);
  5028. else
  5029. avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
  5030. (in->cur - in->base);
  5031. if ((avail == 0) && (terminate)) {
  5032. htmlAutoCloseOnEnd(ctxt);
  5033. if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
  5034. /*
  5035. * SAX: end of the document processing.
  5036. */
  5037. ctxt->instate = XML_PARSER_EOF;
  5038. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  5039. ctxt->sax->endDocument(ctxt->userData);
  5040. }
  5041. }
  5042. if (avail < 1)
  5043. goto done;
  5044. /*
  5045. * This is done to make progress and avoid an infinite loop
  5046. * if a parsing attempt was aborted by hitting a NUL byte. After
  5047. * changing htmlCurrentChar, this probably isn't necessary anymore.
  5048. * We should consider removing this check.
  5049. */
  5050. cur = in->cur[0];
  5051. if (cur == 0) {
  5052. SKIP(1);
  5053. continue;
  5054. }
  5055. switch (ctxt->instate) {
  5056. case XML_PARSER_EOF:
  5057. /*
  5058. * Document parsing is done !
  5059. */
  5060. goto done;
  5061. case XML_PARSER_START:
  5062. /*
  5063. * Very first chars read from the document flow.
  5064. */
  5065. cur = in->cur[0];
  5066. if (IS_BLANK_CH(cur)) {
  5067. SKIP_BLANKS;
  5068. if (in->buf == NULL)
  5069. avail = in->length - (in->cur - in->base);
  5070. else
  5071. avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
  5072. (in->cur - in->base);
  5073. }
  5074. if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
  5075. ctxt->sax->setDocumentLocator(ctxt->userData,
  5076. &xmlDefaultSAXLocator);
  5077. if ((ctxt->sax) && (ctxt->sax->startDocument) &&
  5078. (!ctxt->disableSAX))
  5079. ctxt->sax->startDocument(ctxt->userData);
  5080. cur = in->cur[0];
  5081. next = in->cur[1];
  5082. if ((cur == '<') && (next == '!') &&
  5083. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  5084. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  5085. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  5086. (UPP(8) == 'E')) {
  5087. if ((!terminate) &&
  5088. (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
  5089. goto done;
  5090. #ifdef DEBUG_PUSH
  5091. xmlGenericError(xmlGenericErrorContext,
  5092. "HPP: Parsing internal subset\n");
  5093. #endif
  5094. htmlParseDocTypeDecl(ctxt);
  5095. ctxt->instate = XML_PARSER_PROLOG;
  5096. #ifdef DEBUG_PUSH
  5097. xmlGenericError(xmlGenericErrorContext,
  5098. "HPP: entering PROLOG\n");
  5099. #endif
  5100. } else {
  5101. ctxt->instate = XML_PARSER_MISC;
  5102. #ifdef DEBUG_PUSH
  5103. xmlGenericError(xmlGenericErrorContext,
  5104. "HPP: entering MISC\n");
  5105. #endif
  5106. }
  5107. break;
  5108. case XML_PARSER_MISC:
  5109. SKIP_BLANKS;
  5110. if (in->buf == NULL)
  5111. avail = in->length - (in->cur - in->base);
  5112. else
  5113. avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
  5114. (in->cur - in->base);
  5115. /*
  5116. * no chars in buffer
  5117. */
  5118. if (avail < 1)
  5119. goto done;
  5120. /*
  5121. * not enough chars in buffer
  5122. */
  5123. if (avail < 2) {
  5124. if (!terminate)
  5125. goto done;
  5126. else
  5127. next = ' ';
  5128. } else {
  5129. next = in->cur[1];
  5130. }
  5131. cur = in->cur[0];
  5132. if ((cur == '<') && (next == '!') &&
  5133. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  5134. if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
  5135. goto done;
  5136. #ifdef DEBUG_PUSH
  5137. xmlGenericError(xmlGenericErrorContext,
  5138. "HPP: Parsing Comment\n");
  5139. #endif
  5140. htmlParseComment(ctxt);
  5141. ctxt->instate = XML_PARSER_MISC;
  5142. } else if ((cur == '<') && (next == '?')) {
  5143. if ((!terminate) &&
  5144. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
  5145. goto done;
  5146. #ifdef DEBUG_PUSH
  5147. xmlGenericError(xmlGenericErrorContext,
  5148. "HPP: Parsing PI\n");
  5149. #endif
  5150. htmlParsePI(ctxt);
  5151. ctxt->instate = XML_PARSER_MISC;
  5152. } else if ((cur == '<') && (next == '!') &&
  5153. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  5154. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  5155. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  5156. (UPP(8) == 'E')) {
  5157. if ((!terminate) &&
  5158. (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
  5159. goto done;
  5160. #ifdef DEBUG_PUSH
  5161. xmlGenericError(xmlGenericErrorContext,
  5162. "HPP: Parsing internal subset\n");
  5163. #endif
  5164. htmlParseDocTypeDecl(ctxt);
  5165. ctxt->instate = XML_PARSER_PROLOG;
  5166. #ifdef DEBUG_PUSH
  5167. xmlGenericError(xmlGenericErrorContext,
  5168. "HPP: entering PROLOG\n");
  5169. #endif
  5170. } else if ((cur == '<') && (next == '!') &&
  5171. (avail < 9)) {
  5172. goto done;
  5173. } else {
  5174. ctxt->instate = XML_PARSER_CONTENT;
  5175. #ifdef DEBUG_PUSH
  5176. xmlGenericError(xmlGenericErrorContext,
  5177. "HPP: entering START_TAG\n");
  5178. #endif
  5179. }
  5180. break;
  5181. case XML_PARSER_PROLOG:
  5182. SKIP_BLANKS;
  5183. if (in->buf == NULL)
  5184. avail = in->length - (in->cur - in->base);
  5185. else
  5186. avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
  5187. (in->cur - in->base);
  5188. if (avail < 2)
  5189. goto done;
  5190. cur = in->cur[0];
  5191. next = in->cur[1];
  5192. if ((cur == '<') && (next == '!') &&
  5193. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  5194. if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
  5195. goto done;
  5196. #ifdef DEBUG_PUSH
  5197. xmlGenericError(xmlGenericErrorContext,
  5198. "HPP: Parsing Comment\n");
  5199. #endif
  5200. htmlParseComment(ctxt);
  5201. ctxt->instate = XML_PARSER_PROLOG;
  5202. } else if ((cur == '<') && (next == '?')) {
  5203. if ((!terminate) &&
  5204. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
  5205. goto done;
  5206. #ifdef DEBUG_PUSH
  5207. xmlGenericError(xmlGenericErrorContext,
  5208. "HPP: Parsing PI\n");
  5209. #endif
  5210. htmlParsePI(ctxt);
  5211. ctxt->instate = XML_PARSER_PROLOG;
  5212. } else if ((cur == '<') && (next == '!') &&
  5213. (avail < 4)) {
  5214. goto done;
  5215. } else {
  5216. ctxt->instate = XML_PARSER_CONTENT;
  5217. #ifdef DEBUG_PUSH
  5218. xmlGenericError(xmlGenericErrorContext,
  5219. "HPP: entering START_TAG\n");
  5220. #endif
  5221. }
  5222. break;
  5223. case XML_PARSER_EPILOG:
  5224. if (in->buf == NULL)
  5225. avail = in->length - (in->cur - in->base);
  5226. else
  5227. avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
  5228. (in->cur - in->base);
  5229. if (avail < 1)
  5230. goto done;
  5231. cur = in->cur[0];
  5232. if (IS_BLANK_CH(cur)) {
  5233. htmlParseCharData(ctxt);
  5234. goto done;
  5235. }
  5236. if (avail < 2)
  5237. goto done;
  5238. next = in->cur[1];
  5239. if ((cur == '<') && (next == '!') &&
  5240. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  5241. if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
  5242. goto done;
  5243. #ifdef DEBUG_PUSH
  5244. xmlGenericError(xmlGenericErrorContext,
  5245. "HPP: Parsing Comment\n");
  5246. #endif
  5247. htmlParseComment(ctxt);
  5248. ctxt->instate = XML_PARSER_EPILOG;
  5249. } else if ((cur == '<') && (next == '?')) {
  5250. if ((!terminate) &&
  5251. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
  5252. goto done;
  5253. #ifdef DEBUG_PUSH
  5254. xmlGenericError(xmlGenericErrorContext,
  5255. "HPP: Parsing PI\n");
  5256. #endif
  5257. htmlParsePI(ctxt);
  5258. ctxt->instate = XML_PARSER_EPILOG;
  5259. } else if ((cur == '<') && (next == '!') &&
  5260. (avail < 4)) {
  5261. goto done;
  5262. } else {
  5263. ctxt->errNo = XML_ERR_DOCUMENT_END;
  5264. ctxt->wellFormed = 0;
  5265. ctxt->instate = XML_PARSER_EOF;
  5266. #ifdef DEBUG_PUSH
  5267. xmlGenericError(xmlGenericErrorContext,
  5268. "HPP: entering EOF\n");
  5269. #endif
  5270. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  5271. ctxt->sax->endDocument(ctxt->userData);
  5272. goto done;
  5273. }
  5274. break;
  5275. case XML_PARSER_START_TAG: {
  5276. const xmlChar *name;
  5277. int failed;
  5278. const htmlElemDesc * info;
  5279. /*
  5280. * no chars in buffer
  5281. */
  5282. if (avail < 1)
  5283. goto done;
  5284. /*
  5285. * not enough chars in buffer
  5286. */
  5287. if (avail < 2) {
  5288. if (!terminate)
  5289. goto done;
  5290. else
  5291. next = ' ';
  5292. } else {
  5293. next = in->cur[1];
  5294. }
  5295. cur = in->cur[0];
  5296. if (cur != '<') {
  5297. ctxt->instate = XML_PARSER_CONTENT;
  5298. #ifdef DEBUG_PUSH
  5299. xmlGenericError(xmlGenericErrorContext,
  5300. "HPP: entering CONTENT\n");
  5301. #endif
  5302. break;
  5303. }
  5304. if (next == '/') {
  5305. ctxt->instate = XML_PARSER_END_TAG;
  5306. ctxt->checkIndex = 0;
  5307. #ifdef DEBUG_PUSH
  5308. xmlGenericError(xmlGenericErrorContext,
  5309. "HPP: entering END_TAG\n");
  5310. #endif
  5311. break;
  5312. }
  5313. if ((!terminate) &&
  5314. (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
  5315. goto done;
  5316. /* Capture start position */
  5317. if (ctxt->record_info) {
  5318. node_info.begin_pos = ctxt->input->consumed +
  5319. (CUR_PTR - ctxt->input->base);
  5320. node_info.begin_line = ctxt->input->line;
  5321. }
  5322. failed = htmlParseStartTag(ctxt);
  5323. name = ctxt->name;
  5324. if ((failed == -1) ||
  5325. (name == NULL)) {
  5326. if (CUR == '>')
  5327. NEXT;
  5328. break;
  5329. }
  5330. /*
  5331. * Lookup the info for that element.
  5332. */
  5333. info = htmlTagLookup(name);
  5334. if (info == NULL) {
  5335. htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
  5336. "Tag %s invalid\n", name, NULL);
  5337. }
  5338. /*
  5339. * Check for an Empty Element labeled the XML/SGML way
  5340. */
  5341. if ((CUR == '/') && (NXT(1) == '>')) {
  5342. SKIP(2);
  5343. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  5344. ctxt->sax->endElement(ctxt->userData, name);
  5345. htmlnamePop(ctxt);
  5346. ctxt->instate = XML_PARSER_CONTENT;
  5347. #ifdef DEBUG_PUSH
  5348. xmlGenericError(xmlGenericErrorContext,
  5349. "HPP: entering CONTENT\n");
  5350. #endif
  5351. break;
  5352. }
  5353. if (CUR == '>') {
  5354. NEXT;
  5355. } else {
  5356. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  5357. "Couldn't find end of Start Tag %s\n",
  5358. name, NULL);
  5359. /*
  5360. * end of parsing of this node.
  5361. */
  5362. if (xmlStrEqual(name, ctxt->name)) {
  5363. nodePop(ctxt);
  5364. htmlnamePop(ctxt);
  5365. }
  5366. if (ctxt->record_info)
  5367. htmlNodeInfoPush(ctxt, &node_info);
  5368. ctxt->instate = XML_PARSER_CONTENT;
  5369. #ifdef DEBUG_PUSH
  5370. xmlGenericError(xmlGenericErrorContext,
  5371. "HPP: entering CONTENT\n");
  5372. #endif
  5373. break;
  5374. }
  5375. /*
  5376. * Check for an Empty Element from DTD definition
  5377. */
  5378. if ((info != NULL) && (info->empty)) {
  5379. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  5380. ctxt->sax->endElement(ctxt->userData, name);
  5381. htmlnamePop(ctxt);
  5382. }
  5383. if (ctxt->record_info)
  5384. htmlNodeInfoPush(ctxt, &node_info);
  5385. ctxt->instate = XML_PARSER_CONTENT;
  5386. #ifdef DEBUG_PUSH
  5387. xmlGenericError(xmlGenericErrorContext,
  5388. "HPP: entering CONTENT\n");
  5389. #endif
  5390. break;
  5391. }
  5392. case XML_PARSER_CONTENT: {
  5393. xmlChar chr[2] = { 0, 0 };
  5394. /*
  5395. * Handle preparsed entities and charRef
  5396. */
  5397. if (ctxt->token != 0) {
  5398. chr[0] = (xmlChar) ctxt->token;
  5399. htmlCheckParagraph(ctxt);
  5400. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  5401. ctxt->sax->characters(ctxt->userData, chr, 1);
  5402. ctxt->token = 0;
  5403. ctxt->checkIndex = 0;
  5404. }
  5405. if ((avail == 1) && (terminate)) {
  5406. cur = in->cur[0];
  5407. if ((cur != '<') && (cur != '&')) {
  5408. if (ctxt->sax != NULL) {
  5409. chr[0] = cur;
  5410. if (IS_BLANK_CH(cur)) {
  5411. if (ctxt->keepBlanks) {
  5412. if (ctxt->sax->characters != NULL)
  5413. ctxt->sax->characters(
  5414. ctxt->userData, chr, 1);
  5415. } else {
  5416. if (ctxt->sax->ignorableWhitespace != NULL)
  5417. ctxt->sax->ignorableWhitespace(
  5418. ctxt->userData, chr, 1);
  5419. }
  5420. } else {
  5421. htmlCheckParagraph(ctxt);
  5422. if (ctxt->sax->characters != NULL)
  5423. ctxt->sax->characters(
  5424. ctxt->userData, chr, 1);
  5425. }
  5426. }
  5427. ctxt->token = 0;
  5428. ctxt->checkIndex = 0;
  5429. in->cur++;
  5430. break;
  5431. }
  5432. }
  5433. if (avail < 2)
  5434. goto done;
  5435. cur = in->cur[0];
  5436. next = in->cur[1];
  5437. if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
  5438. (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
  5439. /*
  5440. * Handle SCRIPT/STYLE separately
  5441. */
  5442. if (!terminate) {
  5443. int idx;
  5444. xmlChar val;
  5445. idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
  5446. if (idx < 0)
  5447. goto done;
  5448. val = in->cur[idx + 2];
  5449. if (val == 0) /* bad cut of input */
  5450. goto done;
  5451. }
  5452. htmlParseScript(ctxt);
  5453. if ((cur == '<') && (next == '/')) {
  5454. ctxt->instate = XML_PARSER_END_TAG;
  5455. ctxt->checkIndex = 0;
  5456. #ifdef DEBUG_PUSH
  5457. xmlGenericError(xmlGenericErrorContext,
  5458. "HPP: entering END_TAG\n");
  5459. #endif
  5460. break;
  5461. }
  5462. } else {
  5463. /*
  5464. * Sometimes DOCTYPE arrives in the middle of the document
  5465. */
  5466. if ((cur == '<') && (next == '!') &&
  5467. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  5468. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  5469. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  5470. (UPP(8) == 'E')) {
  5471. if ((!terminate) &&
  5472. (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
  5473. goto done;
  5474. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  5475. "Misplaced DOCTYPE declaration\n",
  5476. BAD_CAST "DOCTYPE" , NULL);
  5477. htmlParseDocTypeDecl(ctxt);
  5478. } else if ((cur == '<') && (next == '!') &&
  5479. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  5480. if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
  5481. goto done;
  5482. #ifdef DEBUG_PUSH
  5483. xmlGenericError(xmlGenericErrorContext,
  5484. "HPP: Parsing Comment\n");
  5485. #endif
  5486. htmlParseComment(ctxt);
  5487. ctxt->instate = XML_PARSER_CONTENT;
  5488. } else if ((cur == '<') && (next == '?')) {
  5489. if ((!terminate) &&
  5490. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
  5491. goto done;
  5492. #ifdef DEBUG_PUSH
  5493. xmlGenericError(xmlGenericErrorContext,
  5494. "HPP: Parsing PI\n");
  5495. #endif
  5496. htmlParsePI(ctxt);
  5497. ctxt->instate = XML_PARSER_CONTENT;
  5498. } else if ((cur == '<') && (next == '!') && (avail < 4)) {
  5499. goto done;
  5500. } else if ((cur == '<') && (next == '/')) {
  5501. ctxt->instate = XML_PARSER_END_TAG;
  5502. ctxt->checkIndex = 0;
  5503. #ifdef DEBUG_PUSH
  5504. xmlGenericError(xmlGenericErrorContext,
  5505. "HPP: entering END_TAG\n");
  5506. #endif
  5507. break;
  5508. } else if (cur == '<') {
  5509. if ((!terminate) && (next == 0))
  5510. goto done;
  5511. /*
  5512. * Only switch to START_TAG if the next character
  5513. * starts a valid name. Otherwise, htmlParseStartTag
  5514. * might return without consuming all characters
  5515. * up to the final '>'.
  5516. */
  5517. if ((IS_ASCII_LETTER(next)) ||
  5518. (next == '_') || (next == ':') || (next == '.')) {
  5519. ctxt->instate = XML_PARSER_START_TAG;
  5520. ctxt->checkIndex = 0;
  5521. #ifdef DEBUG_PUSH
  5522. xmlGenericError(xmlGenericErrorContext,
  5523. "HPP: entering START_TAG\n");
  5524. #endif
  5525. } else {
  5526. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  5527. "htmlParseTryOrFinish: "
  5528. "invalid element name\n",
  5529. NULL, NULL);
  5530. htmlCheckParagraph(ctxt);
  5531. if ((ctxt->sax != NULL) &&
  5532. (ctxt->sax->characters != NULL))
  5533. ctxt->sax->characters(ctxt->userData,
  5534. in->cur, 1);
  5535. NEXT;
  5536. }
  5537. break;
  5538. } else {
  5539. /*
  5540. * check that the text sequence is complete
  5541. * before handing out the data to the parser
  5542. * to avoid problems with erroneous end of
  5543. * data detection.
  5544. */
  5545. if ((!terminate) &&
  5546. (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
  5547. goto done;
  5548. ctxt->checkIndex = 0;
  5549. #ifdef DEBUG_PUSH
  5550. xmlGenericError(xmlGenericErrorContext,
  5551. "HPP: Parsing char data\n");
  5552. #endif
  5553. while ((ctxt->instate != XML_PARSER_EOF) &&
  5554. (cur != '<') && (in->cur < in->end)) {
  5555. if (cur == '&') {
  5556. htmlParseReference(ctxt);
  5557. } else {
  5558. htmlParseCharData(ctxt);
  5559. }
  5560. cur = in->cur[0];
  5561. }
  5562. }
  5563. }
  5564. break;
  5565. }
  5566. case XML_PARSER_END_TAG:
  5567. if (avail < 2)
  5568. goto done;
  5569. if ((!terminate) &&
  5570. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
  5571. goto done;
  5572. htmlParseEndTag(ctxt);
  5573. if (ctxt->nameNr == 0) {
  5574. ctxt->instate = XML_PARSER_EPILOG;
  5575. } else {
  5576. ctxt->instate = XML_PARSER_CONTENT;
  5577. }
  5578. ctxt->checkIndex = 0;
  5579. #ifdef DEBUG_PUSH
  5580. xmlGenericError(xmlGenericErrorContext,
  5581. "HPP: entering CONTENT\n");
  5582. #endif
  5583. break;
  5584. case XML_PARSER_CDATA_SECTION:
  5585. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5586. "HPP: internal error, state == CDATA\n",
  5587. NULL, NULL);
  5588. ctxt->instate = XML_PARSER_CONTENT;
  5589. ctxt->checkIndex = 0;
  5590. #ifdef DEBUG_PUSH
  5591. xmlGenericError(xmlGenericErrorContext,
  5592. "HPP: entering CONTENT\n");
  5593. #endif
  5594. break;
  5595. case XML_PARSER_DTD:
  5596. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5597. "HPP: internal error, state == DTD\n",
  5598. NULL, NULL);
  5599. ctxt->instate = XML_PARSER_CONTENT;
  5600. ctxt->checkIndex = 0;
  5601. #ifdef DEBUG_PUSH
  5602. xmlGenericError(xmlGenericErrorContext,
  5603. "HPP: entering CONTENT\n");
  5604. #endif
  5605. break;
  5606. case XML_PARSER_COMMENT:
  5607. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5608. "HPP: internal error, state == COMMENT\n",
  5609. NULL, NULL);
  5610. ctxt->instate = XML_PARSER_CONTENT;
  5611. ctxt->checkIndex = 0;
  5612. #ifdef DEBUG_PUSH
  5613. xmlGenericError(xmlGenericErrorContext,
  5614. "HPP: entering CONTENT\n");
  5615. #endif
  5616. break;
  5617. case XML_PARSER_PI:
  5618. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5619. "HPP: internal error, state == PI\n",
  5620. NULL, NULL);
  5621. ctxt->instate = XML_PARSER_CONTENT;
  5622. ctxt->checkIndex = 0;
  5623. #ifdef DEBUG_PUSH
  5624. xmlGenericError(xmlGenericErrorContext,
  5625. "HPP: entering CONTENT\n");
  5626. #endif
  5627. break;
  5628. case XML_PARSER_ENTITY_DECL:
  5629. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5630. "HPP: internal error, state == ENTITY_DECL\n",
  5631. NULL, NULL);
  5632. ctxt->instate = XML_PARSER_CONTENT;
  5633. ctxt->checkIndex = 0;
  5634. #ifdef DEBUG_PUSH
  5635. xmlGenericError(xmlGenericErrorContext,
  5636. "HPP: entering CONTENT\n");
  5637. #endif
  5638. break;
  5639. case XML_PARSER_ENTITY_VALUE:
  5640. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5641. "HPP: internal error, state == ENTITY_VALUE\n",
  5642. NULL, NULL);
  5643. ctxt->instate = XML_PARSER_CONTENT;
  5644. ctxt->checkIndex = 0;
  5645. #ifdef DEBUG_PUSH
  5646. xmlGenericError(xmlGenericErrorContext,
  5647. "HPP: entering DTD\n");
  5648. #endif
  5649. break;
  5650. case XML_PARSER_ATTRIBUTE_VALUE:
  5651. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5652. "HPP: internal error, state == ATTRIBUTE_VALUE\n",
  5653. NULL, NULL);
  5654. ctxt->instate = XML_PARSER_START_TAG;
  5655. ctxt->checkIndex = 0;
  5656. #ifdef DEBUG_PUSH
  5657. xmlGenericError(xmlGenericErrorContext,
  5658. "HPP: entering START_TAG\n");
  5659. #endif
  5660. break;
  5661. case XML_PARSER_SYSTEM_LITERAL:
  5662. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5663. "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
  5664. NULL, NULL);
  5665. ctxt->instate = XML_PARSER_CONTENT;
  5666. ctxt->checkIndex = 0;
  5667. #ifdef DEBUG_PUSH
  5668. xmlGenericError(xmlGenericErrorContext,
  5669. "HPP: entering CONTENT\n");
  5670. #endif
  5671. break;
  5672. case XML_PARSER_IGNORE:
  5673. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5674. "HPP: internal error, state == XML_PARSER_IGNORE\n",
  5675. NULL, NULL);
  5676. ctxt->instate = XML_PARSER_CONTENT;
  5677. ctxt->checkIndex = 0;
  5678. #ifdef DEBUG_PUSH
  5679. xmlGenericError(xmlGenericErrorContext,
  5680. "HPP: entering CONTENT\n");
  5681. #endif
  5682. break;
  5683. case XML_PARSER_PUBLIC_LITERAL:
  5684. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5685. "HPP: internal error, state == XML_PARSER_LITERAL\n",
  5686. NULL, NULL);
  5687. ctxt->instate = XML_PARSER_CONTENT;
  5688. ctxt->checkIndex = 0;
  5689. #ifdef DEBUG_PUSH
  5690. xmlGenericError(xmlGenericErrorContext,
  5691. "HPP: entering CONTENT\n");
  5692. #endif
  5693. break;
  5694. }
  5695. }
  5696. done:
  5697. if ((avail == 0) && (terminate)) {
  5698. htmlAutoCloseOnEnd(ctxt);
  5699. if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
  5700. /*
  5701. * SAX: end of the document processing.
  5702. */
  5703. ctxt->instate = XML_PARSER_EOF;
  5704. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  5705. ctxt->sax->endDocument(ctxt->userData);
  5706. }
  5707. }
  5708. if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
  5709. ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
  5710. (ctxt->instate == XML_PARSER_EPILOG))) {
  5711. xmlDtdPtr dtd;
  5712. dtd = xmlGetIntSubset(ctxt->myDoc);
  5713. if (dtd == NULL)
  5714. ctxt->myDoc->intSubset =
  5715. xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
  5716. BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
  5717. BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
  5718. }
  5719. #ifdef DEBUG_PUSH
  5720. xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
  5721. #endif
  5722. return(ret);
  5723. }
  5724. /**
  5725. * htmlParseChunk:
  5726. * @ctxt: an HTML parser context
  5727. * @chunk: an char array
  5728. * @size: the size in byte of the chunk
  5729. * @terminate: last chunk indicator
  5730. *
  5731. * Parse a Chunk of memory
  5732. *
  5733. * Returns zero if no error, the xmlParserErrors otherwise.
  5734. */
  5735. int
  5736. htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
  5737. int terminate) {
  5738. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  5739. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5740. "htmlParseChunk: context error\n", NULL, NULL);
  5741. return(XML_ERR_INTERNAL_ERROR);
  5742. }
  5743. if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
  5744. (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
  5745. size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
  5746. size_t cur = ctxt->input->cur - ctxt->input->base;
  5747. int res;
  5748. res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
  5749. xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
  5750. if (res < 0) {
  5751. ctxt->errNo = XML_PARSER_EOF;
  5752. ctxt->disableSAX = 1;
  5753. return (XML_PARSER_EOF);
  5754. }
  5755. #ifdef DEBUG_PUSH
  5756. xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
  5757. #endif
  5758. #if 0
  5759. if ((terminate) || (ctxt->input->buf->buffer->use > 80))
  5760. htmlParseTryOrFinish(ctxt, terminate);
  5761. #endif
  5762. } else if (ctxt->instate != XML_PARSER_EOF) {
  5763. if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
  5764. xmlParserInputBufferPtr in = ctxt->input->buf;
  5765. if ((in->encoder != NULL) && (in->buffer != NULL) &&
  5766. (in->raw != NULL)) {
  5767. int nbchars;
  5768. size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
  5769. size_t current = ctxt->input->cur - ctxt->input->base;
  5770. nbchars = xmlCharEncInput(in, terminate);
  5771. xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
  5772. if (nbchars < 0) {
  5773. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  5774. "encoder error\n", NULL, NULL);
  5775. return(XML_ERR_INVALID_ENCODING);
  5776. }
  5777. }
  5778. }
  5779. }
  5780. htmlParseTryOrFinish(ctxt, terminate);
  5781. if (terminate) {
  5782. if ((ctxt->instate != XML_PARSER_EOF) &&
  5783. (ctxt->instate != XML_PARSER_EPILOG) &&
  5784. (ctxt->instate != XML_PARSER_MISC)) {
  5785. ctxt->errNo = XML_ERR_DOCUMENT_END;
  5786. ctxt->wellFormed = 0;
  5787. }
  5788. if (ctxt->instate != XML_PARSER_EOF) {
  5789. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  5790. ctxt->sax->endDocument(ctxt->userData);
  5791. }
  5792. ctxt->instate = XML_PARSER_EOF;
  5793. }
  5794. return((xmlParserErrors) ctxt->errNo);
  5795. }
  5796. /************************************************************************
  5797. * *
  5798. * User entry points *
  5799. * *
  5800. ************************************************************************/
  5801. /**
  5802. * htmlCreatePushParserCtxt:
  5803. * @sax: a SAX handler
  5804. * @user_data: The user data returned on SAX callbacks
  5805. * @chunk: a pointer to an array of chars
  5806. * @size: number of chars in the array
  5807. * @filename: an optional file name or URI
  5808. * @enc: an optional encoding
  5809. *
  5810. * Create a parser context for using the HTML parser in push mode
  5811. * The value of @filename is used for fetching external entities
  5812. * and error/warning reports.
  5813. *
  5814. * Returns the new parser context or NULL
  5815. */
  5816. htmlParserCtxtPtr
  5817. htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
  5818. const char *chunk, int size, const char *filename,
  5819. xmlCharEncoding enc) {
  5820. htmlParserCtxtPtr ctxt;
  5821. htmlParserInputPtr inputStream;
  5822. xmlParserInputBufferPtr buf;
  5823. xmlInitParser();
  5824. buf = xmlAllocParserInputBuffer(enc);
  5825. if (buf == NULL) return(NULL);
  5826. ctxt = htmlNewParserCtxt();
  5827. if (ctxt == NULL) {
  5828. xmlFreeParserInputBuffer(buf);
  5829. return(NULL);
  5830. }
  5831. if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
  5832. ctxt->charset=XML_CHAR_ENCODING_UTF8;
  5833. if (sax != NULL) {
  5834. if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
  5835. xmlFree(ctxt->sax);
  5836. ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
  5837. if (ctxt->sax == NULL) {
  5838. xmlFree(buf);
  5839. xmlFree(ctxt);
  5840. return(NULL);
  5841. }
  5842. memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
  5843. if (user_data != NULL)
  5844. ctxt->userData = user_data;
  5845. }
  5846. if (filename == NULL) {
  5847. ctxt->directory = NULL;
  5848. } else {
  5849. ctxt->directory = xmlParserGetDirectory(filename);
  5850. }
  5851. inputStream = htmlNewInputStream(ctxt);
  5852. if (inputStream == NULL) {
  5853. xmlFreeParserCtxt(ctxt);
  5854. xmlFree(buf);
  5855. return(NULL);
  5856. }
  5857. if (filename == NULL)
  5858. inputStream->filename = NULL;
  5859. else
  5860. inputStream->filename = (char *)
  5861. xmlCanonicPath((const xmlChar *) filename);
  5862. inputStream->buf = buf;
  5863. xmlBufResetInput(buf->buffer, inputStream);
  5864. inputPush(ctxt, inputStream);
  5865. if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
  5866. (ctxt->input->buf != NULL)) {
  5867. size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
  5868. size_t cur = ctxt->input->cur - ctxt->input->base;
  5869. xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
  5870. xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
  5871. #ifdef DEBUG_PUSH
  5872. xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
  5873. #endif
  5874. }
  5875. ctxt->progressive = 1;
  5876. return(ctxt);
  5877. }
  5878. #endif /* LIBXML_PUSH_ENABLED */
  5879. /**
  5880. * htmlSAXParseDoc:
  5881. * @cur: a pointer to an array of xmlChar
  5882. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5883. * @sax: the SAX handler block
  5884. * @userData: if using SAX, this pointer will be provided on callbacks.
  5885. *
  5886. * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
  5887. * to handle parse events. If sax is NULL, fallback to the default DOM
  5888. * behavior and return a tree.
  5889. *
  5890. * Returns the resulting document tree unless SAX is NULL or the document is
  5891. * not well formed.
  5892. */
  5893. htmlDocPtr
  5894. htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
  5895. htmlSAXHandlerPtr sax, void *userData) {
  5896. htmlDocPtr ret;
  5897. htmlParserCtxtPtr ctxt;
  5898. xmlInitParser();
  5899. if (cur == NULL) return(NULL);
  5900. ctxt = htmlCreateDocParserCtxt(cur, encoding);
  5901. if (ctxt == NULL) return(NULL);
  5902. if (sax != NULL) {
  5903. if (ctxt->sax != NULL) xmlFree (ctxt->sax);
  5904. ctxt->sax = sax;
  5905. ctxt->userData = userData;
  5906. }
  5907. htmlParseDocument(ctxt);
  5908. ret = ctxt->myDoc;
  5909. if (sax != NULL) {
  5910. ctxt->sax = NULL;
  5911. ctxt->userData = NULL;
  5912. }
  5913. htmlFreeParserCtxt(ctxt);
  5914. return(ret);
  5915. }
  5916. /**
  5917. * htmlParseDoc:
  5918. * @cur: a pointer to an array of xmlChar
  5919. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5920. *
  5921. * parse an HTML in-memory document and build a tree.
  5922. *
  5923. * Returns the resulting document tree
  5924. */
  5925. htmlDocPtr
  5926. htmlParseDoc(const xmlChar *cur, const char *encoding) {
  5927. return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
  5928. }
  5929. /**
  5930. * htmlCreateFileParserCtxt:
  5931. * @filename: the filename
  5932. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5933. *
  5934. * Create a parser context for a file content.
  5935. * Automatic support for ZLIB/Compress compressed document is provided
  5936. * by default if found at compile-time.
  5937. *
  5938. * Returns the new parser context or NULL
  5939. */
  5940. htmlParserCtxtPtr
  5941. htmlCreateFileParserCtxt(const char *filename, const char *encoding)
  5942. {
  5943. htmlParserCtxtPtr ctxt;
  5944. htmlParserInputPtr inputStream;
  5945. char *canonicFilename;
  5946. /* htmlCharEncoding enc; */
  5947. xmlChar *content, *content_line = (xmlChar *) "charset=";
  5948. if (filename == NULL)
  5949. return(NULL);
  5950. ctxt = htmlNewParserCtxt();
  5951. if (ctxt == NULL) {
  5952. return(NULL);
  5953. }
  5954. canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
  5955. if (canonicFilename == NULL) {
  5956. #ifdef LIBXML_SAX1_ENABLED
  5957. if (xmlDefaultSAXHandler.error != NULL) {
  5958. xmlDefaultSAXHandler.error(NULL, "out of memory\n");
  5959. }
  5960. #endif
  5961. xmlFreeParserCtxt(ctxt);
  5962. return(NULL);
  5963. }
  5964. inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
  5965. xmlFree(canonicFilename);
  5966. if (inputStream == NULL) {
  5967. xmlFreeParserCtxt(ctxt);
  5968. return(NULL);
  5969. }
  5970. inputPush(ctxt, inputStream);
  5971. /* set encoding */
  5972. if (encoding) {
  5973. size_t l = strlen(encoding);
  5974. if (l < 1000) {
  5975. content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
  5976. if (content) {
  5977. strcpy ((char *)content, (char *)content_line);
  5978. strcat ((char *)content, (char *)encoding);
  5979. htmlCheckEncoding (ctxt, content);
  5980. xmlFree (content);
  5981. }
  5982. }
  5983. }
  5984. return(ctxt);
  5985. }
  5986. /**
  5987. * htmlSAXParseFile:
  5988. * @filename: the filename
  5989. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5990. * @sax: the SAX handler block
  5991. * @userData: if using SAX, this pointer will be provided on callbacks.
  5992. *
  5993. * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
  5994. * compressed document is provided by default if found at compile-time.
  5995. * It use the given SAX function block to handle the parsing callback.
  5996. * If sax is NULL, fallback to the default DOM tree building routines.
  5997. *
  5998. * Returns the resulting document tree unless SAX is NULL or the document is
  5999. * not well formed.
  6000. */
  6001. htmlDocPtr
  6002. htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
  6003. void *userData) {
  6004. htmlDocPtr ret;
  6005. htmlParserCtxtPtr ctxt;
  6006. htmlSAXHandlerPtr oldsax = NULL;
  6007. xmlInitParser();
  6008. ctxt = htmlCreateFileParserCtxt(filename, encoding);
  6009. if (ctxt == NULL) return(NULL);
  6010. if (sax != NULL) {
  6011. oldsax = ctxt->sax;
  6012. ctxt->sax = sax;
  6013. ctxt->userData = userData;
  6014. }
  6015. htmlParseDocument(ctxt);
  6016. ret = ctxt->myDoc;
  6017. if (sax != NULL) {
  6018. ctxt->sax = oldsax;
  6019. ctxt->userData = NULL;
  6020. }
  6021. htmlFreeParserCtxt(ctxt);
  6022. return(ret);
  6023. }
  6024. /**
  6025. * htmlParseFile:
  6026. * @filename: the filename
  6027. * @encoding: a free form C string describing the HTML document encoding, or NULL
  6028. *
  6029. * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
  6030. * compressed document is provided by default if found at compile-time.
  6031. *
  6032. * Returns the resulting document tree
  6033. */
  6034. htmlDocPtr
  6035. htmlParseFile(const char *filename, const char *encoding) {
  6036. return(htmlSAXParseFile(filename, encoding, NULL, NULL));
  6037. }
  6038. /**
  6039. * htmlHandleOmittedElem:
  6040. * @val: int 0 or 1
  6041. *
  6042. * Set and return the previous value for handling HTML omitted tags.
  6043. *
  6044. * Returns the last value for 0 for no handling, 1 for auto insertion.
  6045. */
  6046. int
  6047. htmlHandleOmittedElem(int val) {
  6048. int old = htmlOmittedDefaultValue;
  6049. htmlOmittedDefaultValue = val;
  6050. return(old);
  6051. }
  6052. /**
  6053. * htmlElementAllowedHere:
  6054. * @parent: HTML parent element
  6055. * @elt: HTML element
  6056. *
  6057. * Checks whether an HTML element may be a direct child of a parent element.
  6058. * Note - doesn't check for deprecated elements
  6059. *
  6060. * Returns 1 if allowed; 0 otherwise.
  6061. */
  6062. int
  6063. htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
  6064. const char** p ;
  6065. if ( ! elt || ! parent || ! parent->subelts )
  6066. return 0 ;
  6067. for ( p = parent->subelts; *p; ++p )
  6068. if ( !xmlStrcmp((const xmlChar *)*p, elt) )
  6069. return 1 ;
  6070. return 0 ;
  6071. }
  6072. /**
  6073. * htmlElementStatusHere:
  6074. * @parent: HTML parent element
  6075. * @elt: HTML element
  6076. *
  6077. * Checks whether an HTML element may be a direct child of a parent element.
  6078. * and if so whether it is valid or deprecated.
  6079. *
  6080. * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
  6081. */
  6082. htmlStatus
  6083. htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
  6084. if ( ! parent || ! elt )
  6085. return HTML_INVALID ;
  6086. if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
  6087. return HTML_INVALID ;
  6088. return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
  6089. }
  6090. /**
  6091. * htmlAttrAllowed:
  6092. * @elt: HTML element
  6093. * @attr: HTML attribute
  6094. * @legacy: whether to allow deprecated attributes
  6095. *
  6096. * Checks whether an attribute is valid for an element
  6097. * Has full knowledge of Required and Deprecated attributes
  6098. *
  6099. * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
  6100. */
  6101. htmlStatus
  6102. htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
  6103. const char** p ;
  6104. if ( !elt || ! attr )
  6105. return HTML_INVALID ;
  6106. if ( elt->attrs_req )
  6107. for ( p = elt->attrs_req; *p; ++p)
  6108. if ( !xmlStrcmp((const xmlChar*)*p, attr) )
  6109. return HTML_REQUIRED ;
  6110. if ( elt->attrs_opt )
  6111. for ( p = elt->attrs_opt; *p; ++p)
  6112. if ( !xmlStrcmp((const xmlChar*)*p, attr) )
  6113. return HTML_VALID ;
  6114. if ( legacy && elt->attrs_depr )
  6115. for ( p = elt->attrs_depr; *p; ++p)
  6116. if ( !xmlStrcmp((const xmlChar*)*p, attr) )
  6117. return HTML_DEPRECATED ;
  6118. return HTML_INVALID ;
  6119. }
  6120. /**
  6121. * htmlNodeStatus:
  6122. * @node: an htmlNodePtr in a tree
  6123. * @legacy: whether to allow deprecated elements (YES is faster here
  6124. * for Element nodes)
  6125. *
  6126. * Checks whether the tree node is valid. Experimental (the author
  6127. * only uses the HTML enhancements in a SAX parser)
  6128. *
  6129. * Return: for Element nodes, a return from htmlElementAllowedHere (if
  6130. * legacy allowed) or htmlElementStatusHere (otherwise).
  6131. * for Attribute nodes, a return from htmlAttrAllowed
  6132. * for other nodes, HTML_NA (no checks performed)
  6133. */
  6134. htmlStatus
  6135. htmlNodeStatus(const htmlNodePtr node, int legacy) {
  6136. if ( ! node )
  6137. return HTML_INVALID ;
  6138. switch ( node->type ) {
  6139. case XML_ELEMENT_NODE:
  6140. return legacy
  6141. ? ( htmlElementAllowedHere (
  6142. htmlTagLookup(node->parent->name) , node->name
  6143. ) ? HTML_VALID : HTML_INVALID )
  6144. : htmlElementStatusHere(
  6145. htmlTagLookup(node->parent->name) ,
  6146. htmlTagLookup(node->name) )
  6147. ;
  6148. case XML_ATTRIBUTE_NODE:
  6149. return htmlAttrAllowed(
  6150. htmlTagLookup(node->parent->name) , node->name, legacy) ;
  6151. default: return HTML_NA ;
  6152. }
  6153. }
  6154. /************************************************************************
  6155. * *
  6156. * New set (2.6.0) of simpler and more flexible APIs *
  6157. * *
  6158. ************************************************************************/
  /**
   * DICT_FREE:
   * @str:  a string
   *
   * Release @str with xmlFree() unless it is NULL or owned by the "dict"
   * dictionary of the current scope; a variable named dict must therefore
   * be visible where the macro is used.
   *
   * The expansion is a bare if statement that already ends with a
   * semicolon, so it must not be used as the unbraced body of an if/else.
   */
  #define DICT_FREE(str)						\
      if (((str) != NULL) &&					\
          ((dict == NULL) ||					\
           (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
          xmlFree((char *)(str));
  6170. /**
  6171. * htmlCtxtReset:
  6172. * @ctxt: an HTML parser context
  6173. *
  6174. * Reset a parser context
  6175. */
  6176. void
  6177. htmlCtxtReset(htmlParserCtxtPtr ctxt)
  6178. {
  6179. xmlParserInputPtr input;
  6180. xmlDictPtr dict;
  6181. if (ctxt == NULL)
  6182. return;
  6183. xmlInitParser();
  6184. dict = ctxt->dict;
  6185. while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
  6186. xmlFreeInputStream(input);
  6187. }
  6188. ctxt->inputNr = 0;
  6189. ctxt->input = NULL;
  6190. ctxt->spaceNr = 0;
  6191. if (ctxt->spaceTab != NULL) {
  6192. ctxt->spaceTab[0] = -1;
  6193. ctxt->space = &ctxt->spaceTab[0];
  6194. } else {
  6195. ctxt->space = NULL;
  6196. }
  6197. ctxt->nodeNr = 0;
  6198. ctxt->node = NULL;
  6199. ctxt->nameNr = 0;
  6200. ctxt->name = NULL;
  6201. DICT_FREE(ctxt->version);
  6202. ctxt->version = NULL;
  6203. DICT_FREE(ctxt->encoding);
  6204. ctxt->encoding = NULL;
  6205. DICT_FREE(ctxt->directory);
  6206. ctxt->directory = NULL;
  6207. DICT_FREE(ctxt->extSubURI);
  6208. ctxt->extSubURI = NULL;
  6209. DICT_FREE(ctxt->extSubSystem);
  6210. ctxt->extSubSystem = NULL;
  6211. if (ctxt->myDoc != NULL)
  6212. xmlFreeDoc(ctxt->myDoc);
  6213. ctxt->myDoc = NULL;
  6214. ctxt->standalone = -1;
  6215. ctxt->hasExternalSubset = 0;
  6216. ctxt->hasPErefs = 0;
  6217. ctxt->html = 1;
  6218. ctxt->external = 0;
  6219. ctxt->instate = XML_PARSER_START;
  6220. ctxt->token = 0;
  6221. ctxt->wellFormed = 1;
  6222. ctxt->nsWellFormed = 1;
  6223. ctxt->disableSAX = 0;
  6224. ctxt->valid = 1;
  6225. ctxt->vctxt.userData = ctxt;
  6226. ctxt->vctxt.error = xmlParserValidityError;
  6227. ctxt->vctxt.warning = xmlParserValidityWarning;
  6228. ctxt->record_info = 0;
  6229. ctxt->checkIndex = 0;
  6230. ctxt->inSubset = 0;
  6231. ctxt->errNo = XML_ERR_OK;
  6232. ctxt->depth = 0;
  6233. ctxt->charset = XML_CHAR_ENCODING_NONE;
  6234. ctxt->catalogs = NULL;
  6235. xmlInitNodeInfoSeq(&ctxt->node_seq);
  6236. if (ctxt->attsDefault != NULL) {
  6237. xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
  6238. ctxt->attsDefault = NULL;
  6239. }
  6240. if (ctxt->attsSpecial != NULL) {
  6241. xmlHashFree(ctxt->attsSpecial, NULL);
  6242. ctxt->attsSpecial = NULL;
  6243. }
  6244. }
  6245. /**
  6246. * htmlCtxtUseOptions:
  6247. * @ctxt: an HTML parser context
  6248. * @options: a combination of htmlParserOption(s)
  6249. *
  6250. * Applies the options to the parser context
  6251. *
  6252. * Returns 0 in case of success, the set of unknown or unimplemented options
  6253. * in case of error.
  6254. */
  6255. int
  6256. htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
  6257. {
  6258. if (ctxt == NULL)
  6259. return(-1);
  6260. if (options & HTML_PARSE_NOWARNING) {
  6261. ctxt->sax->warning = NULL;
  6262. ctxt->vctxt.warning = NULL;
  6263. options -= XML_PARSE_NOWARNING;
  6264. ctxt->options |= XML_PARSE_NOWARNING;
  6265. }
  6266. if (options & HTML_PARSE_NOERROR) {
  6267. ctxt->sax->error = NULL;
  6268. ctxt->vctxt.error = NULL;
  6269. ctxt->sax->fatalError = NULL;
  6270. options -= XML_PARSE_NOERROR;
  6271. ctxt->options |= XML_PARSE_NOERROR;
  6272. }
  6273. if (options & HTML_PARSE_PEDANTIC) {
  6274. ctxt->pedantic = 1;
  6275. options -= XML_PARSE_PEDANTIC;
  6276. ctxt->options |= XML_PARSE_PEDANTIC;
  6277. } else
  6278. ctxt->pedantic = 0;
  6279. if (options & XML_PARSE_NOBLANKS) {
  6280. ctxt->keepBlanks = 0;
  6281. ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
  6282. options -= XML_PARSE_NOBLANKS;
  6283. ctxt->options |= XML_PARSE_NOBLANKS;
  6284. } else
  6285. ctxt->keepBlanks = 1;
  6286. if (options & HTML_PARSE_RECOVER) {
  6287. ctxt->recovery = 1;
  6288. options -= HTML_PARSE_RECOVER;
  6289. } else
  6290. ctxt->recovery = 0;
  6291. if (options & HTML_PARSE_COMPACT) {
  6292. ctxt->options |= HTML_PARSE_COMPACT;
  6293. options -= HTML_PARSE_COMPACT;
  6294. }
  6295. if (options & XML_PARSE_HUGE) {
  6296. ctxt->options |= XML_PARSE_HUGE;
  6297. options -= XML_PARSE_HUGE;
  6298. }
  6299. if (options & HTML_PARSE_NODEFDTD) {
  6300. ctxt->options |= HTML_PARSE_NODEFDTD;
  6301. options -= HTML_PARSE_NODEFDTD;
  6302. }
  6303. if (options & HTML_PARSE_IGNORE_ENC) {
  6304. ctxt->options |= HTML_PARSE_IGNORE_ENC;
  6305. options -= HTML_PARSE_IGNORE_ENC;
  6306. }
  6307. if (options & HTML_PARSE_NOIMPLIED) {
  6308. ctxt->options |= HTML_PARSE_NOIMPLIED;
  6309. options -= HTML_PARSE_NOIMPLIED;
  6310. }
  6311. ctxt->dictNames = 0;
  6312. return (options);
  6313. }
  6314. /**
  6315. * htmlDoRead:
  6316. * @ctxt: an HTML parser context
  6317. * @URL: the base URL to use for the document
  6318. * @encoding: the document encoding, or NULL
  6319. * @options: a combination of htmlParserOption(s)
  6320. * @reuse: keep the context for reuse
  6321. *
  6322. * Common front-end for the htmlRead functions
  6323. *
  6324. * Returns the resulting document tree or NULL
  6325. */
  6326. static htmlDocPtr
  6327. htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
  6328. int options, int reuse)
  6329. {
  6330. htmlDocPtr ret;
  6331. htmlCtxtUseOptions(ctxt, options);
  6332. ctxt->html = 1;
  6333. if (encoding != NULL) {
  6334. xmlCharEncodingHandlerPtr hdlr;
  6335. hdlr = xmlFindCharEncodingHandler(encoding);
  6336. if (hdlr != NULL) {
  6337. xmlSwitchToEncoding(ctxt, hdlr);
  6338. if (ctxt->input->encoding != NULL)
  6339. xmlFree((xmlChar *) ctxt->input->encoding);
  6340. ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
  6341. }
  6342. }
  6343. if ((URL != NULL) && (ctxt->input != NULL) &&
  6344. (ctxt->input->filename == NULL))
  6345. ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
  6346. htmlParseDocument(ctxt);
  6347. ret = ctxt->myDoc;
  6348. ctxt->myDoc = NULL;
  6349. if (!reuse) {
  6350. if ((ctxt->dictNames) &&
  6351. (ret != NULL) &&
  6352. (ret->dict == ctxt->dict))
  6353. ctxt->dict = NULL;
  6354. xmlFreeParserCtxt(ctxt);
  6355. }
  6356. return (ret);
  6357. }
  6358. /**
  6359. * htmlReadDoc:
  6360. * @cur: a pointer to a zero terminated string
  6361. * @URL: the base URL to use for the document
  6362. * @encoding: the document encoding, or NULL
  6363. * @options: a combination of htmlParserOption(s)
  6364. *
  6365. * parse an XML in-memory document and build a tree.
  6366. *
  6367. * Returns the resulting document tree
  6368. */
  6369. htmlDocPtr
  6370. htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
  6371. {
  6372. htmlParserCtxtPtr ctxt;
  6373. if (cur == NULL)
  6374. return (NULL);
  6375. xmlInitParser();
  6376. ctxt = htmlCreateDocParserCtxt(cur, NULL);
  6377. if (ctxt == NULL)
  6378. return (NULL);
  6379. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  6380. }
  6381. /**
  6382. * htmlReadFile:
  6383. * @filename: a file or URL
  6384. * @encoding: the document encoding, or NULL
  6385. * @options: a combination of htmlParserOption(s)
  6386. *
  6387. * parse an XML file from the filesystem or the network.
  6388. *
  6389. * Returns the resulting document tree
  6390. */
  6391. htmlDocPtr
  6392. htmlReadFile(const char *filename, const char *encoding, int options)
  6393. {
  6394. htmlParserCtxtPtr ctxt;
  6395. xmlInitParser();
  6396. ctxt = htmlCreateFileParserCtxt(filename, encoding);
  6397. if (ctxt == NULL)
  6398. return (NULL);
  6399. return (htmlDoRead(ctxt, NULL, NULL, options, 0));
  6400. }
  6401. /**
  6402. * htmlReadMemory:
  6403. * @buffer: a pointer to a char array
  6404. * @size: the size of the array
  6405. * @URL: the base URL to use for the document
  6406. * @encoding: the document encoding, or NULL
  6407. * @options: a combination of htmlParserOption(s)
  6408. *
  6409. * parse an XML in-memory document and build a tree.
  6410. *
  6411. * Returns the resulting document tree
  6412. */
  6413. htmlDocPtr
  6414. htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
  6415. {
  6416. htmlParserCtxtPtr ctxt;
  6417. xmlInitParser();
  6418. ctxt = xmlCreateMemoryParserCtxt(buffer, size);
  6419. if (ctxt == NULL)
  6420. return (NULL);
  6421. htmlDefaultSAXHandlerInit();
  6422. if (ctxt->sax != NULL)
  6423. memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
  6424. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  6425. }
  6426. /**
  6427. * htmlReadFd:
  6428. * @fd: an open file descriptor
  6429. * @URL: the base URL to use for the document
  6430. * @encoding: the document encoding, or NULL
  6431. * @options: a combination of htmlParserOption(s)
  6432. *
  6433. * parse an XML from a file descriptor and build a tree.
  6434. *
  6435. * Returns the resulting document tree
  6436. */
  6437. htmlDocPtr
  6438. htmlReadFd(int fd, const char *URL, const char *encoding, int options)
  6439. {
  6440. htmlParserCtxtPtr ctxt;
  6441. xmlParserInputBufferPtr input;
  6442. xmlParserInputPtr stream;
  6443. if (fd < 0)
  6444. return (NULL);
  6445. xmlInitParser();
  6446. xmlInitParser();
  6447. input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
  6448. if (input == NULL)
  6449. return (NULL);
  6450. ctxt = xmlNewParserCtxt();
  6451. if (ctxt == NULL) {
  6452. xmlFreeParserInputBuffer(input);
  6453. return (NULL);
  6454. }
  6455. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6456. if (stream == NULL) {
  6457. xmlFreeParserInputBuffer(input);
  6458. xmlFreeParserCtxt(ctxt);
  6459. return (NULL);
  6460. }
  6461. inputPush(ctxt, stream);
  6462. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  6463. }
  6464. /**
  6465. * htmlReadIO:
  6466. * @ioread: an I/O read function
  6467. * @ioclose: an I/O close function
  6468. * @ioctx: an I/O handler
  6469. * @URL: the base URL to use for the document
  6470. * @encoding: the document encoding, or NULL
  6471. * @options: a combination of htmlParserOption(s)
  6472. *
  6473. * parse an HTML document from I/O functions and source and build a tree.
  6474. *
  6475. * Returns the resulting document tree
  6476. */
  6477. htmlDocPtr
  6478. htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
  6479. void *ioctx, const char *URL, const char *encoding, int options)
  6480. {
  6481. htmlParserCtxtPtr ctxt;
  6482. xmlParserInputBufferPtr input;
  6483. xmlParserInputPtr stream;
  6484. if (ioread == NULL)
  6485. return (NULL);
  6486. xmlInitParser();
  6487. input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
  6488. XML_CHAR_ENCODING_NONE);
  6489. if (input == NULL) {
  6490. if (ioclose != NULL)
  6491. ioclose(ioctx);
  6492. return (NULL);
  6493. }
  6494. ctxt = htmlNewParserCtxt();
  6495. if (ctxt == NULL) {
  6496. xmlFreeParserInputBuffer(input);
  6497. return (NULL);
  6498. }
  6499. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6500. if (stream == NULL) {
  6501. xmlFreeParserInputBuffer(input);
  6502. xmlFreeParserCtxt(ctxt);
  6503. return (NULL);
  6504. }
  6505. inputPush(ctxt, stream);
  6506. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  6507. }
  6508. /**
  6509. * htmlCtxtReadDoc:
  6510. * @ctxt: an HTML parser context
  6511. * @cur: a pointer to a zero terminated string
  6512. * @URL: the base URL to use for the document
  6513. * @encoding: the document encoding, or NULL
  6514. * @options: a combination of htmlParserOption(s)
  6515. *
  6516. * parse an XML in-memory document and build a tree.
  6517. * This reuses the existing @ctxt parser context
  6518. *
  6519. * Returns the resulting document tree
  6520. */
  6521. htmlDocPtr
  6522. htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
  6523. const char *URL, const char *encoding, int options)
  6524. {
  6525. xmlParserInputPtr stream;
  6526. if (cur == NULL)
  6527. return (NULL);
  6528. if (ctxt == NULL)
  6529. return (NULL);
  6530. xmlInitParser();
  6531. htmlCtxtReset(ctxt);
  6532. stream = xmlNewStringInputStream(ctxt, cur);
  6533. if (stream == NULL) {
  6534. return (NULL);
  6535. }
  6536. inputPush(ctxt, stream);
  6537. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  6538. }
  6539. /**
  6540. * htmlCtxtReadFile:
  6541. * @ctxt: an HTML parser context
  6542. * @filename: a file or URL
  6543. * @encoding: the document encoding, or NULL
  6544. * @options: a combination of htmlParserOption(s)
  6545. *
  6546. * parse an XML file from the filesystem or the network.
  6547. * This reuses the existing @ctxt parser context
  6548. *
  6549. * Returns the resulting document tree
  6550. */
  6551. htmlDocPtr
  6552. htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
  6553. const char *encoding, int options)
  6554. {
  6555. xmlParserInputPtr stream;
  6556. if (filename == NULL)
  6557. return (NULL);
  6558. if (ctxt == NULL)
  6559. return (NULL);
  6560. xmlInitParser();
  6561. htmlCtxtReset(ctxt);
  6562. stream = xmlLoadExternalEntity(filename, NULL, ctxt);
  6563. if (stream == NULL) {
  6564. return (NULL);
  6565. }
  6566. inputPush(ctxt, stream);
  6567. return (htmlDoRead(ctxt, NULL, encoding, options, 1));
  6568. }
  6569. /**
  6570. * htmlCtxtReadMemory:
  6571. * @ctxt: an HTML parser context
  6572. * @buffer: a pointer to a char array
  6573. * @size: the size of the array
  6574. * @URL: the base URL to use for the document
  6575. * @encoding: the document encoding, or NULL
  6576. * @options: a combination of htmlParserOption(s)
  6577. *
  6578. * parse an XML in-memory document and build a tree.
  6579. * This reuses the existing @ctxt parser context
  6580. *
  6581. * Returns the resulting document tree
  6582. */
  6583. htmlDocPtr
  6584. htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
  6585. const char *URL, const char *encoding, int options)
  6586. {
  6587. xmlParserInputBufferPtr input;
  6588. xmlParserInputPtr stream;
  6589. if (ctxt == NULL)
  6590. return (NULL);
  6591. if (buffer == NULL)
  6592. return (NULL);
  6593. xmlInitParser();
  6594. htmlCtxtReset(ctxt);
  6595. input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
  6596. if (input == NULL) {
  6597. return(NULL);
  6598. }
  6599. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6600. if (stream == NULL) {
  6601. xmlFreeParserInputBuffer(input);
  6602. return(NULL);
  6603. }
  6604. inputPush(ctxt, stream);
  6605. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  6606. }
  6607. /**
  6608. * htmlCtxtReadFd:
  6609. * @ctxt: an HTML parser context
  6610. * @fd: an open file descriptor
  6611. * @URL: the base URL to use for the document
  6612. * @encoding: the document encoding, or NULL
  6613. * @options: a combination of htmlParserOption(s)
  6614. *
  6615. * parse an XML from a file descriptor and build a tree.
  6616. * This reuses the existing @ctxt parser context
  6617. *
  6618. * Returns the resulting document tree
  6619. */
  6620. htmlDocPtr
  6621. htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
  6622. const char *URL, const char *encoding, int options)
  6623. {
  6624. xmlParserInputBufferPtr input;
  6625. xmlParserInputPtr stream;
  6626. if (fd < 0)
  6627. return (NULL);
  6628. if (ctxt == NULL)
  6629. return (NULL);
  6630. xmlInitParser();
  6631. htmlCtxtReset(ctxt);
  6632. input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
  6633. if (input == NULL)
  6634. return (NULL);
  6635. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6636. if (stream == NULL) {
  6637. xmlFreeParserInputBuffer(input);
  6638. return (NULL);
  6639. }
  6640. inputPush(ctxt, stream);
  6641. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  6642. }
  6643. /**
  6644. * htmlCtxtReadIO:
  6645. * @ctxt: an HTML parser context
  6646. * @ioread: an I/O read function
  6647. * @ioclose: an I/O close function
  6648. * @ioctx: an I/O handler
  6649. * @URL: the base URL to use for the document
  6650. * @encoding: the document encoding, or NULL
  6651. * @options: a combination of htmlParserOption(s)
  6652. *
  6653. * parse an HTML document from I/O functions and source and build a tree.
  6654. * This reuses the existing @ctxt parser context
  6655. *
  6656. * Returns the resulting document tree
  6657. */
  6658. htmlDocPtr
  6659. htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
  6660. xmlInputCloseCallback ioclose, void *ioctx,
  6661. const char *URL,
  6662. const char *encoding, int options)
  6663. {
  6664. xmlParserInputBufferPtr input;
  6665. xmlParserInputPtr stream;
  6666. if (ioread == NULL)
  6667. return (NULL);
  6668. if (ctxt == NULL)
  6669. return (NULL);
  6670. xmlInitParser();
  6671. htmlCtxtReset(ctxt);
  6672. input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
  6673. XML_CHAR_ENCODING_NONE);
  6674. if (input == NULL) {
  6675. if (ioclose != NULL)
  6676. ioclose(ioctx);
  6677. return (NULL);
  6678. }
  6679. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6680. if (stream == NULL) {
  6681. xmlFreeParserInputBuffer(input);
  6682. return (NULL);
  6683. }
  6684. inputPush(ctxt, stream);
  6685. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  6686. }
  6687. #define bottom_HTMLparser
  6688. #include "elfgcchack.h"
  6689. #endif /* LIBXML_HTML_ENABLED */