parserInternals.c 61 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164
  1. /*
  2. * parserInternals.c : Internal routines (and obsolete ones) needed for the
  3. * XML and HTML parsers.
  4. *
  5. * See Copyright for the status of this software.
  6. *
  7. * daniel@veillard.com
  8. */
  9. #define IN_LIBXML
  10. #include "libxml.h"
  11. #if defined(_WIN32) && !defined (__CYGWIN__)
  12. #define XML_DIR_SEP '\\'
  13. #else
  14. #define XML_DIR_SEP '/'
  15. #endif
  16. #include <string.h>
  17. #ifdef HAVE_CTYPE_H
  18. #include <ctype.h>
  19. #endif
  20. #ifdef HAVE_STDLIB_H
  21. #include <stdlib.h>
  22. #endif
  23. #ifdef HAVE_SYS_STAT_H
  24. #include <sys/stat.h>
  25. #endif
  26. #ifdef HAVE_FCNTL_H
  27. #include <fcntl.h>
  28. #endif
  29. #ifdef HAVE_UNISTD_H
  30. #include <unistd.h>
  31. #endif
  32. #ifdef LIBXML_ZLIB_ENABLED
  33. #include <zlib.h>
  34. #endif
  35. #include <libxml/xmlmemory.h>
  36. #include <libxml/tree.h>
  37. #include <libxml/parser.h>
  38. #include <libxml/parserInternals.h>
  39. #include <libxml/valid.h>
  40. #include <libxml/entities.h>
  41. #include <libxml/xmlerror.h>
  42. #include <libxml/encoding.h>
  43. #include <libxml/valid.h>
  44. #include <libxml/xmlIO.h>
  45. #include <libxml/uri.h>
  46. #include <libxml/dict.h>
  47. #include <libxml/SAX.h>
  48. #ifdef LIBXML_CATALOG_ENABLED
  49. #include <libxml/catalog.h>
  50. #endif
  51. #include <libxml/globals.h>
  52. #include <libxml/chvalid.h>
  53. #define CUR(ctxt) ctxt->input->cur
  54. #define END(ctxt) ctxt->input->end
  55. #define VALID_CTXT(ctxt) (CUR(ctxt) <= END(ctxt))
  56. #include "buf.h"
  57. #include "enc.h"
  58. /*
  59. * Various global defaults for parsing
  60. */
  61. /**
  62. * xmlCheckVersion:
  63. * @version: the include version number
  64. *
  65. * check the compiled lib version against the include one.
  66. * This can warn or immediately kill the application
  67. */
  68. void
  69. xmlCheckVersion(int version) {
  70. int myversion = (int) LIBXML_VERSION;
  71. xmlInitParser();
  72. if ((myversion / 10000) != (version / 10000)) {
  73. xmlGenericError(xmlGenericErrorContext,
  74. "Fatal: program compiled against libxml %d using libxml %d\n",
  75. (version / 10000), (myversion / 10000));
  76. fprintf(stderr,
  77. "Fatal: program compiled against libxml %d using libxml %d\n",
  78. (version / 10000), (myversion / 10000));
  79. }
  80. if ((myversion / 100) < (version / 100)) {
  81. xmlGenericError(xmlGenericErrorContext,
  82. "Warning: program compiled against libxml %d using older %d\n",
  83. (version / 100), (myversion / 100));
  84. }
  85. }
  86. /************************************************************************
  87. * *
  88. * Some factorized error routines *
  89. * *
  90. ************************************************************************/
  91. /**
  92. * xmlErrMemory:
  93. * @ctxt: an XML parser context
  94. * @extra: extra information
  95. *
  96. * Handle a redefinition of attribute error
  97. */
  98. void
  99. xmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  100. {
  101. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  102. (ctxt->instate == XML_PARSER_EOF))
  103. return;
  104. if (ctxt != NULL) {
  105. ctxt->errNo = XML_ERR_NO_MEMORY;
  106. ctxt->instate = XML_PARSER_EOF;
  107. ctxt->disableSAX = 1;
  108. }
  109. if (extra)
  110. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  111. XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  112. NULL, NULL, 0, 0,
  113. "Memory allocation failed : %s\n", extra);
  114. else
  115. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  116. XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  117. NULL, NULL, 0, 0, "Memory allocation failed\n");
  118. }
  119. /**
  120. * __xmlErrEncoding:
  121. * @ctxt: an XML parser context
  122. * @xmlerr: the error number
  123. * @msg: the error message
  124. * @str1: an string info
  125. * @str2: an string info
  126. *
  127. * Handle an encoding error
  128. */
  129. void
  130. __xmlErrEncoding(xmlParserCtxtPtr ctxt, xmlParserErrors xmlerr,
  131. const char *msg, const xmlChar * str1, const xmlChar * str2)
  132. {
  133. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  134. (ctxt->instate == XML_PARSER_EOF))
  135. return;
  136. if (ctxt != NULL)
  137. ctxt->errNo = xmlerr;
  138. __xmlRaiseError(NULL, NULL, NULL,
  139. ctxt, NULL, XML_FROM_PARSER, xmlerr, XML_ERR_FATAL,
  140. NULL, 0, (const char *) str1, (const char *) str2,
  141. NULL, 0, 0, msg, str1, str2);
  142. if (ctxt != NULL) {
  143. ctxt->wellFormed = 0;
  144. if (ctxt->recovery == 0)
  145. ctxt->disableSAX = 1;
  146. }
  147. }
  148. /**
  149. * xmlErrInternal:
  150. * @ctxt: an XML parser context
  151. * @msg: the error message
  152. * @str: error information
  153. *
  154. * Handle an internal error
  155. */
  156. static void LIBXML_ATTR_FORMAT(2,0)
  157. xmlErrInternal(xmlParserCtxtPtr ctxt, const char *msg, const xmlChar * str)
  158. {
  159. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  160. (ctxt->instate == XML_PARSER_EOF))
  161. return;
  162. if (ctxt != NULL)
  163. ctxt->errNo = XML_ERR_INTERNAL_ERROR;
  164. __xmlRaiseError(NULL, NULL, NULL,
  165. ctxt, NULL, XML_FROM_PARSER, XML_ERR_INTERNAL_ERROR,
  166. XML_ERR_FATAL, NULL, 0, (const char *) str, NULL, NULL,
  167. 0, 0, msg, str);
  168. if (ctxt != NULL) {
  169. ctxt->wellFormed = 0;
  170. if (ctxt->recovery == 0)
  171. ctxt->disableSAX = 1;
  172. }
  173. }
  174. /**
  175. * xmlErrEncodingInt:
  176. * @ctxt: an XML parser context
  177. * @error: the error number
  178. * @msg: the error message
  179. * @val: an integer value
  180. *
  181. * n encoding error
  182. */
  183. static void LIBXML_ATTR_FORMAT(3,0)
  184. xmlErrEncodingInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
  185. const char *msg, int val)
  186. {
  187. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  188. (ctxt->instate == XML_PARSER_EOF))
  189. return;
  190. if (ctxt != NULL)
  191. ctxt->errNo = error;
  192. __xmlRaiseError(NULL, NULL, NULL,
  193. ctxt, NULL, XML_FROM_PARSER, error, XML_ERR_FATAL,
  194. NULL, 0, NULL, NULL, NULL, val, 0, msg, val);
  195. if (ctxt != NULL) {
  196. ctxt->wellFormed = 0;
  197. if (ctxt->recovery == 0)
  198. ctxt->disableSAX = 1;
  199. }
  200. }
  201. /**
  202. * xmlIsLetter:
  203. * @c: an unicode character (int)
  204. *
  205. * Check whether the character is allowed by the production
  206. * [84] Letter ::= BaseChar | Ideographic
  207. *
  208. * Returns 0 if not, non-zero otherwise
  209. */
  210. int
  211. xmlIsLetter(int c) {
  212. return(IS_BASECHAR(c) || IS_IDEOGRAPHIC(c));
  213. }
  214. /************************************************************************
  215. * *
  216. * Input handling functions for progressive parsing *
  217. * *
  218. ************************************************************************/
  219. /* #define DEBUG_INPUT */
  220. /* #define DEBUG_STACK */
  221. /* #define DEBUG_PUSH */
  222. /* we need to keep enough input to show errors in context */
  223. #define LINE_LEN 80
  224. #ifdef DEBUG_INPUT
  225. #define CHECK_BUFFER(in) check_buffer(in)
  226. static
  227. void check_buffer(xmlParserInputPtr in) {
  228. if (in->base != xmlBufContent(in->buf->buffer)) {
  229. xmlGenericError(xmlGenericErrorContext,
  230. "xmlParserInput: base mismatch problem\n");
  231. }
  232. if (in->cur < in->base) {
  233. xmlGenericError(xmlGenericErrorContext,
  234. "xmlParserInput: cur < base problem\n");
  235. }
  236. if (in->cur > in->base + xmlBufUse(in->buf->buffer)) {
  237. xmlGenericError(xmlGenericErrorContext,
  238. "xmlParserInput: cur > base + use problem\n");
  239. }
  240. xmlGenericError(xmlGenericErrorContext,"buffer %x : content %x, cur %d, use %d\n",
  241. (int) in, (int) xmlBufContent(in->buf->buffer), in->cur - in->base,
  242. xmlBufUse(in->buf->buffer));
  243. }
  244. #else
  245. #define CHECK_BUFFER(in)
  246. #endif
  247. /**
  248. * xmlParserInputRead:
  249. * @in: an XML parser input
  250. * @len: an indicative size for the lookahead
  251. *
  252. * This function was internal and is deprecated.
  253. *
  254. * Returns -1 as this is an error to use it.
  255. */
  256. int
  257. xmlParserInputRead(xmlParserInputPtr in ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED) {
  258. return(-1);
  259. }
  260. /**
  261. * xmlParserInputGrow:
  262. * @in: an XML parser input
  263. * @len: an indicative size for the lookahead
  264. *
  265. * This function increase the input for the parser. It tries to
  266. * preserve pointers to the input buffer, and keep already read data
  267. *
  268. * Returns the amount of char read, or -1 in case of error, 0 indicate the
  269. * end of this entity
  270. */
  271. int
  272. xmlParserInputGrow(xmlParserInputPtr in, int len) {
  273. int ret;
  274. size_t indx;
  275. const xmlChar *content;
  276. if ((in == NULL) || (len < 0)) return(-1);
  277. #ifdef DEBUG_INPUT
  278. xmlGenericError(xmlGenericErrorContext, "Grow\n");
  279. #endif
  280. if (in->buf == NULL) return(-1);
  281. if (in->base == NULL) return(-1);
  282. if (in->cur == NULL) return(-1);
  283. if (in->buf->buffer == NULL) return(-1);
  284. CHECK_BUFFER(in);
  285. indx = in->cur - in->base;
  286. if (xmlBufUse(in->buf->buffer) > (unsigned int) indx + INPUT_CHUNK) {
  287. CHECK_BUFFER(in);
  288. return(0);
  289. }
  290. if (in->buf->readcallback != NULL) {
  291. ret = xmlParserInputBufferGrow(in->buf, len);
  292. } else
  293. return(0);
  294. /*
  295. * NOTE : in->base may be a "dangling" i.e. freed pointer in this
  296. * block, but we use it really as an integer to do some
  297. * pointer arithmetic. Insure will raise it as a bug but in
  298. * that specific case, that's not !
  299. */
  300. content = xmlBufContent(in->buf->buffer);
  301. if (in->base != content) {
  302. /*
  303. * the buffer has been reallocated
  304. */
  305. indx = in->cur - in->base;
  306. in->base = content;
  307. in->cur = &content[indx];
  308. }
  309. in->end = xmlBufEnd(in->buf->buffer);
  310. CHECK_BUFFER(in);
  311. return(ret);
  312. }
  313. /**
  314. * xmlParserInputShrink:
  315. * @in: an XML parser input
  316. *
  317. * This function removes used input for the parser.
  318. */
  319. void
  320. xmlParserInputShrink(xmlParserInputPtr in) {
  321. size_t used;
  322. size_t ret;
  323. size_t indx;
  324. const xmlChar *content;
  325. #ifdef DEBUG_INPUT
  326. xmlGenericError(xmlGenericErrorContext, "Shrink\n");
  327. #endif
  328. if (in == NULL) return;
  329. if (in->buf == NULL) return;
  330. if (in->base == NULL) return;
  331. if (in->cur == NULL) return;
  332. if (in->buf->buffer == NULL) return;
  333. CHECK_BUFFER(in);
  334. used = in->cur - xmlBufContent(in->buf->buffer);
  335. /*
  336. * Do not shrink on large buffers whose only a tiny fraction
  337. * was consumed
  338. */
  339. if (used > INPUT_CHUNK) {
  340. ret = xmlBufShrink(in->buf->buffer, used - LINE_LEN);
  341. if (ret > 0) {
  342. in->cur -= ret;
  343. in->consumed += ret;
  344. }
  345. in->end = xmlBufEnd(in->buf->buffer);
  346. }
  347. CHECK_BUFFER(in);
  348. if (xmlBufUse(in->buf->buffer) > INPUT_CHUNK) {
  349. return;
  350. }
  351. xmlParserInputBufferRead(in->buf, 2 * INPUT_CHUNK);
  352. content = xmlBufContent(in->buf->buffer);
  353. if (in->base != content) {
  354. /*
  355. * the buffer has been reallocated
  356. */
  357. indx = in->cur - in->base;
  358. in->base = content;
  359. in->cur = &content[indx];
  360. }
  361. in->end = xmlBufEnd(in->buf->buffer);
  362. CHECK_BUFFER(in);
  363. }
  364. /************************************************************************
  365. * *
  366. * UTF8 character input and related functions *
  367. * *
  368. ************************************************************************/
  369. /**
  370. * xmlNextChar:
  371. * @ctxt: the XML parser context
  372. *
  373. * Skip to the next char input char.
  374. */
  375. void
  376. xmlNextChar(xmlParserCtxtPtr ctxt)
  377. {
  378. if ((ctxt == NULL) || (ctxt->instate == XML_PARSER_EOF) ||
  379. (ctxt->input == NULL))
  380. return;
  381. if (!(VALID_CTXT(ctxt))) {
  382. xmlErrInternal(ctxt, "Parser input data memory error\n", NULL);
  383. ctxt->errNo = XML_ERR_INTERNAL_ERROR;
  384. xmlStopParser(ctxt);
  385. return;
  386. }
  387. if ((*ctxt->input->cur == 0) &&
  388. (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
  389. return;
  390. }
  391. if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
  392. const unsigned char *cur;
  393. unsigned char c;
  394. /*
  395. * 2.11 End-of-Line Handling
  396. * the literal two-character sequence "#xD#xA" or a standalone
  397. * literal #xD, an XML processor must pass to the application
  398. * the single character #xA.
  399. */
  400. if (*(ctxt->input->cur) == '\n') {
  401. ctxt->input->line++; ctxt->input->col = 1;
  402. } else
  403. ctxt->input->col++;
  404. /*
  405. * We are supposed to handle UTF8, check it's valid
  406. * From rfc2044: encoding of the Unicode values on UTF-8:
  407. *
  408. * UCS-4 range (hex.) UTF-8 octet sequence (binary)
  409. * 0000 0000-0000 007F 0xxxxxxx
  410. * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
  411. * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
  412. *
  413. * Check for the 0x110000 limit too
  414. */
  415. cur = ctxt->input->cur;
  416. c = *cur;
  417. if (c & 0x80) {
  418. if (c == 0xC0)
  419. goto encoding_error;
  420. if (cur[1] == 0) {
  421. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  422. cur = ctxt->input->cur;
  423. }
  424. if ((cur[1] & 0xc0) != 0x80)
  425. goto encoding_error;
  426. if ((c & 0xe0) == 0xe0) {
  427. unsigned int val;
  428. if (cur[2] == 0) {
  429. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  430. cur = ctxt->input->cur;
  431. }
  432. if ((cur[2] & 0xc0) != 0x80)
  433. goto encoding_error;
  434. if ((c & 0xf0) == 0xf0) {
  435. if (cur[3] == 0) {
  436. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  437. cur = ctxt->input->cur;
  438. }
  439. if (((c & 0xf8) != 0xf0) ||
  440. ((cur[3] & 0xc0) != 0x80))
  441. goto encoding_error;
  442. /* 4-byte code */
  443. ctxt->input->cur += 4;
  444. val = (cur[0] & 0x7) << 18;
  445. val |= (cur[1] & 0x3f) << 12;
  446. val |= (cur[2] & 0x3f) << 6;
  447. val |= cur[3] & 0x3f;
  448. } else {
  449. /* 3-byte code */
  450. ctxt->input->cur += 3;
  451. val = (cur[0] & 0xf) << 12;
  452. val |= (cur[1] & 0x3f) << 6;
  453. val |= cur[2] & 0x3f;
  454. }
  455. if (((val > 0xd7ff) && (val < 0xe000)) ||
  456. ((val > 0xfffd) && (val < 0x10000)) ||
  457. (val >= 0x110000)) {
  458. xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
  459. "Char 0x%X out of allowed range\n",
  460. val);
  461. }
  462. } else
  463. /* 2-byte code */
  464. ctxt->input->cur += 2;
  465. } else
  466. /* 1-byte code */
  467. ctxt->input->cur++;
  468. } else {
  469. /*
  470. * Assume it's a fixed length encoding (1) with
  471. * a compatible encoding for the ASCII set, since
  472. * XML constructs only use < 128 chars
  473. */
  474. if (*(ctxt->input->cur) == '\n') {
  475. ctxt->input->line++; ctxt->input->col = 1;
  476. } else
  477. ctxt->input->col++;
  478. ctxt->input->cur++;
  479. }
  480. if (*ctxt->input->cur == 0)
  481. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  482. return;
  483. encoding_error:
  484. /*
  485. * If we detect an UTF8 error that probably mean that the
  486. * input encoding didn't get properly advertised in the
  487. * declaration header. Report the error and switch the encoding
  488. * to ISO-Latin-1 (if you don't like this policy, just declare the
  489. * encoding !)
  490. */
  491. if ((ctxt == NULL) || (ctxt->input == NULL) ||
  492. (ctxt->input->end - ctxt->input->cur < 4)) {
  493. __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
  494. "Input is not proper UTF-8, indicate encoding !\n",
  495. NULL, NULL);
  496. } else {
  497. char buffer[150];
  498. snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
  499. ctxt->input->cur[0], ctxt->input->cur[1],
  500. ctxt->input->cur[2], ctxt->input->cur[3]);
  501. __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
  502. "Input is not proper UTF-8, indicate encoding !\n%s",
  503. BAD_CAST buffer, NULL);
  504. }
  505. ctxt->charset = XML_CHAR_ENCODING_8859_1;
  506. ctxt->input->cur++;
  507. return;
  508. }
  509. /**
  510. * xmlCurrentChar:
  511. * @ctxt: the XML parser context
  512. * @len: pointer to the length of the char read
  513. *
  514. * The current char value, if using UTF-8 this may actually span multiple
  515. * bytes in the input buffer. Implement the end of line normalization:
  516. * 2.11 End-of-Line Handling
  517. * Wherever an external parsed entity or the literal entity value
  518. * of an internal parsed entity contains either the literal two-character
  519. * sequence "#xD#xA" or a standalone literal #xD, an XML processor
  520. * must pass to the application the single character #xA.
  521. * This behavior can conveniently be produced by normalizing all
  522. * line breaks to #xA on input, before parsing.)
  523. *
  524. * Returns the current char value and its length
  525. */
  526. int
  527. xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
  528. if ((ctxt == NULL) || (len == NULL) || (ctxt->input == NULL)) return(0);
  529. if (ctxt->instate == XML_PARSER_EOF)
  530. return(0);
  531. if ((*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) {
  532. *len = 1;
  533. return((int) *ctxt->input->cur);
  534. }
  535. if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
  536. /*
  537. * We are supposed to handle UTF8, check it's valid
  538. * From rfc2044: encoding of the Unicode values on UTF-8:
  539. *
  540. * UCS-4 range (hex.) UTF-8 octet sequence (binary)
  541. * 0000 0000-0000 007F 0xxxxxxx
  542. * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
  543. * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
  544. *
  545. * Check for the 0x110000 limit too
  546. */
  547. const unsigned char *cur = ctxt->input->cur;
  548. unsigned char c;
  549. unsigned int val;
  550. c = *cur;
  551. if (c & 0x80) {
  552. if (((c & 0x40) == 0) || (c == 0xC0))
  553. goto encoding_error;
  554. if (cur[1] == 0) {
  555. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  556. cur = ctxt->input->cur;
  557. }
  558. if ((cur[1] & 0xc0) != 0x80)
  559. goto encoding_error;
  560. if ((c & 0xe0) == 0xe0) {
  561. if (cur[2] == 0) {
  562. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  563. cur = ctxt->input->cur;
  564. }
  565. if ((cur[2] & 0xc0) != 0x80)
  566. goto encoding_error;
  567. if ((c & 0xf0) == 0xf0) {
  568. if (cur[3] == 0) {
  569. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  570. cur = ctxt->input->cur;
  571. }
  572. if (((c & 0xf8) != 0xf0) ||
  573. ((cur[3] & 0xc0) != 0x80))
  574. goto encoding_error;
  575. /* 4-byte code */
  576. *len = 4;
  577. val = (cur[0] & 0x7) << 18;
  578. val |= (cur[1] & 0x3f) << 12;
  579. val |= (cur[2] & 0x3f) << 6;
  580. val |= cur[3] & 0x3f;
  581. if (val < 0x10000)
  582. goto encoding_error;
  583. } else {
  584. /* 3-byte code */
  585. *len = 3;
  586. val = (cur[0] & 0xf) << 12;
  587. val |= (cur[1] & 0x3f) << 6;
  588. val |= cur[2] & 0x3f;
  589. if (val < 0x800)
  590. goto encoding_error;
  591. }
  592. } else {
  593. /* 2-byte code */
  594. *len = 2;
  595. val = (cur[0] & 0x1f) << 6;
  596. val |= cur[1] & 0x3f;
  597. if (val < 0x80)
  598. goto encoding_error;
  599. }
  600. if (!IS_CHAR(val)) {
  601. xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
  602. "Char 0x%X out of allowed range\n", val);
  603. }
  604. return(val);
  605. } else {
  606. /* 1-byte code */
  607. *len = 1;
  608. if (*ctxt->input->cur == 0)
  609. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  610. if ((*ctxt->input->cur == 0) &&
  611. (ctxt->input->end > ctxt->input->cur)) {
  612. xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
  613. "Char 0x0 out of allowed range\n", 0);
  614. }
  615. if (*ctxt->input->cur == 0xD) {
  616. if (ctxt->input->cur[1] == 0xA) {
  617. ctxt->input->cur++;
  618. }
  619. return(0xA);
  620. }
  621. return((int) *ctxt->input->cur);
  622. }
  623. }
  624. /*
  625. * Assume it's a fixed length encoding (1) with
  626. * a compatible encoding for the ASCII set, since
  627. * XML constructs only use < 128 chars
  628. */
  629. *len = 1;
  630. if (*ctxt->input->cur == 0xD) {
  631. if (ctxt->input->cur[1] == 0xA) {
  632. ctxt->input->cur++;
  633. }
  634. return(0xA);
  635. }
  636. return((int) *ctxt->input->cur);
  637. encoding_error:
  638. /*
  639. * An encoding problem may arise from a truncated input buffer
  640. * splitting a character in the middle. In that case do not raise
  641. * an error but return 0 to indicate an end of stream problem
  642. */
  643. if (ctxt->input->end - ctxt->input->cur < 4) {
  644. *len = 0;
  645. return(0);
  646. }
  647. /*
  648. * If we detect an UTF8 error that probably mean that the
  649. * input encoding didn't get properly advertised in the
  650. * declaration header. Report the error and switch the encoding
  651. * to ISO-Latin-1 (if you don't like this policy, just declare the
  652. * encoding !)
  653. */
  654. {
  655. char buffer[150];
  656. snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
  657. ctxt->input->cur[0], ctxt->input->cur[1],
  658. ctxt->input->cur[2], ctxt->input->cur[3]);
  659. __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
  660. "Input is not proper UTF-8, indicate encoding !\n%s",
  661. BAD_CAST buffer, NULL);
  662. }
  663. ctxt->charset = XML_CHAR_ENCODING_8859_1;
  664. *len = 1;
  665. return((int) *ctxt->input->cur);
  666. }
  667. /**
  668. * xmlStringCurrentChar:
  669. * @ctxt: the XML parser context
  670. * @cur: pointer to the beginning of the char
  671. * @len: pointer to the length of the char read
  672. *
  673. * The current char value, if using UTF-8 this may actually span multiple
  674. * bytes in the input buffer.
  675. *
  676. * Returns the current char value and its length
  677. */
  678. int
  679. xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len)
  680. {
  681. if ((len == NULL) || (cur == NULL)) return(0);
  682. if ((ctxt == NULL) || (ctxt->charset == XML_CHAR_ENCODING_UTF8)) {
  683. /*
  684. * We are supposed to handle UTF8, check it's valid
  685. * From rfc2044: encoding of the Unicode values on UTF-8:
  686. *
  687. * UCS-4 range (hex.) UTF-8 octet sequence (binary)
  688. * 0000 0000-0000 007F 0xxxxxxx
  689. * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
  690. * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
  691. *
  692. * Check for the 0x110000 limit too
  693. */
  694. unsigned char c;
  695. unsigned int val;
  696. c = *cur;
  697. if (c & 0x80) {
  698. if ((cur[1] & 0xc0) != 0x80)
  699. goto encoding_error;
  700. if ((c & 0xe0) == 0xe0) {
  701. if ((cur[2] & 0xc0) != 0x80)
  702. goto encoding_error;
  703. if ((c & 0xf0) == 0xf0) {
  704. if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80))
  705. goto encoding_error;
  706. /* 4-byte code */
  707. *len = 4;
  708. val = (cur[0] & 0x7) << 18;
  709. val |= (cur[1] & 0x3f) << 12;
  710. val |= (cur[2] & 0x3f) << 6;
  711. val |= cur[3] & 0x3f;
  712. } else {
  713. /* 3-byte code */
  714. *len = 3;
  715. val = (cur[0] & 0xf) << 12;
  716. val |= (cur[1] & 0x3f) << 6;
  717. val |= cur[2] & 0x3f;
  718. }
  719. } else {
  720. /* 2-byte code */
  721. *len = 2;
  722. val = (cur[0] & 0x1f) << 6;
  723. val |= cur[1] & 0x3f;
  724. }
  725. if (!IS_CHAR(val)) {
  726. xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
  727. "Char 0x%X out of allowed range\n", val);
  728. }
  729. return (val);
  730. } else {
  731. /* 1-byte code */
  732. *len = 1;
  733. return ((int) *cur);
  734. }
  735. }
  736. /*
  737. * Assume it's a fixed length encoding (1) with
  738. * a compatible encoding for the ASCII set, since
  739. * XML constructs only use < 128 chars
  740. */
  741. *len = 1;
  742. return ((int) *cur);
  743. encoding_error:
  744. /*
  745. * An encoding problem may arise from a truncated input buffer
  746. * splitting a character in the middle. In that case do not raise
  747. * an error but return 0 to indicate an end of stream problem
  748. */
  749. if ((ctxt == NULL) || (ctxt->input == NULL) ||
  750. (ctxt->input->end - ctxt->input->cur < 4)) {
  751. *len = 0;
  752. return(0);
  753. }
  754. /*
  755. * If we detect an UTF8 error that probably mean that the
  756. * input encoding didn't get properly advertised in the
  757. * declaration header. Report the error and switch the encoding
  758. * to ISO-Latin-1 (if you don't like this policy, just declare the
  759. * encoding !)
  760. */
  761. {
  762. char buffer[150];
  763. snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
  764. ctxt->input->cur[0], ctxt->input->cur[1],
  765. ctxt->input->cur[2], ctxt->input->cur[3]);
  766. __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
  767. "Input is not proper UTF-8, indicate encoding !\n%s",
  768. BAD_CAST buffer, NULL);
  769. }
  770. *len = 1;
  771. return ((int) *cur);
  772. }
  773. /**
  774. * xmlCopyCharMultiByte:
  775. * @out: pointer to an array of xmlChar
  776. * @val: the char value
  777. *
  778. * append the char value in the array
  779. *
  780. * Returns the number of xmlChar written
  781. */
  782. int
  783. xmlCopyCharMultiByte(xmlChar *out, int val) {
  784. if (out == NULL) return(0);
  785. /*
  786. * We are supposed to handle UTF8, check it's valid
  787. * From rfc2044: encoding of the Unicode values on UTF-8:
  788. *
  789. * UCS-4 range (hex.) UTF-8 octet sequence (binary)
  790. * 0000 0000-0000 007F 0xxxxxxx
  791. * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
  792. * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
  793. */
  794. if (val >= 0x80) {
  795. xmlChar *savedout = out;
  796. int bits;
  797. if (val < 0x800) { *out++= (val >> 6) | 0xC0; bits= 0; }
  798. else if (val < 0x10000) { *out++= (val >> 12) | 0xE0; bits= 6;}
  799. else if (val < 0x110000) { *out++= (val >> 18) | 0xF0; bits= 12; }
  800. else {
  801. xmlErrEncodingInt(NULL, XML_ERR_INVALID_CHAR,
  802. "Internal error, xmlCopyCharMultiByte 0x%X out of bound\n",
  803. val);
  804. return(0);
  805. }
  806. for ( ; bits >= 0; bits-= 6)
  807. *out++= ((val >> bits) & 0x3F) | 0x80 ;
  808. return (out - savedout);
  809. }
  810. *out = (xmlChar) val;
  811. return 1;
  812. }
  813. /**
  814. * xmlCopyChar:
  815. * @len: Ignored, compatibility
  816. * @out: pointer to an array of xmlChar
  817. * @val: the char value
  818. *
  819. * append the char value in the array
  820. *
  821. * Returns the number of xmlChar written
  822. */
  823. int
  824. xmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) {
  825. if (out == NULL) return(0);
  826. /* the len parameter is ignored */
  827. if (val >= 0x80) {
  828. return(xmlCopyCharMultiByte (out, val));
  829. }
  830. *out = (xmlChar) val;
  831. return 1;
  832. }
  833. /************************************************************************
  834. * *
  835. * Commodity functions to switch encodings *
  836. * *
  837. ************************************************************************/
  838. static int
  839. xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
  840. xmlCharEncodingHandlerPtr handler, int len);
  841. static int
  842. xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
  843. xmlCharEncodingHandlerPtr handler, int len);
  844. /**
  845. * xmlSwitchEncoding:
  846. * @ctxt: the parser context
  847. * @enc: the encoding value (number)
  848. *
  849. * change the input functions when discovering the character encoding
  850. * of a given entity.
  851. *
  852. * Returns 0 in case of success, -1 otherwise
  853. */
  854. int
  855. xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
  856. {
  857. xmlCharEncodingHandlerPtr handler;
  858. int len = -1;
  859. int ret;
  860. if (ctxt == NULL) return(-1);
  861. switch (enc) {
  862. case XML_CHAR_ENCODING_ERROR:
  863. __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING,
  864. "encoding unknown\n", NULL, NULL);
  865. return(-1);
  866. case XML_CHAR_ENCODING_NONE:
  867. /* let's assume it's UTF-8 without the XML decl */
  868. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  869. return(0);
  870. case XML_CHAR_ENCODING_UTF8:
  871. /* default encoding, no conversion should be needed */
  872. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  873. /*
  874. * Errata on XML-1.0 June 20 2001
  875. * Specific handling of the Byte Order Mark for
  876. * UTF-8
  877. */
  878. if ((ctxt->input != NULL) &&
  879. (ctxt->input->cur[0] == 0xEF) &&
  880. (ctxt->input->cur[1] == 0xBB) &&
  881. (ctxt->input->cur[2] == 0xBF)) {
  882. ctxt->input->cur += 3;
  883. }
  884. return(0);
  885. case XML_CHAR_ENCODING_UTF16LE:
  886. case XML_CHAR_ENCODING_UTF16BE:
  887. /*The raw input characters are encoded
  888. *in UTF-16. As we expect this function
  889. *to be called after xmlCharEncInFunc, we expect
  890. *ctxt->input->cur to contain UTF-8 encoded characters.
  891. *So the raw UTF16 Byte Order Mark
  892. *has also been converted into
  893. *an UTF-8 BOM. Let's skip that BOM.
  894. */
  895. if ((ctxt->input != NULL) && (ctxt->input->cur != NULL) &&
  896. (ctxt->input->cur[0] == 0xEF) &&
  897. (ctxt->input->cur[1] == 0xBB) &&
  898. (ctxt->input->cur[2] == 0xBF)) {
  899. ctxt->input->cur += 3;
  900. }
  901. len = 90;
  902. break;
  903. case XML_CHAR_ENCODING_UCS2:
  904. len = 90;
  905. break;
  906. case XML_CHAR_ENCODING_UCS4BE:
  907. case XML_CHAR_ENCODING_UCS4LE:
  908. case XML_CHAR_ENCODING_UCS4_2143:
  909. case XML_CHAR_ENCODING_UCS4_3412:
  910. len = 180;
  911. break;
  912. case XML_CHAR_ENCODING_EBCDIC:
  913. case XML_CHAR_ENCODING_8859_1:
  914. case XML_CHAR_ENCODING_8859_2:
  915. case XML_CHAR_ENCODING_8859_3:
  916. case XML_CHAR_ENCODING_8859_4:
  917. case XML_CHAR_ENCODING_8859_5:
  918. case XML_CHAR_ENCODING_8859_6:
  919. case XML_CHAR_ENCODING_8859_7:
  920. case XML_CHAR_ENCODING_8859_8:
  921. case XML_CHAR_ENCODING_8859_9:
  922. case XML_CHAR_ENCODING_ASCII:
  923. case XML_CHAR_ENCODING_2022_JP:
  924. case XML_CHAR_ENCODING_SHIFT_JIS:
  925. case XML_CHAR_ENCODING_EUC_JP:
  926. len = 45;
  927. break;
  928. }
  929. handler = xmlGetCharEncodingHandler(enc);
  930. if (handler == NULL) {
  931. /*
  932. * Default handlers.
  933. */
  934. switch (enc) {
  935. case XML_CHAR_ENCODING_ASCII:
  936. /* default encoding, no conversion should be needed */
  937. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  938. return(0);
  939. case XML_CHAR_ENCODING_UTF16LE:
  940. break;
  941. case XML_CHAR_ENCODING_UTF16BE:
  942. break;
  943. case XML_CHAR_ENCODING_UCS4LE:
  944. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  945. "encoding not supported %s\n",
  946. BAD_CAST "USC4 little endian", NULL);
  947. break;
  948. case XML_CHAR_ENCODING_UCS4BE:
  949. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  950. "encoding not supported %s\n",
  951. BAD_CAST "USC4 big endian", NULL);
  952. break;
  953. case XML_CHAR_ENCODING_EBCDIC:
  954. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  955. "encoding not supported %s\n",
  956. BAD_CAST "EBCDIC", NULL);
  957. break;
  958. case XML_CHAR_ENCODING_UCS4_2143:
  959. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  960. "encoding not supported %s\n",
  961. BAD_CAST "UCS4 2143", NULL);
  962. break;
  963. case XML_CHAR_ENCODING_UCS4_3412:
  964. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  965. "encoding not supported %s\n",
  966. BAD_CAST "UCS4 3412", NULL);
  967. break;
  968. case XML_CHAR_ENCODING_UCS2:
  969. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  970. "encoding not supported %s\n",
  971. BAD_CAST "UCS2", NULL);
  972. break;
  973. case XML_CHAR_ENCODING_8859_1:
  974. case XML_CHAR_ENCODING_8859_2:
  975. case XML_CHAR_ENCODING_8859_3:
  976. case XML_CHAR_ENCODING_8859_4:
  977. case XML_CHAR_ENCODING_8859_5:
  978. case XML_CHAR_ENCODING_8859_6:
  979. case XML_CHAR_ENCODING_8859_7:
  980. case XML_CHAR_ENCODING_8859_8:
  981. case XML_CHAR_ENCODING_8859_9:
  982. /*
  983. * We used to keep the internal content in the
  984. * document encoding however this turns being unmaintainable
  985. * So xmlGetCharEncodingHandler() will return non-null
  986. * values for this now.
  987. */
  988. if ((ctxt->inputNr == 1) &&
  989. (ctxt->encoding == NULL) &&
  990. (ctxt->input != NULL) &&
  991. (ctxt->input->encoding != NULL)) {
  992. ctxt->encoding = xmlStrdup(ctxt->input->encoding);
  993. }
  994. ctxt->charset = enc;
  995. return(0);
  996. case XML_CHAR_ENCODING_2022_JP:
  997. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  998. "encoding not supported %s\n",
  999. BAD_CAST "ISO-2022-JP", NULL);
  1000. break;
  1001. case XML_CHAR_ENCODING_SHIFT_JIS:
  1002. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  1003. "encoding not supported %s\n",
  1004. BAD_CAST "Shift_JIS", NULL);
  1005. break;
  1006. case XML_CHAR_ENCODING_EUC_JP:
  1007. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  1008. "encoding not supported %s\n",
  1009. BAD_CAST "EUC-JP", NULL);
  1010. break;
  1011. default:
  1012. break;
  1013. }
  1014. }
  1015. /*
  1016. * TODO: We could recover from errors in external entities if we
  1017. * didn't stop the parser. But most callers of this function don't
  1018. * check the return value.
  1019. */
  1020. if (handler == NULL) {
  1021. xmlStopParser(ctxt);
  1022. return(-1);
  1023. }
  1024. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  1025. ret = xmlSwitchToEncodingInt(ctxt, handler, len);
  1026. if ((ret < 0) || (ctxt->errNo == XML_I18N_CONV_FAILED)) {
  1027. /*
  1028. * on encoding conversion errors, stop the parser
  1029. */
  1030. xmlStopParser(ctxt);
  1031. ctxt->errNo = XML_I18N_CONV_FAILED;
  1032. }
  1033. return(ret);
  1034. }
  1035. /**
  1036. * xmlSwitchInputEncoding:
  1037. * @ctxt: the parser context
  1038. * @input: the input stream
  1039. * @handler: the encoding handler
  1040. * @len: the number of bytes to convert for the first line or -1
  1041. *
  1042. * change the input functions when discovering the character encoding
  1043. * of a given entity.
  1044. *
  1045. * Returns 0 in case of success, -1 otherwise
  1046. */
  1047. static int
  1048. xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
  1049. xmlCharEncodingHandlerPtr handler, int len)
  1050. {
  1051. int nbchars;
  1052. if (handler == NULL)
  1053. return (-1);
  1054. if (input == NULL)
  1055. return (-1);
  1056. if (input->buf != NULL) {
  1057. if (input->buf->encoder != NULL) {
  1058. /*
  1059. * Check in case the auto encoding detection triggered
  1060. * in already.
  1061. */
  1062. if (input->buf->encoder == handler)
  1063. return (0);
  1064. /*
  1065. * "UTF-16" can be used for both LE and BE
  1066. if ((!xmlStrncmp(BAD_CAST input->buf->encoder->name,
  1067. BAD_CAST "UTF-16", 6)) &&
  1068. (!xmlStrncmp(BAD_CAST handler->name,
  1069. BAD_CAST "UTF-16", 6))) {
  1070. return(0);
  1071. }
  1072. */
  1073. /*
  1074. * Note: this is a bit dangerous, but that's what it
  1075. * takes to use nearly compatible signature for different
  1076. * encodings.
  1077. *
  1078. * FIXME: Encoders might buffer partial byte sequences, so
  1079. * this probably can't work. We should return an error and
  1080. * make sure that callers never try to switch the encoding
  1081. * twice.
  1082. */
  1083. xmlCharEncCloseFunc(input->buf->encoder);
  1084. input->buf->encoder = handler;
  1085. return (0);
  1086. }
  1087. input->buf->encoder = handler;
  1088. /*
  1089. * Is there already some content down the pipe to convert ?
  1090. */
  1091. if (xmlBufIsEmpty(input->buf->buffer) == 0) {
  1092. int processed;
  1093. unsigned int use;
  1094. /*
  1095. * Specific handling of the Byte Order Mark for
  1096. * UTF-16
  1097. */
  1098. if ((handler->name != NULL) &&
  1099. (!strcmp(handler->name, "UTF-16LE") ||
  1100. !strcmp(handler->name, "UTF-16")) &&
  1101. (input->cur[0] == 0xFF) && (input->cur[1] == 0xFE)) {
  1102. input->cur += 2;
  1103. }
  1104. if ((handler->name != NULL) &&
  1105. (!strcmp(handler->name, "UTF-16BE")) &&
  1106. (input->cur[0] == 0xFE) && (input->cur[1] == 0xFF)) {
  1107. input->cur += 2;
  1108. }
  1109. /*
  1110. * Errata on XML-1.0 June 20 2001
  1111. * Specific handling of the Byte Order Mark for
  1112. * UTF-8
  1113. */
  1114. if ((handler->name != NULL) &&
  1115. (!strcmp(handler->name, "UTF-8")) &&
  1116. (input->cur[0] == 0xEF) &&
  1117. (input->cur[1] == 0xBB) && (input->cur[2] == 0xBF)) {
  1118. input->cur += 3;
  1119. }
  1120. /*
  1121. * Shrink the current input buffer.
  1122. * Move it as the raw buffer and create a new input buffer
  1123. */
  1124. processed = input->cur - input->base;
  1125. xmlBufShrink(input->buf->buffer, processed);
  1126. input->buf->raw = input->buf->buffer;
  1127. input->buf->buffer = xmlBufCreate();
  1128. input->buf->rawconsumed = processed;
  1129. use = xmlBufUse(input->buf->raw);
  1130. if (ctxt->html) {
  1131. /*
  1132. * convert as much as possible of the buffer
  1133. */
  1134. nbchars = xmlCharEncInput(input->buf, 1);
  1135. } else {
  1136. /*
  1137. * convert just enough to get
  1138. * '<?xml version="1.0" encoding="xxx"?>'
  1139. * parsed with the autodetected encoding
  1140. * into the parser reading buffer.
  1141. */
  1142. nbchars = xmlCharEncFirstLineInput(input->buf, len);
  1143. }
  1144. xmlBufResetInput(input->buf->buffer, input);
  1145. if (nbchars < 0) {
  1146. xmlErrInternal(ctxt,
  1147. "switching encoding: encoder error\n",
  1148. NULL);
  1149. return (-1);
  1150. }
  1151. input->buf->rawconsumed += use - xmlBufUse(input->buf->raw);
  1152. }
  1153. return (0);
  1154. } else if (input->length == 0) {
  1155. /*
  1156. * When parsing a static memory array one must know the
  1157. * size to be able to convert the buffer.
  1158. */
  1159. xmlErrInternal(ctxt, "switching encoding : no input\n", NULL);
  1160. /*
  1161. * Callers assume that the input buffer takes ownership of the
  1162. * encoding handler. xmlCharEncCloseFunc frees unregistered
  1163. * handlers and avoids a memory leak.
  1164. */
  1165. xmlCharEncCloseFunc(handler);
  1166. return (-1);
  1167. }
  1168. /*
  1169. * We should actually raise an error here, see issue #34.
  1170. */
  1171. xmlCharEncCloseFunc(handler);
  1172. return (0);
  1173. }
  1174. /**
  1175. * xmlSwitchInputEncoding:
  1176. * @ctxt: the parser context
  1177. * @input: the input stream
  1178. * @handler: the encoding handler
  1179. *
  1180. * change the input functions when discovering the character encoding
  1181. * of a given entity.
  1182. *
  1183. * Returns 0 in case of success, -1 otherwise
  1184. */
  1185. int
  1186. xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
  1187. xmlCharEncodingHandlerPtr handler) {
  1188. return(xmlSwitchInputEncodingInt(ctxt, input, handler, -1));
  1189. }
  1190. /**
  1191. * xmlSwitchToEncodingInt:
  1192. * @ctxt: the parser context
  1193. * @handler: the encoding handler
  1194. * @len: the length to convert or -1
  1195. *
  1196. * change the input functions when discovering the character encoding
  1197. * of a given entity, and convert only @len bytes of the output, this
  1198. * is needed on auto detect to allows any declared encoding later to
  1199. * convert the actual content after the xmlDecl
  1200. *
  1201. * Returns 0 in case of success, -1 otherwise
  1202. */
  1203. static int
  1204. xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
  1205. xmlCharEncodingHandlerPtr handler, int len) {
  1206. int ret = 0;
  1207. if (handler != NULL) {
  1208. if (ctxt->input != NULL) {
  1209. ret = xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, len);
  1210. } else {
  1211. xmlErrInternal(ctxt, "xmlSwitchToEncoding : no input\n",
  1212. NULL);
  1213. return(-1);
  1214. }
  1215. /*
  1216. * The parsing is now done in UTF8 natively
  1217. */
  1218. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  1219. } else
  1220. return(-1);
  1221. return(ret);
  1222. }
  1223. /**
  1224. * xmlSwitchToEncoding:
  1225. * @ctxt: the parser context
  1226. * @handler: the encoding handler
  1227. *
  1228. * change the input functions when discovering the character encoding
  1229. * of a given entity.
  1230. *
  1231. * Returns 0 in case of success, -1 otherwise
  1232. */
  1233. int
  1234. xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
  1235. {
  1236. return (xmlSwitchToEncodingInt(ctxt, handler, -1));
  1237. }
  1238. /************************************************************************
  1239. * *
  1240. * Commodity functions to handle entities processing *
  1241. * *
  1242. ************************************************************************/
  1243. /**
  1244. * xmlFreeInputStream:
  1245. * @input: an xmlParserInputPtr
  1246. *
  1247. * Free up an input stream.
  1248. */
  1249. void
  1250. xmlFreeInputStream(xmlParserInputPtr input) {
  1251. if (input == NULL) return;
  1252. if (input->filename != NULL) xmlFree((char *) input->filename);
  1253. if (input->directory != NULL) xmlFree((char *) input->directory);
  1254. if (input->encoding != NULL) xmlFree((char *) input->encoding);
  1255. if (input->version != NULL) xmlFree((char *) input->version);
  1256. if ((input->free != NULL) && (input->base != NULL))
  1257. input->free((xmlChar *) input->base);
  1258. if (input->buf != NULL)
  1259. xmlFreeParserInputBuffer(input->buf);
  1260. xmlFree(input);
  1261. }
  1262. /**
  1263. * xmlNewInputStream:
  1264. * @ctxt: an XML parser context
  1265. *
  1266. * Create a new input stream structure.
  1267. *
  1268. * Returns the new input stream or NULL
  1269. */
  1270. xmlParserInputPtr
  1271. xmlNewInputStream(xmlParserCtxtPtr ctxt) {
  1272. xmlParserInputPtr input;
  1273. input = (xmlParserInputPtr) xmlMalloc(sizeof(xmlParserInput));
  1274. if (input == NULL) {
  1275. xmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
  1276. return(NULL);
  1277. }
  1278. memset(input, 0, sizeof(xmlParserInput));
  1279. input->line = 1;
  1280. input->col = 1;
  1281. input->standalone = -1;
  1282. /*
  1283. * If the context is NULL the id cannot be initialized, but that
  1284. * should not happen while parsing which is the situation where
  1285. * the id is actually needed.
  1286. */
  1287. if (ctxt != NULL)
  1288. input->id = ctxt->input_id++;
  1289. return(input);
  1290. }
  1291. /**
  1292. * xmlNewIOInputStream:
  1293. * @ctxt: an XML parser context
  1294. * @input: an I/O Input
  1295. * @enc: the charset encoding if known
  1296. *
  1297. * Create a new input stream structure encapsulating the @input into
  1298. * a stream suitable for the parser.
  1299. *
  1300. * Returns the new input stream or NULL
  1301. */
  1302. xmlParserInputPtr
  1303. xmlNewIOInputStream(xmlParserCtxtPtr ctxt, xmlParserInputBufferPtr input,
  1304. xmlCharEncoding enc) {
  1305. xmlParserInputPtr inputStream;
  1306. if (input == NULL) return(NULL);
  1307. if (xmlParserDebugEntities)
  1308. xmlGenericError(xmlGenericErrorContext, "new input from I/O\n");
  1309. inputStream = xmlNewInputStream(ctxt);
  1310. if (inputStream == NULL) {
  1311. return(NULL);
  1312. }
  1313. inputStream->filename = NULL;
  1314. inputStream->buf = input;
  1315. xmlBufResetInput(inputStream->buf->buffer, inputStream);
  1316. if (enc != XML_CHAR_ENCODING_NONE) {
  1317. xmlSwitchEncoding(ctxt, enc);
  1318. }
  1319. return(inputStream);
  1320. }
  1321. /**
  1322. * xmlNewEntityInputStream:
  1323. * @ctxt: an XML parser context
  1324. * @entity: an Entity pointer
  1325. *
  1326. * Create a new input stream based on an xmlEntityPtr
  1327. *
  1328. * Returns the new input stream or NULL
  1329. */
  1330. xmlParserInputPtr
  1331. xmlNewEntityInputStream(xmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
  1332. xmlParserInputPtr input;
  1333. if (entity == NULL) {
  1334. xmlErrInternal(ctxt, "xmlNewEntityInputStream entity = NULL\n",
  1335. NULL);
  1336. return(NULL);
  1337. }
  1338. if (xmlParserDebugEntities)
  1339. xmlGenericError(xmlGenericErrorContext,
  1340. "new input from entity: %s\n", entity->name);
  1341. if (entity->content == NULL) {
  1342. switch (entity->etype) {
  1343. case XML_EXTERNAL_GENERAL_UNPARSED_ENTITY:
  1344. xmlErrInternal(ctxt, "Cannot parse entity %s\n",
  1345. entity->name);
  1346. break;
  1347. case XML_EXTERNAL_GENERAL_PARSED_ENTITY:
  1348. case XML_EXTERNAL_PARAMETER_ENTITY:
  1349. return(xmlLoadExternalEntity((char *) entity->URI,
  1350. (char *) entity->ExternalID, ctxt));
  1351. case XML_INTERNAL_GENERAL_ENTITY:
  1352. xmlErrInternal(ctxt,
  1353. "Internal entity %s without content !\n",
  1354. entity->name);
  1355. break;
  1356. case XML_INTERNAL_PARAMETER_ENTITY:
  1357. xmlErrInternal(ctxt,
  1358. "Internal parameter entity %s without content !\n",
  1359. entity->name);
  1360. break;
  1361. case XML_INTERNAL_PREDEFINED_ENTITY:
  1362. xmlErrInternal(ctxt,
  1363. "Predefined entity %s without content !\n",
  1364. entity->name);
  1365. break;
  1366. }
  1367. return(NULL);
  1368. }
  1369. input = xmlNewInputStream(ctxt);
  1370. if (input == NULL) {
  1371. return(NULL);
  1372. }
  1373. if (entity->URI != NULL)
  1374. input->filename = (char *) xmlStrdup((xmlChar *) entity->URI);
  1375. input->base = entity->content;
  1376. if (entity->length == 0)
  1377. entity->length = xmlStrlen(entity->content);
  1378. input->cur = entity->content;
  1379. input->length = entity->length;
  1380. input->end = &entity->content[input->length];
  1381. return(input);
  1382. }
  1383. /**
  1384. * xmlNewStringInputStream:
  1385. * @ctxt: an XML parser context
  1386. * @buffer: an memory buffer
  1387. *
  1388. * Create a new input stream based on a memory buffer.
  1389. * Returns the new input stream
  1390. */
  1391. xmlParserInputPtr
  1392. xmlNewStringInputStream(xmlParserCtxtPtr ctxt, const xmlChar *buffer) {
  1393. xmlParserInputPtr input;
  1394. if (buffer == NULL) {
  1395. xmlErrInternal(ctxt, "xmlNewStringInputStream string = NULL\n",
  1396. NULL);
  1397. return(NULL);
  1398. }
  1399. if (xmlParserDebugEntities)
  1400. xmlGenericError(xmlGenericErrorContext,
  1401. "new fixed input: %.30s\n", buffer);
  1402. input = xmlNewInputStream(ctxt);
  1403. if (input == NULL) {
  1404. xmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
  1405. return(NULL);
  1406. }
  1407. input->base = buffer;
  1408. input->cur = buffer;
  1409. input->length = xmlStrlen(buffer);
  1410. input->end = &buffer[input->length];
  1411. return(input);
  1412. }
  1413. /**
  1414. * xmlNewInputFromFile:
  1415. * @ctxt: an XML parser context
  1416. * @filename: the filename to use as entity
  1417. *
  1418. * Create a new input stream based on a file or an URL.
  1419. *
  1420. * Returns the new input stream or NULL in case of error
  1421. */
  1422. xmlParserInputPtr
  1423. xmlNewInputFromFile(xmlParserCtxtPtr ctxt, const char *filename) {
  1424. xmlParserInputBufferPtr buf;
  1425. xmlParserInputPtr inputStream;
  1426. char *directory = NULL;
  1427. xmlChar *URI = NULL;
  1428. if (xmlParserDebugEntities)
  1429. xmlGenericError(xmlGenericErrorContext,
  1430. "new input from file: %s\n", filename);
  1431. if (ctxt == NULL) return(NULL);
  1432. buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
  1433. if (buf == NULL) {
  1434. if (filename == NULL)
  1435. __xmlLoaderErr(ctxt,
  1436. "failed to load external entity: NULL filename \n",
  1437. NULL);
  1438. else
  1439. __xmlLoaderErr(ctxt, "failed to load external entity \"%s\"\n",
  1440. (const char *) filename);
  1441. return(NULL);
  1442. }
  1443. inputStream = xmlNewInputStream(ctxt);
  1444. if (inputStream == NULL)
  1445. return(NULL);
  1446. inputStream->buf = buf;
  1447. inputStream = xmlCheckHTTPInput(ctxt, inputStream);
  1448. if (inputStream == NULL)
  1449. return(NULL);
  1450. if (inputStream->filename == NULL)
  1451. URI = xmlStrdup((xmlChar *) filename);
  1452. else
  1453. URI = xmlStrdup((xmlChar *) inputStream->filename);
  1454. directory = xmlParserGetDirectory((const char *) URI);
  1455. if (inputStream->filename != NULL) xmlFree((char *)inputStream->filename);
  1456. inputStream->filename = (char *) xmlCanonicPath((const xmlChar *) URI);
  1457. if (URI != NULL) xmlFree((char *) URI);
  1458. inputStream->directory = directory;
  1459. xmlBufResetInput(inputStream->buf->buffer, inputStream);
  1460. if ((ctxt->directory == NULL) && (directory != NULL))
  1461. ctxt->directory = (char *) xmlStrdup((const xmlChar *) directory);
  1462. return(inputStream);
  1463. }
  1464. /************************************************************************
  1465. * *
  1466. * Commodity functions to handle parser contexts *
  1467. * *
  1468. ************************************************************************/
  1469. /**
  1470. * xmlInitParserCtxt:
  1471. * @ctxt: an XML parser context
  1472. *
  1473. * Initialize a parser context
  1474. *
  1475. * Returns 0 in case of success and -1 in case of error
  1476. */
  1477. int
  1478. xmlInitParserCtxt(xmlParserCtxtPtr ctxt)
  1479. {
  1480. xmlParserInputPtr input;
  1481. if(ctxt==NULL) {
  1482. xmlErrInternal(NULL, "Got NULL parser context\n", NULL);
  1483. return(-1);
  1484. }
  1485. xmlDefaultSAXHandlerInit();
  1486. if (ctxt->dict == NULL)
  1487. ctxt->dict = xmlDictCreate();
  1488. if (ctxt->dict == NULL) {
  1489. xmlErrMemory(NULL, "cannot initialize parser context\n");
  1490. return(-1);
  1491. }
  1492. xmlDictSetLimit(ctxt->dict, XML_MAX_DICTIONARY_LIMIT);
  1493. if (ctxt->sax == NULL)
  1494. ctxt->sax = (xmlSAXHandler *) xmlMalloc(sizeof(xmlSAXHandler));
  1495. if (ctxt->sax == NULL) {
  1496. xmlErrMemory(NULL, "cannot initialize parser context\n");
  1497. return(-1);
  1498. }
  1499. else
  1500. xmlSAXVersion(ctxt->sax, 2);
  1501. ctxt->maxatts = 0;
  1502. ctxt->atts = NULL;
  1503. /* Allocate the Input stack */
  1504. if (ctxt->inputTab == NULL) {
  1505. ctxt->inputTab = (xmlParserInputPtr *)
  1506. xmlMalloc(5 * sizeof(xmlParserInputPtr));
  1507. ctxt->inputMax = 5;
  1508. }
  1509. if (ctxt->inputTab == NULL) {
  1510. xmlErrMemory(NULL, "cannot initialize parser context\n");
  1511. ctxt->inputNr = 0;
  1512. ctxt->inputMax = 0;
  1513. ctxt->input = NULL;
  1514. return(-1);
  1515. }
  1516. while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
  1517. xmlFreeInputStream(input);
  1518. }
  1519. ctxt->inputNr = 0;
  1520. ctxt->input = NULL;
  1521. ctxt->version = NULL;
  1522. ctxt->encoding = NULL;
  1523. ctxt->standalone = -1;
  1524. ctxt->hasExternalSubset = 0;
  1525. ctxt->hasPErefs = 0;
  1526. ctxt->html = 0;
  1527. ctxt->external = 0;
  1528. ctxt->instate = XML_PARSER_START;
  1529. ctxt->token = 0;
  1530. ctxt->directory = NULL;
  1531. /* Allocate the Node stack */
  1532. if (ctxt->nodeTab == NULL) {
  1533. ctxt->nodeTab = (xmlNodePtr *) xmlMalloc(10 * sizeof(xmlNodePtr));
  1534. ctxt->nodeMax = 10;
  1535. }
  1536. if (ctxt->nodeTab == NULL) {
  1537. xmlErrMemory(NULL, "cannot initialize parser context\n");
  1538. ctxt->nodeNr = 0;
  1539. ctxt->nodeMax = 0;
  1540. ctxt->node = NULL;
  1541. ctxt->inputNr = 0;
  1542. ctxt->inputMax = 0;
  1543. ctxt->input = NULL;
  1544. return(-1);
  1545. }
  1546. ctxt->nodeNr = 0;
  1547. ctxt->node = NULL;
  1548. /* Allocate the Name stack */
  1549. if (ctxt->nameTab == NULL) {
  1550. ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
  1551. ctxt->nameMax = 10;
  1552. }
  1553. if (ctxt->nameTab == NULL) {
  1554. xmlErrMemory(NULL, "cannot initialize parser context\n");
  1555. ctxt->nodeNr = 0;
  1556. ctxt->nodeMax = 0;
  1557. ctxt->node = NULL;
  1558. ctxt->inputNr = 0;
  1559. ctxt->inputMax = 0;
  1560. ctxt->input = NULL;
  1561. ctxt->nameNr = 0;
  1562. ctxt->nameMax = 0;
  1563. ctxt->name = NULL;
  1564. return(-1);
  1565. }
  1566. ctxt->nameNr = 0;
  1567. ctxt->name = NULL;
  1568. /* Allocate the space stack */
  1569. if (ctxt->spaceTab == NULL) {
  1570. ctxt->spaceTab = (int *) xmlMalloc(10 * sizeof(int));
  1571. ctxt->spaceMax = 10;
  1572. }
  1573. if (ctxt->spaceTab == NULL) {
  1574. xmlErrMemory(NULL, "cannot initialize parser context\n");
  1575. ctxt->nodeNr = 0;
  1576. ctxt->nodeMax = 0;
  1577. ctxt->node = NULL;
  1578. ctxt->inputNr = 0;
  1579. ctxt->inputMax = 0;
  1580. ctxt->input = NULL;
  1581. ctxt->nameNr = 0;
  1582. ctxt->nameMax = 0;
  1583. ctxt->name = NULL;
  1584. ctxt->spaceNr = 0;
  1585. ctxt->spaceMax = 0;
  1586. ctxt->space = NULL;
  1587. return(-1);
  1588. }
  1589. ctxt->spaceNr = 1;
  1590. ctxt->spaceMax = 10;
  1591. ctxt->spaceTab[0] = -1;
  1592. ctxt->space = &ctxt->spaceTab[0];
  1593. ctxt->userData = ctxt;
  1594. ctxt->myDoc = NULL;
  1595. ctxt->wellFormed = 1;
  1596. ctxt->nsWellFormed = 1;
  1597. ctxt->valid = 1;
  1598. ctxt->loadsubset = xmlLoadExtDtdDefaultValue;
  1599. if (ctxt->loadsubset) {
  1600. ctxt->options |= XML_PARSE_DTDLOAD;
  1601. }
  1602. ctxt->validate = xmlDoValidityCheckingDefaultValue;
  1603. ctxt->pedantic = xmlPedanticParserDefaultValue;
  1604. if (ctxt->pedantic) {
  1605. ctxt->options |= XML_PARSE_PEDANTIC;
  1606. }
  1607. ctxt->linenumbers = xmlLineNumbersDefaultValue;
  1608. ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
  1609. if (ctxt->keepBlanks == 0) {
  1610. ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
  1611. ctxt->options |= XML_PARSE_NOBLANKS;
  1612. }
  1613. ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
  1614. ctxt->vctxt.userData = ctxt;
  1615. ctxt->vctxt.error = xmlParserValidityError;
  1616. ctxt->vctxt.warning = xmlParserValidityWarning;
  1617. if (ctxt->validate) {
  1618. if (xmlGetWarningsDefaultValue == 0)
  1619. ctxt->vctxt.warning = NULL;
  1620. else
  1621. ctxt->vctxt.warning = xmlParserValidityWarning;
  1622. ctxt->vctxt.nodeMax = 0;
  1623. ctxt->options |= XML_PARSE_DTDVALID;
  1624. }
  1625. ctxt->replaceEntities = xmlSubstituteEntitiesDefaultValue;
  1626. if (ctxt->replaceEntities) {
  1627. ctxt->options |= XML_PARSE_NOENT;
  1628. }
  1629. ctxt->record_info = 0;
  1630. ctxt->checkIndex = 0;
  1631. ctxt->inSubset = 0;
  1632. ctxt->errNo = XML_ERR_OK;
  1633. ctxt->depth = 0;
  1634. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  1635. ctxt->catalogs = NULL;
  1636. ctxt->nbentities = 0;
  1637. ctxt->sizeentities = 0;
  1638. ctxt->sizeentcopy = 0;
  1639. ctxt->input_id = 1;
  1640. xmlInitNodeInfoSeq(&ctxt->node_seq);
  1641. return(0);
  1642. }
  1643. /**
  1644. * xmlFreeParserCtxt:
  1645. * @ctxt: an XML parser context
  1646. *
  1647. * Free all the memory used by a parser context. However the parsed
  1648. * document in ctxt->myDoc is not freed.
  1649. */
  1650. void
  1651. xmlFreeParserCtxt(xmlParserCtxtPtr ctxt)
  1652. {
  1653. xmlParserInputPtr input;
  1654. if (ctxt == NULL) return;
  1655. while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
  1656. xmlFreeInputStream(input);
  1657. }
  1658. if (ctxt->spaceTab != NULL) xmlFree(ctxt->spaceTab);
  1659. if (ctxt->nameTab != NULL) xmlFree((xmlChar * *)ctxt->nameTab);
  1660. if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
  1661. if (ctxt->nodeInfoTab != NULL) xmlFree(ctxt->nodeInfoTab);
  1662. if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
  1663. if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
  1664. if (ctxt->encoding != NULL) xmlFree((char *) ctxt->encoding);
  1665. if (ctxt->extSubURI != NULL) xmlFree((char *) ctxt->extSubURI);
  1666. if (ctxt->extSubSystem != NULL) xmlFree((char *) ctxt->extSubSystem);
  1667. #ifdef LIBXML_SAX1_ENABLED
  1668. if ((ctxt->sax != NULL) &&
  1669. (ctxt->sax != (xmlSAXHandlerPtr) &xmlDefaultSAXHandler))
  1670. #else
  1671. if (ctxt->sax != NULL)
  1672. #endif /* LIBXML_SAX1_ENABLED */
  1673. xmlFree(ctxt->sax);
  1674. if (ctxt->directory != NULL) xmlFree((char *) ctxt->directory);
  1675. if (ctxt->vctxt.nodeTab != NULL) xmlFree(ctxt->vctxt.nodeTab);
  1676. if (ctxt->atts != NULL) xmlFree((xmlChar * *)ctxt->atts);
  1677. if (ctxt->dict != NULL) xmlDictFree(ctxt->dict);
  1678. if (ctxt->nsTab != NULL) xmlFree((char *) ctxt->nsTab);
  1679. if (ctxt->pushTab != NULL) xmlFree(ctxt->pushTab);
  1680. if (ctxt->attallocs != NULL) xmlFree(ctxt->attallocs);
  1681. if (ctxt->attsDefault != NULL)
  1682. xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
  1683. if (ctxt->attsSpecial != NULL)
  1684. xmlHashFree(ctxt->attsSpecial, NULL);
  1685. if (ctxt->freeElems != NULL) {
  1686. xmlNodePtr cur, next;
  1687. cur = ctxt->freeElems;
  1688. while (cur != NULL) {
  1689. next = cur->next;
  1690. xmlFree(cur);
  1691. cur = next;
  1692. }
  1693. }
  1694. if (ctxt->freeAttrs != NULL) {
  1695. xmlAttrPtr cur, next;
  1696. cur = ctxt->freeAttrs;
  1697. while (cur != NULL) {
  1698. next = cur->next;
  1699. xmlFree(cur);
  1700. cur = next;
  1701. }
  1702. }
  1703. /*
  1704. * cleanup the error strings
  1705. */
  1706. if (ctxt->lastError.message != NULL)
  1707. xmlFree(ctxt->lastError.message);
  1708. if (ctxt->lastError.file != NULL)
  1709. xmlFree(ctxt->lastError.file);
  1710. if (ctxt->lastError.str1 != NULL)
  1711. xmlFree(ctxt->lastError.str1);
  1712. if (ctxt->lastError.str2 != NULL)
  1713. xmlFree(ctxt->lastError.str2);
  1714. if (ctxt->lastError.str3 != NULL)
  1715. xmlFree(ctxt->lastError.str3);
  1716. #ifdef LIBXML_CATALOG_ENABLED
  1717. if (ctxt->catalogs != NULL)
  1718. xmlCatalogFreeLocal(ctxt->catalogs);
  1719. #endif
  1720. xmlFree(ctxt);
  1721. }
  1722. /**
  1723. * xmlNewParserCtxt:
  1724. *
  1725. * Allocate and initialize a new parser context.
  1726. *
  1727. * Returns the xmlParserCtxtPtr or NULL
  1728. */
  1729. xmlParserCtxtPtr
  1730. xmlNewParserCtxt(void)
  1731. {
  1732. xmlParserCtxtPtr ctxt;
  1733. ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
  1734. if (ctxt == NULL) {
  1735. xmlErrMemory(NULL, "cannot allocate parser context\n");
  1736. return(NULL);
  1737. }
  1738. memset(ctxt, 0, sizeof(xmlParserCtxt));
  1739. if (xmlInitParserCtxt(ctxt) < 0) {
  1740. xmlFreeParserCtxt(ctxt);
  1741. return(NULL);
  1742. }
  1743. return(ctxt);
  1744. }
  1745. /************************************************************************
  1746. * *
  1747. * Handling of node information *
  1748. * *
  1749. ************************************************************************/
  1750. /**
  1751. * xmlClearParserCtxt:
  1752. * @ctxt: an XML parser context
  1753. *
  1754. * Clear (release owned resources) and reinitialize a parser context
  1755. */
  1756. void
  1757. xmlClearParserCtxt(xmlParserCtxtPtr ctxt)
  1758. {
  1759. if (ctxt==NULL)
  1760. return;
  1761. xmlClearNodeInfoSeq(&ctxt->node_seq);
  1762. xmlCtxtReset(ctxt);
  1763. }
  1764. /**
  1765. * xmlParserFindNodeInfo:
  1766. * @ctx: an XML parser context
  1767. * @node: an XML node within the tree
  1768. *
  1769. * Find the parser node info struct for a given node
  1770. *
  1771. * Returns an xmlParserNodeInfo block pointer or NULL
  1772. */
  1773. const xmlParserNodeInfo *
  1774. xmlParserFindNodeInfo(const xmlParserCtxtPtr ctx, const xmlNodePtr node)
  1775. {
  1776. unsigned long pos;
  1777. if ((ctx == NULL) || (node == NULL))
  1778. return (NULL);
  1779. /* Find position where node should be at */
  1780. pos = xmlParserFindNodeInfoIndex(&ctx->node_seq, node);
  1781. if (pos < ctx->node_seq.length
  1782. && ctx->node_seq.buffer[pos].node == node)
  1783. return &ctx->node_seq.buffer[pos];
  1784. else
  1785. return NULL;
  1786. }
  1787. /**
  1788. * xmlInitNodeInfoSeq:
  1789. * @seq: a node info sequence pointer
  1790. *
  1791. * -- Initialize (set to initial state) node info sequence
  1792. */
  1793. void
  1794. xmlInitNodeInfoSeq(xmlParserNodeInfoSeqPtr seq)
  1795. {
  1796. if (seq == NULL)
  1797. return;
  1798. seq->length = 0;
  1799. seq->maximum = 0;
  1800. seq->buffer = NULL;
  1801. }
  1802. /**
  1803. * xmlClearNodeInfoSeq:
  1804. * @seq: a node info sequence pointer
  1805. *
  1806. * -- Clear (release memory and reinitialize) node
  1807. * info sequence
  1808. */
  1809. void
  1810. xmlClearNodeInfoSeq(xmlParserNodeInfoSeqPtr seq)
  1811. {
  1812. if (seq == NULL)
  1813. return;
  1814. if (seq->buffer != NULL)
  1815. xmlFree(seq->buffer);
  1816. xmlInitNodeInfoSeq(seq);
  1817. }
  1818. /**
  1819. * xmlParserFindNodeInfoIndex:
  1820. * @seq: a node info sequence pointer
  1821. * @node: an XML node pointer
  1822. *
  1823. *
  1824. * xmlParserFindNodeInfoIndex : Find the index that the info record for
  1825. * the given node is or should be at in a sorted sequence
  1826. *
  1827. * Returns a long indicating the position of the record
  1828. */
  1829. unsigned long
  1830. xmlParserFindNodeInfoIndex(const xmlParserNodeInfoSeqPtr seq,
  1831. const xmlNodePtr node)
  1832. {
  1833. unsigned long upper, lower, middle;
  1834. int found = 0;
  1835. if ((seq == NULL) || (node == NULL))
  1836. return ((unsigned long) -1);
  1837. /* Do a binary search for the key */
  1838. lower = 1;
  1839. upper = seq->length;
  1840. middle = 0;
  1841. while (lower <= upper && !found) {
  1842. middle = lower + (upper - lower) / 2;
  1843. if (node == seq->buffer[middle - 1].node)
  1844. found = 1;
  1845. else if (node < seq->buffer[middle - 1].node)
  1846. upper = middle - 1;
  1847. else
  1848. lower = middle + 1;
  1849. }
  1850. /* Return position */
  1851. if (middle == 0 || seq->buffer[middle - 1].node < node)
  1852. return middle;
  1853. else
  1854. return middle - 1;
  1855. }
  1856. /**
  1857. * xmlParserAddNodeInfo:
  1858. * @ctxt: an XML parser context
  1859. * @info: a node info sequence pointer
  1860. *
  1861. * Insert node info record into the sorted sequence
  1862. */
  1863. void
  1864. xmlParserAddNodeInfo(xmlParserCtxtPtr ctxt,
  1865. const xmlParserNodeInfoPtr info)
  1866. {
  1867. unsigned long pos;
  1868. if ((ctxt == NULL) || (info == NULL)) return;
  1869. /* Find pos and check to see if node is already in the sequence */
  1870. pos = xmlParserFindNodeInfoIndex(&ctxt->node_seq, (xmlNodePtr)
  1871. info->node);
  1872. if ((pos < ctxt->node_seq.length) &&
  1873. (ctxt->node_seq.buffer != NULL) &&
  1874. (ctxt->node_seq.buffer[pos].node == info->node)) {
  1875. ctxt->node_seq.buffer[pos] = *info;
  1876. }
  1877. /* Otherwise, we need to add new node to buffer */
  1878. else {
  1879. if ((ctxt->node_seq.length + 1 > ctxt->node_seq.maximum) ||
  1880. (ctxt->node_seq.buffer == NULL)) {
  1881. xmlParserNodeInfo *tmp_buffer;
  1882. unsigned int byte_size;
  1883. if (ctxt->node_seq.maximum == 0)
  1884. ctxt->node_seq.maximum = 2;
  1885. byte_size = (sizeof(*ctxt->node_seq.buffer) *
  1886. (2 * ctxt->node_seq.maximum));
  1887. if (ctxt->node_seq.buffer == NULL)
  1888. tmp_buffer = (xmlParserNodeInfo *) xmlMalloc(byte_size);
  1889. else
  1890. tmp_buffer =
  1891. (xmlParserNodeInfo *) xmlRealloc(ctxt->node_seq.buffer,
  1892. byte_size);
  1893. if (tmp_buffer == NULL) {
  1894. xmlErrMemory(ctxt, "failed to allocate buffer\n");
  1895. return;
  1896. }
  1897. ctxt->node_seq.buffer = tmp_buffer;
  1898. ctxt->node_seq.maximum *= 2;
  1899. }
  1900. /* If position is not at end, move elements out of the way */
  1901. if (pos != ctxt->node_seq.length) {
  1902. unsigned long i;
  1903. for (i = ctxt->node_seq.length; i > pos; i--)
  1904. ctxt->node_seq.buffer[i] = ctxt->node_seq.buffer[i - 1];
  1905. }
  1906. /* Copy element and increase length */
  1907. ctxt->node_seq.buffer[pos] = *info;
  1908. ctxt->node_seq.length++;
  1909. }
  1910. }
  1911. /************************************************************************
  1912. * *
  1913. * Defaults settings *
  1914. * *
  1915. ************************************************************************/
  1916. /**
  1917. * xmlPedanticParserDefault:
  1918. * @val: int 0 or 1
  1919. *
  1920. * Set and return the previous value for enabling pedantic warnings.
  1921. *
  1922. * Returns the last value for 0 for no substitution, 1 for substitution.
  1923. */
  1924. int
  1925. xmlPedanticParserDefault(int val) {
  1926. int old = xmlPedanticParserDefaultValue;
  1927. xmlPedanticParserDefaultValue = val;
  1928. return(old);
  1929. }
  1930. /**
  1931. * xmlLineNumbersDefault:
  1932. * @val: int 0 or 1
  1933. *
  1934. * Set and return the previous value for enabling line numbers in elements
  1935. * contents. This may break on old application and is turned off by default.
  1936. *
  1937. * Returns the last value for 0 for no substitution, 1 for substitution.
  1938. */
  1939. int
  1940. xmlLineNumbersDefault(int val) {
  1941. int old = xmlLineNumbersDefaultValue;
  1942. xmlLineNumbersDefaultValue = val;
  1943. return(old);
  1944. }
  1945. /**
  1946. * xmlSubstituteEntitiesDefault:
  1947. * @val: int 0 or 1
  1948. *
  1949. * Set and return the previous value for default entity support.
  1950. * Initially the parser always keep entity references instead of substituting
  1951. * entity values in the output. This function has to be used to change the
  1952. * default parser behavior
  1953. * SAX::substituteEntities() has to be used for changing that on a file by
  1954. * file basis.
  1955. *
  1956. * Returns the last value for 0 for no substitution, 1 for substitution.
  1957. */
  1958. int
  1959. xmlSubstituteEntitiesDefault(int val) {
  1960. int old = xmlSubstituteEntitiesDefaultValue;
  1961. xmlSubstituteEntitiesDefaultValue = val;
  1962. return(old);
  1963. }
  1964. /**
  1965. * xmlKeepBlanksDefault:
  1966. * @val: int 0 or 1
  1967. *
  1968. * Set and return the previous value for default blanks text nodes support.
  1969. * The 1.x version of the parser used an heuristic to try to detect
  1970. * ignorable white spaces. As a result the SAX callback was generating
  1971. * xmlSAX2IgnorableWhitespace() callbacks instead of characters() one, and when
  1972. * using the DOM output text nodes containing those blanks were not generated.
  1973. * The 2.x and later version will switch to the XML standard way and
  1974. * ignorableWhitespace() are only generated when running the parser in
  1975. * validating mode and when the current element doesn't allow CDATA or
  1976. * mixed content.
  1977. * This function is provided as a way to force the standard behavior
  1978. * on 1.X libs and to switch back to the old mode for compatibility when
  1979. * running 1.X client code on 2.X . Upgrade of 1.X code should be done
  1980. * by using xmlIsBlankNode() commodity function to detect the "empty"
  1981. * nodes generated.
  1982. * This value also affect autogeneration of indentation when saving code
  1983. * if blanks sections are kept, indentation is not generated.
  1984. *
  1985. * Returns the last value for 0 for no substitution, 1 for substitution.
  1986. */
  1987. int
  1988. xmlKeepBlanksDefault(int val) {
  1989. int old = xmlKeepBlanksDefaultValue;
  1990. xmlKeepBlanksDefaultValue = val;
  1991. if (!val) xmlIndentTreeOutput = 1;
  1992. return(old);
  1993. }
  1994. #define bottom_parserInternals
  1995. #include "elfgcchack.h"