rapidxml_sax3.hpp 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111
  1. #ifndef RAPIDXML_SAX3_HPP_INCLUDED
  2. #define RAPIDXML_SAX3_HPP_INCLUDED
  3. // Copyright (C) 2006, 2009 Marcin Kalicinski
  4. // Version 1.13
  5. // Revision $DateTime: 2009/05/13 01:46:17 $
  6. //! \file rapidxml_sax3.hpp This file contains rapidxml SAX parser implementation
  7. #include <vector>
  8. #include <utility>
  9. #include "rapidxml.hpp"
  10. // On MSVC, disable "conditional expression is constant" warning (level 4).
  11. // This warning is almost impossible to avoid with certain types of templated code
  12. #ifdef _MSC_VER
  13. #pragma warning(push)
  14. #pragma warning(disable:4127) // Conditional expression is constant
  15. #endif
  16. #if !defined(RAPIDXML_PARSE_ERROR)
  17. #define RAPIDXML_PARSE_ERROR(what, where) throw parse_error(what, where)
  18. #endif
  19. namespace rapidxml
  20. {
  21. const int parse_normal = parse_no_data_nodes;
  22. typedef std::pair<char*, size_t> tok_string;
  23. typedef std::pair<const char*, size_t> const_tok_string;
  //! Callback interface through which xml_sax3_parser reports what it finds.
  //! Every string is passed as a pointer plus an explicit length.
  class xml_sax3_handler
  {
  public:
      virtual ~xml_sax3_handler() = default;

      //! An element name has been scanned; \a name points into the parsed buffer.
      virtual void xmlSAX3StartElement(char *name, size_t nameLen) = 0;

      //! One attribute (name and value) of the current element.
      // NOTE(review): the call site is in parse_node_attributes, which is not fully visible here.
      virtual void xmlSAX3Attr(const char *name, size_t nameLen,
                               const char *value, size_t valueLen) = 0;

      //! Attribute parsing for the current element has finished.
      virtual void xmlSAX3EndAttr() = 0;

      //! The current element is complete (its contents, if any, were already reported).
      virtual void xmlSAX3EndElement(const char *name, size_t nameLen) = 0;

      //! A run of character data of \a len characters.
      virtual void xmlSAX3Text(const char *text, size_t len) = 0;
  };
  35. ///////////////////////////////////////////////////////////////////////////
  36. // XML sax parser
  37. class xml_sax2_handler : public xml_sax3_handler
  38. {
  39. public:
  40. xml_sax2_handler() { elementAttrs.reserve(64); }
  41. /**
  42. * @remark: The parameter 'name' without null terminator charactor
  43. */
  44. virtual void xmlSAX2StartElement(const char *name, size_t, const char **atts, size_t) = 0;
  45. /**
  46. * @remark: The parameter 'name' has null terminator charactor
  47. */
  48. virtual void xmlSAX2EndElement(const char *name, size_t) = 0;
  49. /**
  50. * @remark: The parameter 's' has null terminator charactor
  51. */
  52. virtual void xmlSAX2Text(const char *s, size_t) = 0;
  53. /// Implement SAX3 interfaces:
  54. virtual void xmlSAX3StartElement(char * name, size_t size) final
  55. {
  56. elementName.first = name;
  57. elementName.second = size;
  58. }
  59. virtual void xmlSAX3Attr(
  60. const char* name, size_t,
  61. const char* value, size_t) final
  62. {
  63. elementAttrs.push_back(name);
  64. elementAttrs.push_back(value);
  65. }
  66. void xmlSAX3EndAttr() final
  67. {
  68. auto chTemp = elementName.first[elementName.second];
  69. elementName.first[elementName.second] = '\0';
  70. if (!elementAttrs.empty()) {
  71. elementAttrs.push_back(nullptr);
  72. xmlSAX2StartElement(elementName.first, elementName.second, &elementAttrs[0], elementAttrs.size() - 1);
  73. elementAttrs.clear();
  74. }
  75. else {
  76. const char* attr = nullptr;
  77. const char** attrs = &attr;
  78. xmlSAX2StartElement(elementName.first, elementName.second, attrs, 0);
  79. }
  80. elementName.first[elementName.second] = chTemp;
  81. }
  82. virtual void xmlSAX3EndElement(const char *name, size_t len) final
  83. {
  84. xmlSAX2EndElement(name, len);
  85. }
  86. virtual void xmlSAX3Text(const char *s, size_t len) final
  87. {
  88. xmlSAX2Text(s, len);
  89. }
  90. private:
  91. tok_string elementName;
  92. std::vector<const char*> elementAttrs;
  93. };
  //! SAX-style XML parser.
  //! It builds no DOM tree and is neither an xml_node nor a memory_pool:
  //! parse() scans the text and reports elements, attributes and character data
  //! to the xml_sax3_handler that was passed to the constructor.
  //! \param Ch Character type to use.
  101. template<class Ch = char>
  102. class xml_sax3_parser
  103. {
  104. xml_sax3_handler* handler_;
  105. public:
  106. //! Constructs empty XML document
  107. xml_sax3_parser(xml_sax3_handler* handler)
  108. {
  109. handler_ = handler;
  110. endptr_ = nullptr;
  111. }
  112. Ch *endptr_;
  113. //! Parses zero-terminated XML string according to given flags.
  114. //! Passed string will be modified by the parser, unless rapidxml::parse_non_destructive flag is used.
  115. //! The string must persist for the lifetime of the document.
  116. //! In case of error, rapidxml::parse_error exception will be thrown.
  117. //! <br><br>
  118. //! If you want to parse contents of a file, you must first load the file into the memory, and pass pointer to its beginning.
  119. //! Make sure that data is zero-terminated.
  120. //! <br><br>
  121. //! Document can be parsed into multiple times.
  122. //! Each new call to parse removes previous nodes and attributes (if any), but does not clear memory pool.
  123. //! \param text XML data to parse; pointer is non-const to denote fact that this data may be modified by the parser.
  124. template<int Flags = parse_normal>
  125. void parse(Ch *text, int nLen)
  126. {
  127. assert(text);
  128. // Remove current contents
  129. //this->remove_all_nodes();
  130. //this->remove_all_attributes();
  131. endptr_ = nullptr;
  132. if (nLen > 0)
  133. {
  134. endptr_ = text + nLen;
  135. }
  136. // Parse BOM, if any
  137. parse_bom<Flags>(text);
  138. // Parse children
  139. while (1)
  140. {
  141. // Skip whitespace before node
  142. skip<whitespace_pred, Flags>(text, endptr_);
  143. if (*text == 0 || text >= endptr_)
  144. break;
  145. // Parse and append new child
  146. if (*text == Ch('<'))
  147. {
  148. ++text; // Skip '<'
  149. parse_node<Flags>(text);
  150. }
  151. else
  152. RAPIDXML_PARSE_ERROR("expected <", text);
  153. }
  154. }
  //! Currently a no-op.
  //! The DOM-style cleanup calls (remove_all_nodes, remove_all_attributes,
  //! memory_pool::clear) are disabled in this parser.
  void clear()
  {
  }
  163. private:
  164. ///////////////////////////////////////////////////////////////////////
  165. // Internal character utility functions
  166. // Detect whitespace character
  167. struct whitespace_pred
  168. {
  169. static unsigned char test(Ch ch)
  170. {
  171. return internal::lookup_tables<0>::lookup_whitespace[static_cast<unsigned char>(ch)];
  172. }
  173. };
  174. // Detect node name character
  175. struct node_name_pred
  176. {
  177. static unsigned char test(Ch ch)
  178. {
  179. return internal::lookup_tables<0>::lookup_node_name[static_cast<unsigned char>(ch)];
  180. }
  181. };
  182. // Detect attribute name character
  183. struct attribute_name_pred
  184. {
  185. static unsigned char test(Ch ch)
  186. {
  187. return internal::lookup_tables<0>::lookup_attribute_name[static_cast<unsigned char>(ch)];
  188. }
  189. };
  190. // Detect text character (PCDATA)
  191. struct text_pred
  192. {
  193. static unsigned char test(Ch ch)
  194. {
  195. return internal::lookup_tables<0>::lookup_text[static_cast<unsigned char>(ch)];
  196. }
  197. };
  198. // Detect text character (PCDATA) that does not require processing
  199. struct text_pure_no_ws_pred
  200. {
  201. static unsigned char test(Ch ch)
  202. {
  203. return internal::lookup_tables<0>::lookup_text_pure_no_ws[static_cast<unsigned char>(ch)];
  204. }
  205. };
  206. // Detect text character (PCDATA) that does not require processing
  207. struct text_pure_with_ws_pred
  208. {
  209. static unsigned char test(Ch ch)
  210. {
  211. return internal::lookup_tables<0>::lookup_text_pure_with_ws[static_cast<unsigned char>(ch)];
  212. }
  213. };
  214. // Detect attribute value character
  215. template<Ch Quote>
  216. struct attribute_value_pred
  217. {
  218. static unsigned char test(Ch ch)
  219. {
  220. if (Quote == Ch('\''))
  221. return internal::lookup_tables<0>::lookup_attribute_data_1[static_cast<unsigned char>(ch)];
  222. if (Quote == Ch('\"'))
  223. return internal::lookup_tables<0>::lookup_attribute_data_2[static_cast<unsigned char>(ch)];
  224. return 0; // Should never be executed, to avoid warnings on Comeau
  225. }
  226. };
  227. // Detect attribute value character
  228. template<Ch Quote>
  229. struct attribute_value_pure_pred
  230. {
  231. static unsigned char test(Ch ch)
  232. {
  233. if (Quote == Ch('\''))
  234. return internal::lookup_tables<0>::lookup_attribute_data_1_pure[static_cast<unsigned char>(ch)];
  235. if (Quote == Ch('\"'))
  236. return internal::lookup_tables<0>::lookup_attribute_data_2_pure[static_cast<unsigned char>(ch)];
  237. return 0; // Should never be executed, to avoid warnings on Comeau
  238. }
  239. };
  240. // Insert coded character, using UTF8 or 8-bit ASCII
  241. template<int Flags>
  242. static void insert_coded_character(Ch *&text, unsigned long code)
  243. {
  244. if (Flags & parse_no_utf8)
  245. {
  246. // Insert 8-bit ASCII character
  247. // Todo: possibly verify that code is less than 256 and use replacement char otherwise?
  248. text[0] = static_cast<unsigned char>(code);
  249. text += 1;
  250. }
  251. else
  252. {
  253. // Insert UTF8 sequence
  254. if (code < 0x80) // 1 byte sequence
  255. {
  256. text[0] = static_cast<unsigned char>(code);
  257. text += 1;
  258. }
  259. else if (code < 0x800) // 2 byte sequence
  260. {
  261. text[1] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  262. text[0] = static_cast<unsigned char>(code | 0xC0);
  263. text += 2;
  264. }
  265. else if (code < 0x10000) // 3 byte sequence
  266. {
  267. text[2] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  268. text[1] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  269. text[0] = static_cast<unsigned char>(code | 0xE0);
  270. text += 3;
  271. }
  272. else if (code < 0x110000) // 4 byte sequence
  273. {
  274. text[3] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  275. text[2] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  276. text[1] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  277. text[0] = static_cast<unsigned char>(code | 0xF0);
  278. text += 4;
  279. }
  280. else // Invalid, only codes up to 0x10FFFF are allowed in Unicode
  281. {
  282. RAPIDXML_PARSE_ERROR("invalid numeric character entity", text);
  283. }
  284. }
  285. }
  286. // Skip characters until predicate evaluates to true
  287. template<class StopPred, int Flags>
  288. static void skip(Ch *&text, Ch *textEnd = NULL)
  289. {
  290. Ch *tmp = text;
  291. while ((textEnd == NULL || tmp < textEnd) && StopPred::test(*tmp))
  292. ++tmp;
  293. text = tmp;
  294. }
  295. // Skip characters until predicate evaluates to true while doing the following:
  296. // - replacing XML character entity references with proper characters (&apos; &amp; &quot; &lt; &gt; &#...;)
  297. // - condensing whitespace sequences to single space character
  298. template<class StopPred, class StopPredPure, int Flags>
  299. static Ch *skip_and_expand_character_refs(Ch *&text)
  300. {
  301. // If entity translation, whitespace condense and whitespace trimming is disabled, use plain skip
  302. if (Flags & parse_no_entity_translation &&
  303. !(Flags & parse_normalize_whitespace) &&
  304. !(Flags & parse_trim_whitespace))
  305. {
  306. skip<StopPred, Flags>(text);
  307. return text;
  308. }
  309. // Use simple skip until first modification is detected
  310. skip<StopPredPure, Flags>(text);
  311. // Use translation skip
  312. Ch *src = text;
  313. Ch *dest = src;
  314. while (StopPred::test(*src))
  315. {
  316. // If entity translation is enabled
  317. if (!(Flags & parse_no_entity_translation))
  318. {
  319. // Test if replacement is needed
  320. if (src[0] == Ch('&'))
  321. {
  322. switch (src[1])
  323. {
  324. // &amp; &apos;
  325. case Ch('a'):
  326. if (src[2] == Ch('m') && src[3] == Ch('p') && src[4] == Ch(';'))
  327. {
  328. *dest = Ch('&');
  329. ++dest;
  330. src += 5;
  331. continue;
  332. }
  333. if (src[2] == Ch('p') && src[3] == Ch('o') && src[4] == Ch('s') && src[5] == Ch(';'))
  334. {
  335. *dest = Ch('\'');
  336. ++dest;
  337. src += 6;
  338. continue;
  339. }
  340. break;
  341. // &quot;
  342. case Ch('q'):
  343. if (src[2] == Ch('u') && src[3] == Ch('o') && src[4] == Ch('t') && src[5] == Ch(';'))
  344. {
  345. *dest = Ch('"');
  346. ++dest;
  347. src += 6;
  348. continue;
  349. }
  350. break;
  351. // &gt;
  352. case Ch('g'):
  353. if (src[2] == Ch('t') && src[3] == Ch(';'))
  354. {
  355. *dest = Ch('>');
  356. ++dest;
  357. src += 4;
  358. continue;
  359. }
  360. break;
  361. // &lt;
  362. case Ch('l'):
  363. if (src[2] == Ch('t') && src[3] == Ch(';'))
  364. {
  365. *dest = Ch('<');
  366. ++dest;
  367. src += 4;
  368. continue;
  369. }
  370. break;
  371. // &#...; - assumes ASCII
  372. case Ch('#'):
  373. if (src[2] == Ch('x'))
  374. {
  375. unsigned long code = 0;
  376. src += 3; // Skip &#x
  377. while (1)
  378. {
  379. unsigned char digit = internal::lookup_tables<0>::lookup_digits[static_cast<unsigned char>(*src)];
  380. if (digit == 0xFF)
  381. break;
  382. code = code * 16 + digit;
  383. ++src;
  384. }
  385. insert_coded_character<Flags>(dest, code); // Put character in output
  386. }
  387. else
  388. {
  389. unsigned long code = 0;
  390. src += 2; // Skip &#
  391. while (1)
  392. {
  393. unsigned char digit = internal::lookup_tables<0>::lookup_digits[static_cast<unsigned char>(*src)];
  394. if (digit == 0xFF)
  395. break;
  396. code = code * 10 + digit;
  397. ++src;
  398. }
  399. insert_coded_character<Flags>(dest, code); // Put character in output
  400. }
  401. if (*src == Ch(';'))
  402. ++src;
  403. else
  404. RAPIDXML_PARSE_ERROR("expected ;", src);
  405. continue;
  406. // Something else
  407. default:
  408. // Ignore, just copy '&' verbatim
  409. break;
  410. }
  411. }
  412. }
  413. // If whitespace condensing is enabled
  414. if (Flags & parse_normalize_whitespace)
  415. {
  416. // Test if condensing is needed
  417. if (whitespace_pred::test(*src))
  418. {
  419. *dest = Ch(' '); ++dest; // Put single space in dest
  420. ++src; // Skip first whitespace char
  421. // Skip remaining whitespace chars
  422. while (whitespace_pred::test(*src))
  423. ++src;
  424. continue;
  425. }
  426. }
  427. // No replacement, only copy character
  428. *dest++ = *src++;
  429. }
  430. // Return new end
  431. text = src;
  432. return dest;
  433. }
  434. ///////////////////////////////////////////////////////////////////////
  435. // Internal parsing functions
  // Skips a UTF-8 byte order mark (EF BB BF) if the text starts with one.
  // Each byte is read only after the previous one matched.
  template<int Flags>
  void parse_bom(char *&text)
  {
      if (static_cast<unsigned char>(text[0]) != 0xEF)
          return;
      if (static_cast<unsigned char>(text[1]) != 0xBB)
          return;
      if (static_cast<unsigned char>(text[2]) != 0xBF)
          return;
      text += 3;
  }
  // Skips a leading U+FEFF byte order mark in wide-character text, if present.
  template<int Flags>
  void parse_bom(wchar_t *&text)
  {
      const wchar_t byte_order_mark = 0xFEFF;
      if (*text == byte_order_mark)
          ++text;
  }
  457. // Parse XML declaration (<?xml...)
  458. template<int Flags>
  459. void parse_xml_declaration(Ch *&text)
  460. {
  461. // If parsing of declaration is disabled
  462. if (!(Flags & parse_declaration_node))
  463. {
  464. // Skip until end of declaration
  465. while (text[0] != Ch('?') || text[1] != Ch('>'))
  466. {
  467. if (!text[0])
  468. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  469. ++text;
  470. }
  471. text += 2; // Skip '?>'
  472. return; // return 0;
  473. }
  474. // Create declaration
  475. // xml_node<Ch> *declaration = this->allocate_node(node_declaration);
  476. // Skip whitespace before attributes or ?>
  477. skip<whitespace_pred, Flags>(text, endptr_);
  478. // Parse declaration attributes
  479. parse_node_attributes<Flags>(text/*, declaration*/);
  480. // Skip ?>
  481. if (text[0] != Ch('?') || text[1] != Ch('>'))
  482. RAPIDXML_PARSE_ERROR("expected ?>", text);
  483. text += 2;
  484. // return declaration;
  485. }
  486. // Parse XML comment (<!--...)
  487. template<int Flags>
  488. void parse_comment(Ch *&text)
  489. {
  490. // If parsing of comments is disabled
  491. if (!(Flags & parse_comment_nodes))
  492. {
  493. // Skip until end of comment
  494. while (text[0] != Ch('-') || text[1] != Ch('-') || text[2] != Ch('>'))
  495. {
  496. if (!text[0])
  497. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  498. ++text;
  499. }
  500. text += 3; // Skip '-->'
  501. return;// return 0; // Do not produce comment node
  502. }
  503. // Skip until end of comment
  504. while (text[0] != Ch('-') || text[1] != Ch('-') || text[2] != Ch('>'))
  505. {
  506. if (!text[0])
  507. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  508. ++text;
  509. }
  510. // Create comment node
  511. // xml_node<Ch> *comment = this->allocate_node(node_comment);
  512. // comment->value(value, text - value); // TODO: DNT implement comment
  513. // Place zero terminator after comment value
  514. if (!(Flags & parse_no_string_terminators))
  515. *text = Ch('\0');
  516. text += 3; // Skip '-->'
  517. return;
  518. }
  519. // Parse DOCTYPE
  520. template<int Flags>
  521. void parse_doctype(Ch *&text)
  522. {
  523. // Skip to >
  524. while (*text != Ch('>'))
  525. {
  526. // Determine character type
  527. switch (*text)
  528. {
  529. // If '[' encountered, scan for matching ending ']' using naive algorithm with depth
  530. // This works for all W3C test files except for 2 most wicked
  531. case Ch('['):
  532. {
  533. ++text; // Skip '['
  534. int depth = 1;
  535. while (depth > 0)
  536. {
  537. switch (*text)
  538. {
  539. case Ch('['): ++depth; break;
  540. case Ch(']'): --depth; break;
  541. case 0: RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  542. default: break;
  543. }
  544. ++text;
  545. }
  546. break;
  547. }
  548. // Error on end of text
  549. case Ch('\0'):
  550. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  551. // Other character, skip it
  552. default:
  553. ++text;
  554. }
  555. }
  556. // If DOCTYPE nodes enabled
  557. if (Flags & parse_doctype_node)
  558. {
  559. #if 0
  560. // Create a new doctype node
  561. xml_node<Ch> *doctype = this->allocate_node(node_doctype);
  562. doctype->value(value, text - value);
  563. #endif
  564. // Place zero terminator after value
  565. if (!(Flags & parse_no_string_terminators))
  566. *text = Ch('\0');
  567. text += 1; // skip '>'
  568. return;// return doctype;
  569. }
  570. else
  571. {
  572. text += 1; // skip '>'
  573. return;// return 0;
  574. }
  575. }
  576. // Parse PI
  577. template<int Flags>
  578. void parse_pi(Ch *&text)
  579. {
  580. // If creation of PI nodes is enabled
  581. if (Flags & parse_pi_nodes)
  582. {
  583. // Create pi node
  584. // xml_node<Ch> *pi = this->allocate_node(node_pi);
  585. // Extract PI target name
  586. Ch *name = text;
  587. skip<node_name_pred, Flags>(text, endptr_);
  588. if (text == name)
  589. RAPIDXML_PARSE_ERROR("expected PI target", text);
  590. // pi->name(name, text - name); // TODO: DNT notify for pi
  591. // Skip whitespace between pi target and pi
  592. skip<whitespace_pred, Flags>(text, endptr_);
  593. // Skip to '?>'
  594. while (text[0] != Ch('?') || text[1] != Ch('>'))
  595. {
  596. if (*text == Ch('\0'))
  597. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  598. ++text;
  599. }
  600. #if 0 // TODO: DNT notify for pi
  601. // Set pi value (verbatim, no entity expansion or whitespace normalization)
  602. pi->value(value, text - value);
  603. // Place zero terminator after name and value
  604. if (!(Flags & parse_no_string_terminators))
  605. {
  606. pi->name()[pi->name_size()] = Ch('\0');
  607. pi->value()[pi->value_size()] = Ch('\0');
  608. }
  609. #endif
  610. text += 2; // Skip '?>'
  611. return; // return pi;
  612. }
  613. else
  614. {
  615. // Skip to '?>'
  616. while (text[0] != Ch('?') || text[1] != Ch('>'))
  617. {
  618. if (*text == Ch('\0'))
  619. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  620. ++text;
  621. }
  622. text += 2; // Skip '?>'
  623. return; // return 0;
  624. }
  625. }
  626. // Parse and append data
  627. // Return character that ends data.
  628. // This is necessary because this character might have been overwritten by a terminating 0
  629. template<int Flags>
  630. Ch parse_and_append_data(/*const tok_string& elementName unused for SAX,*/ Ch *&text, Ch *contents_start)
  631. {
  632. // Backup to contents start if whitespace trimming is disabled
  633. if (!(Flags & parse_trim_whitespace))
  634. text = contents_start;
  635. // Skip until end of data
  636. Ch *value = text, *end;
  637. if (Flags & parse_normalize_whitespace)
  638. end = skip_and_expand_character_refs<text_pred, text_pure_with_ws_pred, Flags>(text);
  639. else
  640. end = skip_and_expand_character_refs<text_pred, text_pure_no_ws_pred, Flags>(text);
  641. // Trim trailing whitespace if flag is set; leading was already trimmed by whitespace skip after >
  642. if (Flags & parse_trim_whitespace)
  643. {
  644. if (Flags & parse_normalize_whitespace)
  645. {
  646. // Whitespace is already condensed to single space characters by skipping function, so just trim 1 char off the end
  647. if (*(end - 1) == Ch(' '))
  648. --end;
  649. }
  650. else
  651. {
  652. // Backup until non-whitespace character is found
  653. while (whitespace_pred::test(*(end - 1)))
  654. --end;
  655. }
  656. }
  657. #if 0 // disable data node
  658. // If characters are still left between end and value (this test is only necessary if normalization is enabled)
  659. // Create new data node
  660. if (!(Flags & parse_no_data_nodes))
  661. {
  662. xml_node<Ch> *data = this->allocate_node(node_data);
  663. data->value(value, end - value);
  664. node->append_node(data);
  665. }
  666. #endif
  667. // Add data to parent node if no data exists yet
  668. #if 0
  669. if (!(Flags & parse_no_element_values))
  670. if (*node->value() == Ch('\0'))
  671. ;// node->value(value, end - value);
  672. #endif
  673. Ch ch = *text;
  674. // Place zero terminator after value
  675. if (!(Flags & parse_no_string_terminators))
  676. {
  677. //Ch ch = *text;
  678. *end = Ch('\0');
  679. //return ch; // Return character that ends data; this is required because zero terminator overwritten it
  680. }
  681. handler_->xmlSAX3Text(value, end - value);
  682. // Return character that ends data
  683. return ch;
  684. }
  685. // Parse CDATA
  686. template<int Flags>
  687. void parse_cdata(Ch *&text)
  688. {
  689. // If CDATA is disabled
  690. if (Flags & parse_no_data_nodes)
  691. {
  692. // Skip until end of cdata
  693. while (text[0] != Ch(']') || text[1] != Ch(']') || text[2] != Ch('>'))
  694. {
  695. if (!text[0])
  696. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  697. ++text;
  698. }
  699. text += 3; // Skip ]]>
  700. return; // return 0; // Do not produce CDATA node
  701. }
  702. // Skip until end of cdata
  703. while (text[0] != Ch(']') || text[1] != Ch(']') || text[2] != Ch('>'))
  704. {
  705. if (!text[0])
  706. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  707. ++text;
  708. }
  709. #if 0 // TODO: disable CDATA
  710. // Create new cdata node
  711. xml_node<Ch> *cdata = this->allocate_node(node_cdata);
  712. cdata->value(value, text - value);
  713. #endif
  714. // Place zero terminator after value
  715. if (!(Flags & parse_no_string_terminators))
  716. *text = Ch('\0');
  717. text += 3; // Skip ]]>
  718. return;// return cdata;
  719. }
  720. // Parse element node
  721. template<int Flags>
  722. void parse_element(Ch *&text)
  723. {
  724. // Create element node
  725. // xml_node<Ch> *element = this->allocate_node(node_element);
  726. // Extract element name
  727. tok_string elementName(text, 0);
  728. skip<node_name_pred, Flags>(text, endptr_);
  729. elementName.second = text - elementName.first;
  730. if (0 == elementName.second)
  731. RAPIDXML_PARSE_ERROR("expected element name", text);
  732. handler_->xmlSAX3StartElement(elementName.first, elementName.second);
  733. // Skip whitespace between element name and attributes or >
  734. skip<whitespace_pred, Flags>(text, endptr_);
  735. // Parse attributes, if any
  736. parse_node_attributes<Flags>(text);
  737. handler_->xmlSAX3EndAttr();
  738. // Determine ending type
  739. if (*text == Ch('>'))
  740. {
  741. ++text;
  742. parse_node_contents<Flags>(text, elementName);
  743. }
  744. else if (*text == Ch('/'))
  745. {
  746. ++text;
  747. if (*text != Ch('>'))
  748. RAPIDXML_PARSE_ERROR("expected >", text);
  749. ++text;
  750. }
  751. else
  752. RAPIDXML_PARSE_ERROR("expected >", text);
  753. // Place zero terminator after name
  754. if (!(Flags & parse_no_string_terminators)) {
  755. elementName.first[elementName.second] = (Ch)'\0';
  756. }
  757. // Return parsed element
  758. handler_->xmlSAX3EndElement(elementName.first, elementName.second);
  759. // return element;
  760. }
  761. // Determine node type, and parse it
  762. template<int Flags>
  763. void parse_node(Ch *&text)
  764. {
  765. // Parse proper node type
  766. switch (text[0])
  767. {
  768. // <...
  769. default:
  770. // Parse and append element node
  771. return parse_element<Flags>(text);
  772. // <?...
  773. case Ch('?'):
  774. ++text; // Skip ?
  775. if ((text[0] == Ch('x') || text[0] == Ch('X')) &&
  776. (text[1] == Ch('m') || text[1] == Ch('M')) &&
  777. (text[2] == Ch('l') || text[2] == Ch('L')) &&
  778. whitespace_pred::test(text[3]))
  779. {
  780. // '<?xml ' - xml declaration
  781. text += 4; // Skip 'xml '
  782. return parse_xml_declaration<Flags>(text);
  783. }
  784. else
  785. {
  786. // Parse PI
  787. return parse_pi<Flags>(text);
  788. }
  789. // <!...
  790. case Ch('!'):
  791. // Parse proper subset of <! node
  792. switch (text[1])
  793. {
  794. // <!-
  795. case Ch('-'):
  796. if (text[2] == Ch('-'))
  797. {
  798. // '<!--' - xml comment
  799. text += 3; // Skip '!--'
  800. return parse_comment<Flags>(text);
  801. }
  802. break;
  803. // <![
  804. case Ch('['):
  805. if (text[2] == Ch('C') && text[3] == Ch('D') && text[4] == Ch('A') &&
  806. text[5] == Ch('T') && text[6] == Ch('A') && text[7] == Ch('['))
  807. {
  808. // '<![CDATA[' - cdata
  809. text += 8; // Skip '![CDATA['
  810. return parse_cdata<Flags>(text);
  811. }
  812. break;
  813. // <!D
  814. case Ch('D'):
  815. if (text[2] == Ch('O') && text[3] == Ch('C') && text[4] == Ch('T') &&
  816. text[5] == Ch('Y') && text[6] == Ch('P') && text[7] == Ch('E') &&
  817. whitespace_pred::test(text[8]))
  818. {
  819. // '<!DOCTYPE ' - doctype
  820. text += 9; // skip '!DOCTYPE '
  821. return parse_doctype<Flags>(text);
  822. }
  823. break;
  824. default: break;
  825. } // switch
  826. // Attempt to skip other, unrecognized node types starting with <!
  827. ++text; // Skip !
  828. while (*text != Ch('>'))
  829. {
  830. if (*text == 0)
  831. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  832. ++text;
  833. }
  834. ++text; // Skip '>'
  835. return; // return 0; // No node recognized
  836. }
  837. }
  838. // Parse contents of the node - children, data etc.
  839. template<int Flags>
  840. void parse_node_contents(Ch *&text, const tok_string& elementName/*element name*/)
  841. {
  842. // For all children and text
  843. while (1)
  844. {
  845. // Skip whitespace between > and node contents
  846. Ch *contents_start = text; // Store start of node contents before whitespace is skipped
  847. skip<whitespace_pred, Flags>(text, endptr_);
  848. Ch next_char = *text;
  849. // After data nodes, instead of continuing the loop, control jumps here.
  850. // This is because zero termination inside parse_and_append_data() function
  851. // would wreak havoc with the above code.
  852. // Also, skipping whitespace after data nodes is unnecessary.
  853. after_data_node:
  854. // Determine what comes next: node closing, child node, data node, or 0?
  855. switch (next_char)
  856. {
  857. // Node closing or child node
  858. case Ch('<'):
  859. if (text[1] == Ch('/'))
  860. {
  861. // Node closing
  862. text += 2; // Skip '</'
  863. if (Flags & parse_validate_closing_tags)
  864. {
  865. // Skip and validate closing tag name
  866. Ch *closing_name = text;
  867. skip<node_name_pred, Flags>(text, endptr_);
  868. if (!internal::compare(elementName.first, elementName.second, closing_name, text - closing_name, true))
  869. RAPIDXML_PARSE_ERROR("invalid closing tag name", text);
  870. }
  871. else
  872. {
  873. // No validation, just skip name
  874. skip<node_name_pred, Flags>(text, endptr_);
  875. }
  876. // Skip remaining whitespace after node name
  877. skip<whitespace_pred, Flags>(text, endptr_);
  878. if (*text != Ch('>'))
  879. RAPIDXML_PARSE_ERROR("expected >", text);
  880. ++text; // Skip '>'
  881. return; // Node closed, finished parsing contents
  882. }
  883. else
  884. {
  885. // Child node
  886. ++text; // Skip '<'
  887. parse_node<Flags>(text);
  888. /*if (xml_node<Ch> *child = parse_node<Flags>(text))
  889. node->append_node(child);*/
  890. }
  891. break;
  892. // End of data - error
  893. case Ch('\0'):
  894. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  895. // Data node
  896. default:
  897. next_char = parse_and_append_data<Flags>(/*elementName, */text, contents_start);
  898. goto after_data_node; // Bypass regular processing after data nodes
  899. }
  900. }
  901. }
  902. // Parse XML attributes of the node
  903. template<int Flags>
  904. void parse_node_attributes(Ch *&text)
  905. {
  906. // For all attributes
  907. while (attribute_name_pred::test(*text))
  908. {
  909. // Extract attribute name
  910. Ch *name = text;
  911. ++text; // Skip first character of attribute name
  912. skip<attribute_name_pred, Flags>(text, endptr_);
  913. if (text == name)
  914. RAPIDXML_PARSE_ERROR("expected attribute name", name);
  915. // Create new attribute
  916. // xml_attribute<Ch> *attribute = this->allocate_attribute();
  917. // attribute->name(name, text - name);
  918. auto namesize = text - name;
  919. // node->append_attribute(attribute);
  920. // Skip whitespace after attribute name
  921. skip<whitespace_pred, Flags>(text, endptr_);
  922. // Skip =
  923. if (*text != Ch('='))
  924. RAPIDXML_PARSE_ERROR("expected =", text);
  925. ++text;
  926. // Add terminating zero after name
  927. if (!(Flags & parse_no_string_terminators))
  928. name[namesize] = 0;
  929. // Skip whitespace after =
  930. skip<whitespace_pred, Flags>(text, endptr_);
  931. // Skip quote and remember if it was ' or "
  932. Ch quote = *text;
  933. if (quote != Ch('\'') && quote != Ch('"'))
  934. RAPIDXML_PARSE_ERROR("expected ' or \"", text);
  935. ++text;
  936. // Extract attribute value and expand char refs in it
  937. Ch *value = text, *end;
  938. const int AttFlags = Flags & ~parse_normalize_whitespace; // No whitespace normalization in attributes
  939. if (quote == Ch('\''))
  940. end = skip_and_expand_character_refs<attribute_value_pred<Ch('\'')>, attribute_value_pure_pred<Ch('\'')>, AttFlags>(text);
  941. else
  942. end = skip_and_expand_character_refs<attribute_value_pred<Ch('"')>, attribute_value_pure_pred<Ch('"')>, AttFlags>(text);
  943. // Set attribute value
  944. // attribute->value(value, end - value);
  945. auto valuesize = end - value;
  946. // Make sure that end quote is present
  947. if (*text != quote)
  948. RAPIDXML_PARSE_ERROR("expected ' or \"", text);
  949. ++text; // Skip quote
  950. // Add terminating zero after value
  951. if (!(Flags & parse_no_string_terminators))
  952. value[valuesize] = 0;
  953. handler_->xmlSAX3Attr(name, namesize, value, valuesize);
  954. // Skip whitespace after attribute value
  955. skip<whitespace_pred, Flags>(text, endptr_);
  956. }
  957. }
  958. };
  959. }
  960. // Undefine internal macros
  961. #undef RAPIDXML_PARSE_ERROR
  962. // On MSVC, restore warnings state
  963. #ifdef _MSC_VER
  964. #pragma warning(pop)
  965. #endif
  966. #endif