Simple XML subset parser, comparable to GLib's Markup parser but without any dependencies, in one self-contained file. Forked from https://github.com/ooxi/xml.c
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

738 lines
14 KiB

  1. /**
  2. * Copyright (c) 2012 ooxi/xml.c
  3. * https://github.com/ooxi/xml.c
  4. *
  5. * This software is provided 'as-is', without any express or implied warranty.
  6. * In no event will the authors be held liable for any damages arising from the
  7. * use of this software.
  8. *
  9. * Permission is granted to anyone to use this software for any purpose,
  10. * including commercial applications, and to alter it and redistribute it
  11. * freely, subject to the following restrictions:
  12. *
  13. * 1. The origin of this software must not be misrepresented; you must not
  14. * claim that you wrote the original software. If you use this software in a
  15. * product, an acknowledgment in the product documentation would be
  16. * appreciated but is not required.
  17. *
  18. * 2. Altered source versions must be plainly marked as such, and must not be
  19. * misrepresented as being the original software.
  20. *
  21. * 3. This notice may not be removed or altered from any source distribution.
  22. */
  23. #include <ctype.h>
  24. #include <malloc.h>
  25. #include <stdbool.h>
  26. #include <stdio.h>
  27. #include <stdlib.h>
  28. #include "xml.h"
  /**
   * [OPAQUE API]
   *
   * A run of UTF-8 bytes described by a start pointer and a byte count. The
   * struct itself carries no terminator; `length` alone marks the end.
   */
  struct xml_string {
      uint8_t *buffer;    /* first byte of the text */
      size_t length;      /* number of bytes starting at `buffer` */
  };
  /**
   * [OPAQUE API]
   *
   * One element of the parsed tree. Every node has a tag name and a
   * 0-terminated array of child pointers; text content is optional.
   */
  struct xml_node {
      struct xml_string *name;       /* tag name */
      struct xml_string *content;    /* text content, or 0 when there is none */
      struct xml_node **children;    /* child pointers, terminated by a 0 entry */
  };
  49. /**
  50. * [OPAQUE API]
  51. *
  52. * An xml_document simply contains the root node and the underlying buffer
  53. */
  54. struct xml_document {
  55. struct xml_string buffer;
  56. struct xml_node* root;
  57. };
  /**
   * [PRIVATE]
   *
   * Parser context: the input bytes, how many there are, and the index of the
   * byte the parser is currently looking at.
   */
  struct xml_parser {
      uint8_t *buffer;    /* input being parsed */
      size_t position;    /* index of the current byte within `buffer` */
      size_t length;      /* total number of bytes in `buffer` */
  };
  /**
   * [PRIVATE]
   *
   * Character offsets, relative to the parser's current position, used when
   * peeking at the input and when reporting errors.
   */
  enum xml_parser_offset {
      NO_CHARACTER      = -1,   /* no particular character is referred to */
      CURRENT_CHARACTER =  0,   /* the character at the current position */
      NEXT_CHARACTER    =  1    /* the character after the current one */
  };
  /**
   * [PRIVATE]
   *
   * Counts the entries that precede the terminating 0 pointer.
   *
   * @param nodes Array of node pointers ending with a 0 entry
   * @return Number of non-0 entries before the terminator
   */
  static size_t get_zero_terminated_array_elements(struct xml_node** nodes) {
      struct xml_node** cursor = nodes;

      /* Walk forward until the terminator; the distance travelled is the count */
      while (*cursor) {
          ++cursor;
      }
      return (size_t)(cursor - nodes);
  }
  90. /**
  91. * [PRIVATE]
  92. *
  93. * @warning No UTF conversions will be attempted
  94. *
  95. * @return true gdw. a == b
  96. */
  97. static _Bool xml_string_equals(struct xml_string* a, struct xml_string* b) {
  98. if (a->length != b->length) {
  99. return false;
  100. }
  101. size_t i = 0; for (; i < a->length; ++i) {
  102. if (a->buffer[i] != b->buffer[i]) {
  103. return false;
  104. }
  105. }
  106. return true;
  107. }
  /**
   * [PRIVATE]
   *
   * Releases the string descriptor itself.
   *
   * @warning `string->buffer` is deliberately left alone: it refers to the
   *     document's buffer rather than to memory held by the string
   */
  static void xml_string_free(struct xml_string* string) {
      /* Only the descriptor is released here, never the bytes it points at */
      free(string);
  }
  119. /**
  120. * [PRIVATE]
  121. *
  122. * Frees the resources allocated by the node
  123. */
  124. static void xml_node_free(struct xml_node* node) {
  125. xml_string_free(node->name);
  126. if (node->content) {
  127. xml_string_free(node->content);
  128. }
  129. struct xml_node** it = node->children;
  130. while (*it) {
  131. xml_node_free(*it);
  132. ++it;
  133. }
  134. free(node->children);
  135. free(node);
  136. }
  /**
   * [PRIVATE]
   *
   * Echoes the parser's call stack for debugging purposes.
   *
   * With DEBUG defined this is a function that prints `message` to stdout.
   * Otherwise it is a macro that expands to a no-op expression and evaluates
   * none of its arguments.
   */
  #ifdef DEBUG
  static void xml_parser_info(struct xml_parser* parser, char const* message) {
      (void)parser;    /* the parser state is currently not part of the output */
      fprintf(stdout, "xml_parser_info %s\n", message);
  }
  #else
  /* An expression rather than `{}`, so that `xml_parser_info(..);` stays one
   * statement even as the body of an unbraced if/else */
  #define xml_parser_info(parser, message) ((void)0)
  #endif
  149. /**
  150. * [PRIVATE]
  151. *
  152. * Echos an error regarding the parser's source to the console
  153. */
  154. static void xml_parser_error(struct xml_parser* parser, enum xml_parser_offset offset, char const* message) {
  155. int row = 0;
  156. int column = 0;
  157. #define min(X,Y) ((X) < (Y) ? (X) : (Y))
  158. #define max(X,Y) ((X) > (Y) ? (X) : (Y))
  159. size_t character = max(0, min(parser->length, parser->position + offset));
  160. #undef min
  161. #undef max
  162. size_t position = 0; for (; position < character; ++position) {
  163. column++;
  164. if ('\n' == parser->buffer[position]) {
  165. row++;
  166. column = 0;
  167. }
  168. }
  169. if (NO_CHARACTER != offset) {
  170. fprintf(stderr, "xml_parser_error at %i:%i (is %c): %s\n",
  171. row + 1, column, parser->buffer[character], message
  172. );
  173. } else {
  174. fprintf(stderr, "xml_parser_error at %i:%i: %s\n",
  175. row + 1, column, message
  176. );
  177. }
  178. }
  179. /**
  180. * [PRIVATE]
  181. *
  182. * Returns the n-th not-whitespace byte in parser and 0 if such a byte does not
  183. * exist
  184. */
  185. static uint8_t xml_parser_peek(struct xml_parser* parser, size_t n) {
  186. size_t position = parser->position;
  187. while (position < parser->length) {
  188. if (!isspace(parser->buffer[position])) {
  189. if (n == 0) {
  190. return parser->buffer[position];
  191. } else {
  192. --n;
  193. }
  194. }
  195. position++;
  196. }
  197. return 0;
  198. }
  199. /**
  200. * [PRIVATE]
  201. *
  202. * Moves the parser's position n bytes. If the new position would be out of
  203. * bounds, it will be converted to the bounds itself
  204. */
  205. static void xml_parser_consume(struct xml_parser* parser, size_t n) {
  206. /* Debug information
  207. */
  208. #ifdef DEBUG
  209. #define min(X,Y) ((X) < (Y) ? (X) : (Y))
  210. char* consumed = alloca((n + 1) * sizeof(char));
  211. memcpy(consumed, &parser->buffer[parser->position], min(n, parser->length - parser->position));
  212. consumed[n] = 0;
  213. #undef min
  214. size_t message_buffer_length = 512;
  215. char* message_buffer = alloca(512 * sizeof(char));
  216. snprintf(message_buffer, message_buffer_length, "Consuming %li bytes \"%s\"", (long)n, consumed);
  217. message_buffer[message_buffer_length - 1] = 0;
  218. xml_parser_info(parser, message_buffer);
  219. #endif
  220. /* Move the position forward
  221. */
  222. parser->position += n;
  223. /* Don't go too far
  224. *
  225. * @warning Valid because parser->length must be greater than 0
  226. */
  227. if (parser->position >= parser->length) {
  228. parser->position = parser->length - 1;
  229. }
  230. }
  231. /**
  232. * [PRIVATE]
  233. *
  234. * Skips to the next non-whitespace character
  235. */
  236. static void xml_skip_whitespace(struct xml_parser* parser) {
  237. xml_parser_info(parser, "whitespace");
  238. while (isspace(parser->buffer[parser->position])) {
  239. if (parser->position + 1 >= parser->length) {
  240. return;
  241. } else {
  242. parser->position++;
  243. }
  244. }
  245. }
  246. /**
  247. * [PRIVATE]
  248. *
  249. * Parses the name out of the an XML tag's ending
  250. *
  251. * ---( Example )---
  252. * tag_name>
  253. * ---
  254. */
  255. static struct xml_string* xml_parse_tag_end(struct xml_parser* parser) {
  256. xml_parser_info(parser, "tag_end");
  257. size_t start = parser->position;
  258. size_t length = 0;
  259. /* Parse until `>' or a whitespace is reached
  260. */
  261. while (start + length < parser->length) {
  262. uint8_t current = xml_parser_peek(parser, CURRENT_CHARACTER);
  263. if (('>' == current) || isspace(current)) {
  264. break;
  265. } else {
  266. xml_parser_consume(parser, 1);
  267. length++;
  268. }
  269. }
  270. /* Consume `>'
  271. */
  272. if ('>' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  273. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_end::expected tag end");
  274. return 0;
  275. }
  276. xml_parser_consume(parser, 1);
  277. /* Return parsed tag name
  278. */
  279. struct xml_string* name = malloc(sizeof(struct xml_string));
  280. name->buffer = &parser->buffer[start];
  281. name->length = length;
  282. return name;
  283. }
  284. /**
  285. * [PRIVATE]
  286. *
  287. * Parses an opening XML tag without attributes
  288. *
  289. * ---( Example )---
  290. * <tag_name>
  291. * ---
  292. */
  293. static struct xml_string* xml_parse_tag_open(struct xml_parser* parser) {
  294. xml_parser_info(parser, "tag_open");
  295. xml_skip_whitespace(parser);
  296. /* Consume `<'
  297. */
  298. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  299. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_open::expected opening tag");
  300. return 0;
  301. }
  302. xml_parser_consume(parser, 1);
  303. /* Consume tag name
  304. */
  305. return xml_parse_tag_end(parser);
  306. }
  307. /**
  308. * [PRIVATE]
  309. *
  310. * Parses an closing XML tag without attributes
  311. *
  312. * ---( Example )---
  313. * </tag_name>
  314. * ---
  315. */
  316. static struct xml_string* xml_parse_tag_close(struct xml_parser* parser) {
  317. xml_parser_info(parser, "tag_close");
  318. xml_skip_whitespace(parser);
  319. /* Consume `</'
  320. */
  321. if ( ('<' != xml_parser_peek(parser, CURRENT_CHARACTER))
  322. || ('/' != xml_parser_peek(parser, NEXT_CHARACTER))) {
  323. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  324. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_close::expected closing tag `<'");
  325. }
  326. if ('/' != xml_parser_peek(parser, NEXT_CHARACTER)) {
  327. xml_parser_error(parser, NEXT_CHARACTER, "xml_parse_tag_close::expected closing tag `/'");
  328. }
  329. return 0;
  330. }
  331. xml_parser_consume(parser, 2);
  332. /* Consume tag name
  333. */
  334. return xml_parse_tag_end(parser);
  335. }
  336. /**
  337. * [PRIVATE]
  338. *
  339. * Parses a tag's content
  340. *
  341. * ---( Example )---
  342. * this is
  343. * a
  344. * tag {} content
  345. * ---
  346. *
  347. * @warning CDATA etc. is _not_ and will never be supported
  348. */
  349. static struct xml_string* xml_parse_content(struct xml_parser* parser) {
  350. xml_parser_info(parser, "content");
  351. /* Whitespace will be ignored
  352. */
  353. xml_skip_whitespace(parser);
  354. size_t start = parser->position;
  355. size_t length = 0;
  356. /* Consume until `<' is reached
  357. */
  358. while (start + length < parser->length) {
  359. uint8_t current = xml_parser_peek(parser, CURRENT_CHARACTER);
  360. if ('<' == current) {
  361. break;
  362. } else {
  363. xml_parser_consume(parser, 1);
  364. length++;
  365. }
  366. }
  367. /* Next character must be an `<' or we have reached end of file
  368. */
  369. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  370. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_content::expected <");
  371. return 0;
  372. }
  373. /* Ignore tailing whitespace
  374. */
  375. while ((length > 0) && isspace(parser->buffer[start + length - 1])) {
  376. length--;
  377. }
  378. /* Return text
  379. */
  380. struct xml_string* content = malloc(sizeof(struct xml_string));
  381. content->buffer = &parser->buffer[start];
  382. content->length = length;
  383. return content;
  384. }
  385. /**
  386. * [PRIVATE]
  387. *
  388. * Parses an XML fragment node
  389. *
  390. * ---( Example without children )---
  391. * <Node>Text</Node>
  392. * ---
  393. *
  394. * ---( Example with children )---
  395. * <Parent>
  396. * <Child>Text</Child>
  397. * <Child>Text</Child>
  398. * <Test>Content</Test>
  399. * </Parent>
  400. * ---
  401. */
  402. static struct xml_node* xml_parse_node(struct xml_parser* parser) {
  403. xml_parser_info(parser, "node");
  404. /* Setup variables
  405. */
  406. struct xml_string* tag_open = 0;
  407. struct xml_string* tag_close = 0;
  408. struct xml_string* content = 0;
  409. struct xml_node** children = calloc(1, sizeof(struct xml_node*));
  410. children[0] = 0;
  411. /* Parse open tag
  412. */
  413. tag_open = xml_parse_tag_open(parser);
  414. if (!tag_open) {
  415. xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag_open");
  416. goto exit_failure;
  417. }
  418. /* If the content does not start with '<', a text content is assumed
  419. */
  420. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  421. content = xml_parse_content(parser);
  422. if (!content) {
  423. xml_parser_error(parser, 0, "xml_parse_node::content");
  424. goto exit_failure;
  425. }
  426. /* Otherwise children are to be expected
  427. */
  428. } else while ('/' != xml_parser_peek(parser, NEXT_CHARACTER)) {
  429. /* Parse child node
  430. */
  431. struct xml_node* child = xml_parse_node(parser);
  432. if (!child) {
  433. xml_parser_error(parser, NEXT_CHARACTER, "xml_parse_node::child");
  434. goto exit_failure;
  435. }
  436. /* Grow child array :)
  437. */
  438. size_t old_elements = get_zero_terminated_array_elements(children);
  439. size_t new_elements = old_elements + 1;
  440. children = realloc(children, (new_elements + 1) * sizeof(struct xml_node*));
  441. /* Save child
  442. */
  443. children[new_elements - 1] = child;
  444. children[new_elements] = 0;
  445. }
  446. /* Parse close tag
  447. */
  448. tag_close = xml_parse_tag_close(parser);
  449. if (!tag_close) {
  450. xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag_close");
  451. goto exit_failure;
  452. }
  453. /* Close tag has to match open tag
  454. */
  455. if (!xml_string_equals(tag_open, tag_close)) {
  456. xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag missmatch");
  457. goto exit_failure;
  458. }
  459. /* Return parsed node
  460. */
  461. xml_string_free(tag_close);
  462. struct xml_node* node = malloc(sizeof(struct xml_node));
  463. node->name = tag_open;
  464. node->content = content;
  465. node->children = children;
  466. return node;
  467. /* A failure occured, so free all allocalted resources
  468. */
  469. exit_failure:
  470. if (tag_open) {
  471. xml_string_free(tag_open);
  472. }
  473. if (tag_close) {
  474. xml_string_free(tag_close);
  475. }
  476. if (content) {
  477. xml_string_free(content);
  478. }
  479. struct xml_node** it = children;
  480. while (*it) {
  481. xml_node_free(*it);
  482. ++it;
  483. }
  484. free(children);
  485. return 0;
  486. }
  487. /**
  488. * [PUBLIC API]
  489. *
  490. *
  491. */
  492. struct xml_document* xml_parse_document(uint8_t* buffer, size_t length) {
  493. /* Initialize parser
  494. */
  495. struct xml_parser parser = {
  496. .buffer = buffer,
  497. .position = 0,
  498. .length = length
  499. };
  500. /* An empty buffer can never contain a valid document
  501. */
  502. if (!length) {
  503. xml_parser_error(&parser, NO_CHARACTER, "xml_parse_document::length equals zero");
  504. return 0;
  505. }
  506. /* Parse the root node
  507. */
  508. struct xml_node* root = xml_parse_node(&parser);
  509. if (!root) {
  510. xml_parser_error(&parser, NO_CHARACTER, "xml_parse_document::parsing document failed");
  511. return 0;
  512. }
  513. /* Return parsed document
  514. */
  515. struct xml_document* document = malloc(sizeof(struct xml_document));
  516. document->buffer.buffer = buffer;
  517. document->buffer.length = length;
  518. document->root = root;
  519. return document;
  520. }
  521. /**
  522. * [PUBLIC API]
  523. */
  524. void xml_document_free(struct xml_document* document, _Bool free_buffer) {
  525. xml_node_free(document->root);
  526. if (free_buffer) {
  527. free(document->buffer.buffer);
  528. }
  529. free(document);
  530. }
  531. /**
  532. * [PUBLIC API]
  533. */
  534. struct xml_node* xml_document_root(struct xml_document* document) {
  535. return document->root;
  536. }
  537. /**
  538. * [PUBLIC API]
  539. */
  540. struct xml_string* xml_node_name(struct xml_node* node) {
  541. return node->name;
  542. }
  543. /**
  544. * [PUBLIC API]
  545. */
  546. struct xml_string* xml_node_content(struct xml_node* node) {
  547. return node->content;
  548. }
  549. /**
  550. * [PUBLIC API]
  551. *
  552. * @warning O(n)
  553. */
  554. size_t xml_node_children(struct xml_node* node) {
  555. return get_zero_terminated_array_elements(node->children);
  556. }
  557. /**
  558. * [PUBLIC API]
  559. */
  560. struct xml_node* xml_node_child(struct xml_node* node, size_t child) {
  561. if (child >= xml_node_children(node)) {
  562. return 0;
  563. }
  564. return node->children[child];
  565. }
  566. /**
  567. * [PUBLIC API]
  568. */
  569. size_t xml_string_length(struct xml_string* string) {
  570. return string->length;
  571. }
  572. /**
  573. * [PUBLIC API]
  574. */
  575. void xml_string_copy(struct xml_string* string, uint8_t* buffer, size_t length) {
  576. #define min(X,Y) ((X) < (Y) ? (X) : (Y))
  577. length = min(length, string->length);
  578. #undef min
  579. memcpy(buffer, string->buffer, length);
  580. }