A simple XML subset parser, comparable to GLib's Markup parser, but without any dependencies and contained in a single, self-contained file. Forked from
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

738 lines
14 KiB

  1. /**
  2. * Copyright (c) 2012 ooxi/xml.c
  3. *
  4. *
  5. * This software is provided 'as-is', without any express or implied warranty.
  6. * In no event will the authors be held liable for any damages arising from the
  7. * use of this software.
  8. *
  9. * Permission is granted to anyone to use this software for any purpose,
  10. * including commercial applications, and to alter it and redistribute it
  11. * freely, subject to the following restrictions:
  12. *
  13. * 1. The origin of this software must not be misrepresented; you must not
  14. * claim that you wrote the original software. If you use this software in a
  15. * product, an acknowledgment in the product documentation would be
  16. * appreciated but is not required.
  17. *
  18. * 2. Altered source versions must be plainly marked as such, and must not be
  19. * misrepresented as being the original software.
  20. *
  21. * 3. This notice may not be removed or altered from any source distribution.
  22. */
#include <ctype.h>
#include <malloc.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "xml.h"
  29. /**
  30. * [OPAQUE API]
  31. *
  32. * UTF-8 text
  33. */
  34. struct xml_string {
  35. uint8_t* buffer;
  36. size_t length;
  37. };
  38. /**
  39. * [OPAQUE API]
  40. *
  41. * An xml_node will always contain a tag name and a 0-terminated list of
  42. * children. Moreover it may contain text content.
  43. */
  44. struct xml_node {
  45. struct xml_string* name;
  46. struct xml_string* content;
  47. struct xml_node** children;
  48. };
  49. /**
  50. * [OPAQUE API]
  51. *
  52. * An xml_document simply contains the root node and the underlying buffer
  53. */
  54. struct xml_document {
  55. struct xml_string buffer;
  56. struct xml_node* root;
  57. };
  58. /**
  59. * [PRIVATE]
  60. *
  61. * Parser context
  62. */
  63. struct xml_parser {
  64. uint8_t* buffer;
  65. size_t position;
  66. size_t length;
  67. };
  68. /**
  69. * [PRIVATE]
  70. *
  71. * Character offsets
  72. */
  73. enum xml_parser_offset {
  74. NO_CHARACTER = -1,
  77. };
  78. /**
  79. * [PRIVATE]
  80. *
  81. * @return Number of elements in 0-terminated array
  82. */
  83. static size_t get_zero_terminated_array_elements(struct xml_node** nodes) {
  84. size_t elements = 0;
  85. while (nodes[elements]) {
  86. ++elements;
  87. }
  88. return elements;
  89. }
  90. /**
  91. * [PRIVATE]
  92. *
  93. * @warning No UTF conversions will be attempted
  94. *
  95. * @return true gdw. a == b
  96. */
  97. static _Bool xml_string_equals(struct xml_string* a, struct xml_string* b) {
  98. if (a->length != b->length) {
  99. return false;
  100. }
  101. size_t i = 0; for (; i < a->length; ++i) {
  102. if (a->buffer[i] != b->buffer[i]) {
  103. return false;
  104. }
  105. }
  106. return true;
  107. }
  108. /**
  109. * [PRIVATE]
  110. *
  111. * Frees the resources allocated by the string
  112. *
  113. * @waring `buffer` must _not_ be freed, since it is a reference to the
  114. * document's buffer
  115. */
  116. static void xml_string_free(struct xml_string* string) {
  117. free(string);
  118. }
  119. /**
  120. * [PRIVATE]
  121. *
  122. * Frees the resources allocated by the node
  123. */
  124. static void xml_node_free(struct xml_node* node) {
  125. xml_string_free(node->name);
  126. if (node->content) {
  127. xml_string_free(node->content);
  128. }
  129. struct xml_node** it = node->children;
  130. while (*it) {
  131. xml_node_free(*it);
  132. ++it;
  133. }
  134. free(node->children);
  135. free(node);
  136. }
  137. /**
  138. * [PRIVATE]
  139. *
  140. * Echos the parsers call stack for debugging purposes
  141. */
  142. #ifdef DEBUG
  143. static void xml_parser_info(struct xml_parser* parser, char const* message) {
  144. fprintf(stdout, "xml_parser_info %s\n", message);
  145. }
  146. #else
  147. #define xml_parser_info(parser, message) {}
  148. #endif
  149. /**
  150. * [PRIVATE]
  151. *
  152. * Echos an error regarding the parser's source to the console
  153. */
  154. static void xml_parser_error(struct xml_parser* parser, enum xml_parser_offset offset, char const* message) {
  155. int row = 0;
  156. int column = 0;
  157. #define min(X,Y) ((X) < (Y) ? (X) : (Y))
  158. #define max(X,Y) ((X) > (Y) ? (X) : (Y))
  159. size_t character = max(0, min(parser->length, parser->position + offset));
  160. #undef min
  161. #undef max
  162. size_t position = 0; for (; position < character; ++position) {
  163. column++;
  164. if ('\n' == parser->buffer[position]) {
  165. row++;
  166. column = 0;
  167. }
  168. }
  169. if (NO_CHARACTER != offset) {
  170. fprintf(stderr, "xml_parser_error at %i:%i (is %c): %s\n",
  171. row + 1, column, parser->buffer[character], message
  172. );
  173. } else {
  174. fprintf(stderr, "xml_parser_error at %i:%i: %s\n",
  175. row + 1, column, message
  176. );
  177. }
  178. }
  179. /**
  180. * [PRIVATE]
  181. *
  182. * Returns the n-th not-whitespace byte in parser and 0 if such a byte does not
  183. * exist
  184. */
  185. static uint8_t xml_parser_peek(struct xml_parser* parser, size_t n) {
  186. size_t position = parser->position;
  187. while (position < parser->length) {
  188. if (!isspace(parser->buffer[position])) {
  189. if (n == 0) {
  190. return parser->buffer[position];
  191. } else {
  192. --n;
  193. }
  194. }
  195. position++;
  196. }
  197. return 0;
  198. }
  199. /**
  200. * [PRIVATE]
  201. *
  202. * Moves the parser's position n bytes. If the new position would be out of
  203. * bounds, it will be converted to the bounds itself
  204. */
  205. static void xml_parser_consume(struct xml_parser* parser, size_t n) {
  206. /* Debug information
  207. */
  208. #ifdef DEBUG
  209. #define min(X,Y) ((X) < (Y) ? (X) : (Y))
  210. char* consumed = alloca((n + 1) * sizeof(char));
  211. memcpy(consumed, &parser->buffer[parser->position], min(n, parser->length - parser->position));
  212. consumed[n] = 0;
  213. #undef min
  214. size_t message_buffer_length = 512;
  215. char* message_buffer = alloca(512 * sizeof(char));
  216. snprintf(message_buffer, message_buffer_length, "Consuming %li bytes \"%s\"", (long)n, consumed);
  217. message_buffer[message_buffer_length - 1] = 0;
  218. xml_parser_info(parser, message_buffer);
  219. #endif
  220. /* Move the position forward
  221. */
  222. parser->position += n;
  223. /* Don't go too far
  224. *
  225. * @warning Valid because parser->length must be greater than 0
  226. */
  227. if (parser->position >= parser->length) {
  228. parser->position = parser->length - 1;
  229. }
  230. }
  231. /**
  232. * [PRIVATE]
  233. *
  234. * Skips to the next non-whitespace character
  235. */
  236. static void xml_skip_whitespace(struct xml_parser* parser) {
  237. xml_parser_info(parser, "whitespace");
  238. while (isspace(parser->buffer[parser->position])) {
  239. if (parser->position + 1 >= parser->length) {
  240. return;
  241. } else {
  242. parser->position++;
  243. }
  244. }
  245. }
  246. /**
  247. * [PRIVATE]
  248. *
  249. * Parses the name out of the an XML tag's ending
  250. *
  251. * ---( Example )---
  252. * tag_name>
  253. * ---
  254. */
  255. static struct xml_string* xml_parse_tag_end(struct xml_parser* parser) {
  256. xml_parser_info(parser, "tag_end");
  257. size_t start = parser->position;
  258. size_t length = 0;
  259. /* Parse until `>' or a whitespace is reached
  260. */
  261. while (start + length < parser->length) {
  262. uint8_t current = xml_parser_peek(parser, CURRENT_CHARACTER);
  263. if (('>' == current) || isspace(current)) {
  264. break;
  265. } else {
  266. xml_parser_consume(parser, 1);
  267. length++;
  268. }
  269. }
  270. /* Consume `>'
  271. */
  272. if ('>' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  273. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_end::expected tag end");
  274. return 0;
  275. }
  276. xml_parser_consume(parser, 1);
  277. /* Return parsed tag name
  278. */
  279. struct xml_string* name = malloc(sizeof(struct xml_string));
  280. name->buffer = &parser->buffer[start];
  281. name->length = length;
  282. return name;
  283. }
  284. /**
  285. * [PRIVATE]
  286. *
  287. * Parses an opening XML tag without attributes
  288. *
  289. * ---( Example )---
  290. * <tag_name>
  291. * ---
  292. */
  293. static struct xml_string* xml_parse_tag_open(struct xml_parser* parser) {
  294. xml_parser_info(parser, "tag_open");
  295. xml_skip_whitespace(parser);
  296. /* Consume `<'
  297. */
  298. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  299. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_open::expected opening tag");
  300. return 0;
  301. }
  302. xml_parser_consume(parser, 1);
  303. /* Consume tag name
  304. */
  305. return xml_parse_tag_end(parser);
  306. }
  307. /**
  308. * [PRIVATE]
  309. *
  310. * Parses an closing XML tag without attributes
  311. *
  312. * ---( Example )---
  313. * </tag_name>
  314. * ---
  315. */
  316. static struct xml_string* xml_parse_tag_close(struct xml_parser* parser) {
  317. xml_parser_info(parser, "tag_close");
  318. xml_skip_whitespace(parser);
  319. /* Consume `</'
  320. */
  321. if ( ('<' != xml_parser_peek(parser, CURRENT_CHARACTER))
  322. || ('/' != xml_parser_peek(parser, NEXT_CHARACTER))) {
  323. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  324. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_close::expected closing tag `<'");
  325. }
  326. if ('/' != xml_parser_peek(parser, NEXT_CHARACTER)) {
  327. xml_parser_error(parser, NEXT_CHARACTER, "xml_parse_tag_close::expected closing tag `/'");
  328. }
  329. return 0;
  330. }
  331. xml_parser_consume(parser, 2);
  332. /* Consume tag name
  333. */
  334. return xml_parse_tag_end(parser);
  335. }
  336. /**
  337. * [PRIVATE]
  338. *
  339. * Parses a tag's content
  340. *
  341. * ---( Example )---
  342. * this is
  343. * a
  344. * tag {} content
  345. * ---
  346. *
  347. * @warning CDATA etc. is _not_ and will never be supported
  348. */
  349. static struct xml_string* xml_parse_content(struct xml_parser* parser) {
  350. xml_parser_info(parser, "content");
  351. /* Whitespace will be ignored
  352. */
  353. xml_skip_whitespace(parser);
  354. size_t start = parser->position;
  355. size_t length = 0;
  356. /* Consume until `<' is reached
  357. */
  358. while (start + length < parser->length) {
  359. uint8_t current = xml_parser_peek(parser, CURRENT_CHARACTER);
  360. if ('<' == current) {
  361. break;
  362. } else {
  363. xml_parser_consume(parser, 1);
  364. length++;
  365. }
  366. }
  367. /* Next character must be an `<' or we have reached end of file
  368. */
  369. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  370. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_content::expected <");
  371. return 0;
  372. }
  373. /* Ignore tailing whitespace
  374. */
  375. while ((length > 0) && isspace(parser->buffer[start + length - 1])) {
  376. length--;
  377. }
  378. /* Return text
  379. */
  380. struct xml_string* content = malloc(sizeof(struct xml_string));
  381. content->buffer = &parser->buffer[start];
  382. content->length = length;
  383. return content;
  384. }
  385. /**
  386. * [PRIVATE]
  387. *
  388. * Parses an XML fragment node
  389. *
  390. * ---( Example without children )---
  391. * <Node>Text</Node>
  392. * ---
  393. *
  394. * ---( Example with children )---
  395. * <Parent>
  396. * <Child>Text</Child>
  397. * <Child>Text</Child>
  398. * <Test>Content</Test>
  399. * </Parent>
  400. * ---
  401. */
  402. static struct xml_node* xml_parse_node(struct xml_parser* parser) {
  403. xml_parser_info(parser, "node");
  404. /* Setup variables
  405. */
  406. struct xml_string* tag_open = 0;
  407. struct xml_string* tag_close = 0;
  408. struct xml_string* content = 0;
  409. struct xml_node** children = calloc(1, sizeof(struct xml_node*));
  410. children[0] = 0;
  411. /* Parse open tag
  412. */
  413. tag_open = xml_parse_tag_open(parser);
  414. if (!tag_open) {
  415. xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag_open");
  416. goto exit_failure;
  417. }
  418. /* If the content does not start with '<', a text content is assumed
  419. */
  420. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  421. content = xml_parse_content(parser);
  422. if (!content) {
  423. xml_parser_error(parser, 0, "xml_parse_node::content");
  424. goto exit_failure;
  425. }
  426. /* Otherwise children are to be expected
  427. */
  428. } else while ('/' != xml_parser_peek(parser, NEXT_CHARACTER)) {
  429. /* Parse child node
  430. */
  431. struct xml_node* child = xml_parse_node(parser);
  432. if (!child) {
  433. xml_parser_error(parser, NEXT_CHARACTER, "xml_parse_node::child");
  434. goto exit_failure;
  435. }
  436. /* Grow child array :)
  437. */
  438. size_t old_elements = get_zero_terminated_array_elements(children);
  439. size_t new_elements = old_elements + 1;
  440. children = realloc(children, (new_elements + 1) * sizeof(struct xml_node*));
  441. /* Save child
  442. */
  443. children[new_elements - 1] = child;
  444. children[new_elements] = 0;
  445. }
  446. /* Parse close tag
  447. */
  448. tag_close = xml_parse_tag_close(parser);
  449. if (!tag_close) {
  450. xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag_close");
  451. goto exit_failure;
  452. }
  453. /* Close tag has to match open tag
  454. */
  455. if (!xml_string_equals(tag_open, tag_close)) {
  456. xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag missmatch");
  457. goto exit_failure;
  458. }
  459. /* Return parsed node
  460. */
  461. xml_string_free(tag_close);
  462. struct xml_node* node = malloc(sizeof(struct xml_node));
  463. node->name = tag_open;
  464. node->content = content;
  465. node->children = children;
  466. return node;
  467. /* A failure occured, so free all allocalted resources
  468. */
  469. exit_failure:
  470. if (tag_open) {
  471. xml_string_free(tag_open);
  472. }
  473. if (tag_close) {
  474. xml_string_free(tag_close);
  475. }
  476. if (content) {
  477. xml_string_free(content);
  478. }
  479. struct xml_node** it = children;
  480. while (*it) {
  481. xml_node_free(*it);
  482. ++it;
  483. }
  484. free(children);
  485. return 0;
  486. }
  487. /**
  488. * [PUBLIC API]
  489. *
  490. *
  491. */
  492. struct xml_document* xml_parse_document(uint8_t* buffer, size_t length) {
  493. /* Initialize parser
  494. */
  495. struct xml_parser parser = {
  496. .buffer = buffer,
  497. .position = 0,
  498. .length = length
  499. };
  500. /* An empty buffer can never contain a valid document
  501. */
  502. if (!length) {
  503. xml_parser_error(&parser, NO_CHARACTER, "xml_parse_document::length equals zero");
  504. return 0;
  505. }
  506. /* Parse the root node
  507. */
  508. struct xml_node* root = xml_parse_node(&parser);
  509. if (!root) {
  510. xml_parser_error(&parser, NO_CHARACTER, "xml_parse_document::parsing document failed");
  511. return 0;
  512. }
  513. /* Return parsed document
  514. */
  515. struct xml_document* document = malloc(sizeof(struct xml_document));
  516. document->buffer.buffer = buffer;
  517. document->buffer.length = length;
  518. document->root = root;
  519. return document;
  520. }
  521. /**
  522. * [PUBLIC API]
  523. */
  524. void xml_document_free(struct xml_document* document, _Bool free_buffer) {
  525. xml_node_free(document->root);
  526. if (free_buffer) {
  527. free(document->buffer.buffer);
  528. }
  529. free(document);
  530. }
  531. /**
  532. * [PUBLIC API]
  533. */
  534. struct xml_node* xml_document_root(struct xml_document* document) {
  535. return document->root;
  536. }
  537. /**
  538. * [PUBLIC API]
  539. */
  540. struct xml_string* xml_node_name(struct xml_node* node) {
  541. return node->name;
  542. }
  543. /**
  544. * [PUBLIC API]
  545. */
  546. struct xml_string* xml_node_content(struct xml_node* node) {
  547. return node->content;
  548. }
  549. /**
  550. * [PUBLIC API]
  551. *
  552. * @warning O(n)
  553. */
  554. size_t xml_node_children(struct xml_node* node) {
  555. return get_zero_terminated_array_elements(node->children);
  556. }
  557. /**
  558. * [PUBLIC API]
  559. */
  560. struct xml_node* xml_node_child(struct xml_node* node, size_t child) {
  561. if (child >= xml_node_children(node)) {
  562. return 0;
  563. }
  564. return node->children[child];
  565. }
  566. /**
  567. * [PUBLIC API]
  568. */
  569. size_t xml_string_length(struct xml_string* string) {
  570. return string->length;
  571. }
  572. /**
  573. * [PUBLIC API]
  574. */
  575. void xml_string_copy(struct xml_string* string, uint8_t* buffer, size_t length) {
  576. #define min(X,Y) ((X) < (Y) ? (X) : (Y))
  577. length = min(length, string->length);
  578. #undef min
  579. memcpy(buffer, string->buffer, length);
  580. }