A simple XML subset parser comparable to GLib's Markup parser, but without any dependencies and contained in a single self-contained file. Forked from https://github.com/ooxi/xml.c
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

899 lines
16 KiB

  1. /**
  2. * Copyright (c) 2012 ooxi/xml.c
  3. * https://github.com/ooxi/xml.c
  4. *
  5. * This software is provided 'as-is', without any express or implied warranty.
  6. * In no event will the authors be held liable for any damages arising from the
  7. * use of this software.
  8. *
  9. * Permission is granted to anyone to use this software for any purpose,
  10. * including commercial applications, and to alter it and redistribute it
  11. * freely, subject to the following restrictions:
  12. *
  13. * 1. The origin of this software must not be misrepresented; you must not
  14. * claim that you wrote the original software. If you use this software in a
  15. * product, an acknowledgment in the product documentation would be
  16. * appreciated but is not required.
  17. *
  18. * 2. Altered source versions must be plainly marked as such, and must not be
  19. * misrepresented as being the original software.
  20. *
  21. * 3. This notice may not be removed or altered from any source distribution.
  22. */
  #include <ctype.h>
  #include <malloc.h>
  #include <stdarg.h>
  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include "xml.h"
  /**
   * [OPAQUE API]
   *
   * A run of UTF-8 bytes described by a start pointer and a byte count.
   */
  struct xml_string {
      /* First byte of the text; never written through this structure */
      const uint8_t *buffer;

      /* Number of bytes that belong to the text */
      size_t length;
  };
  /**
   * [OPAQUE API]
   *
   * One element of the document tree. Every node has a tag name and a
   * 0-terminated array of child nodes; text content is optional.
   */
  struct xml_node {
      /* Tag name of the element */
      struct xml_string *name;

      /* Text content, or 0 when the element carries no text */
      struct xml_string *content;

      /* Child nodes, terminated by a 0 entry */
      struct xml_node **children;
  };
  /**
   * [OPAQUE API]
   *
   * A parsed document: the root node plus the byte buffer it was parsed from.
   */
  struct xml_document {
      /* The bytes handed to the parser and their count */
      struct {
          uint8_t *buffer;
          size_t length;
      } buffer;

      /* Top-level element of the document */
      struct xml_node *root;
  };
  /**
   * [PRIVATE]
   *
   * Parser context: the input bytes and the read cursor into them.
   */
  struct xml_parser {
      /* Input being parsed */
      uint8_t *buffer;

      /* Index of the byte the parser is currently looking at */
      size_t position;

      /* Total number of bytes in `buffer` */
      size_t length;
  };
  /**
   * [PRIVATE]
   *
   * Character offsets relative to the parser position, used when peeking at
   * the input and when reporting errors.
   */
  enum xml_parser_offset {
      /* The error message should not quote a character */
      NO_CHARACTER      = -1,

      /* The character at the parser position */
      CURRENT_CHARACTER =  0,

      /* The character following the current one */
      NEXT_CHARACTER    =  1
  };
  /**
   * [PRIVATE]
   *
   * Counts the entries of a 0-terminated node array.
   *
   * @param nodes Array whose last entry is a 0 pointer
   * @return Number of entries in front of the terminator
   */
  static size_t get_zero_terminated_array_elements(struct xml_node** nodes) {
      struct xml_node** cursor = nodes;

      /* Walk forward until the terminating 0 entry is found */
      while (*cursor) {
          cursor++;
      }
      return (size_t)(cursor - nodes);
  }
  94. /**
  95. * [PRIVATE]
  96. *
  97. * @warning No UTF conversions will be attempted
  98. *
  99. * @return true gdw. a == b
  100. */
  101. static _Bool xml_string_equals(struct xml_string* a, struct xml_string* b) {
  102. if (a->length != b->length) {
  103. return false;
  104. }
  105. size_t i = 0; for (; i < a->length; ++i) {
  106. if (a->buffer[i] != b->buffer[i]) {
  107. return false;
  108. }
  109. }
  110. return true;
  111. }
  112. /**
  113. * [PRIVATE]
  114. */
  115. static uint8_t* xml_string_clone(struct xml_string* s) {
  116. uint8_t* clone = calloc(s->length + 1, sizeof(uint8_t));
  117. xml_string_copy(s, clone, s->length);
  118. clone[s->length] = 0;
  119. return clone;
  120. }
  /**
   * [PRIVATE]
   *
   * Releases the xml_string structure itself.
   *
   * @warning `buffer` must _not_ be freed, since it is a reference to the
   *     document's buffer
   */
  static void xml_string_free(struct xml_string* string) {
      /* Only the descriptor is released; the bytes it points at stay */
      void* descriptor = string;
      free(descriptor);
  }
  132. /**
  133. * [PRIVATE]
  134. *
  135. * Frees the resources allocated by the node
  136. */
  137. static void xml_node_free(struct xml_node* node) {
  138. xml_string_free(node->name);
  139. if (node->content) {
  140. xml_string_free(node->content);
  141. }
  142. struct xml_node** it = node->children;
  143. while (*it) {
  144. xml_node_free(*it);
  145. ++it;
  146. }
  147. free(node->children);
  148. free(node);
  149. }
  /**
   * [PRIVATE]
   *
   * Echoes the parser's call stack for debugging purposes.
   *
   * With XML_PARSER_VERBOSE defined, every call prints `message` to stdout.
   * Otherwise the call expands to a no-op expression, so that
   * `xml_parser_info(p, m);` is a single statement and stays valid as the
   * unbraced body of an `if` that is followed by `else`.
   */
  #ifdef XML_PARSER_VERBOSE
  static void xml_parser_info(struct xml_parser* parser, char const* message) {
      /* The parser is accepted for symmetry with the other helpers only */
      (void)parser;
      fprintf(stdout, "xml_parser_info %s\n", message);
  }
  #else
  #define xml_parser_info(parser, message) ((void)0)
  #endif
  162. /**
  163. * [PRIVATE]
  164. *
  165. * Echos an error regarding the parser's source to the console
  166. */
  167. static void xml_parser_error(struct xml_parser* parser, enum xml_parser_offset offset, char const* message) {
  168. int row = 0;
  169. int column = 0;
  170. #define min(X,Y) ((X) < (Y) ? (X) : (Y))
  171. #define max(X,Y) ((X) > (Y) ? (X) : (Y))
  172. size_t character = max(0, min(parser->length, parser->position + offset));
  173. #undef min
  174. #undef max
  175. size_t position = 0; for (; position < character; ++position) {
  176. column++;
  177. if ('\n' == parser->buffer[position]) {
  178. row++;
  179. column = 0;
  180. }
  181. }
  182. if (NO_CHARACTER != offset) {
  183. fprintf(stderr, "xml_parser_error at %i:%i (is %c): %s\n",
  184. row + 1, column, parser->buffer[character], message
  185. );
  186. } else {
  187. fprintf(stderr, "xml_parser_error at %i:%i: %s\n",
  188. row + 1, column, message
  189. );
  190. }
  191. }
  192. /**
  193. * [PRIVATE]
  194. *
  195. * Returns the n-th not-whitespace byte in parser and 0 if such a byte does not
  196. * exist
  197. */
  198. static uint8_t xml_parser_peek(struct xml_parser* parser, size_t n) {
  199. size_t position = parser->position;
  200. while (position < parser->length) {
  201. if (!isspace(parser->buffer[position])) {
  202. if (n == 0) {
  203. return parser->buffer[position];
  204. } else {
  205. --n;
  206. }
  207. }
  208. position++;
  209. }
  210. return 0;
  211. }
  212. /**
  213. * [PRIVATE]
  214. *
  215. * Moves the parser's position n bytes. If the new position would be out of
  216. * bounds, it will be converted to the bounds itself
  217. */
  218. static void xml_parser_consume(struct xml_parser* parser, size_t n) {
  219. /* Debug information
  220. */
  221. #ifdef XML_PARSER_VERBOSE
  222. #define min(X,Y) ((X) < (Y) ? (X) : (Y))
  223. char* consumed = alloca((n + 1) * sizeof(char));
  224. memcpy(consumed, &parser->buffer[parser->position], min(n, parser->length - parser->position));
  225. consumed[n] = 0;
  226. #undef min
  227. size_t message_buffer_length = 512;
  228. char* message_buffer = alloca(512 * sizeof(char));
  229. snprintf(message_buffer, message_buffer_length, "Consuming %li bytes \"%s\"", (long)n, consumed);
  230. message_buffer[message_buffer_length - 1] = 0;
  231. xml_parser_info(parser, message_buffer);
  232. #endif
  233. /* Move the position forward
  234. */
  235. parser->position += n;
  236. /* Don't go too far
  237. *
  238. * @warning Valid because parser->length must be greater than 0
  239. */
  240. if (parser->position >= parser->length) {
  241. parser->position = parser->length - 1;
  242. }
  243. }
  244. /**
  245. * [PRIVATE]
  246. *
  247. * Skips to the next non-whitespace character
  248. */
  249. static void xml_skip_whitespace(struct xml_parser* parser) {
  250. xml_parser_info(parser, "whitespace");
  251. while (isspace(parser->buffer[parser->position])) {
  252. if (parser->position + 1 >= parser->length) {
  253. return;
  254. } else {
  255. parser->position++;
  256. }
  257. }
  258. }
  259. /**
  260. * [PRIVATE]
  261. *
  262. * Parses the name out of the an XML tag's ending
  263. *
  264. * ---( Example )---
  265. * tag_name>
  266. * ---
  267. */
  268. static struct xml_string* xml_parse_tag_end(struct xml_parser* parser) {
  269. xml_parser_info(parser, "tag_end");
  270. size_t start = parser->position;
  271. size_t length = 0;
  272. /* Parse until `>' or a whitespace is reached
  273. */
  274. while (start + length < parser->length) {
  275. uint8_t current = xml_parser_peek(parser, CURRENT_CHARACTER);
  276. if (('>' == current) || isspace(current)) {
  277. break;
  278. } else {
  279. xml_parser_consume(parser, 1);
  280. length++;
  281. }
  282. }
  283. /* Consume `>'
  284. */
  285. if ('>' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  286. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_end::expected tag end");
  287. return 0;
  288. }
  289. xml_parser_consume(parser, 1);
  290. /* Return parsed tag name
  291. */
  292. struct xml_string* name = malloc(sizeof(struct xml_string));
  293. name->buffer = &parser->buffer[start];
  294. name->length = length;
  295. return name;
  296. }
  297. /**
  298. * [PRIVATE]
  299. *
  300. * Parses an opening XML tag without attributes
  301. *
  302. * ---( Example )---
  303. * <tag_name>
  304. * ---
  305. */
  306. static struct xml_string* xml_parse_tag_open(struct xml_parser* parser) {
  307. xml_parser_info(parser, "tag_open");
  308. xml_skip_whitespace(parser);
  309. /* Consume `<'
  310. */
  311. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  312. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_open::expected opening tag");
  313. return 0;
  314. }
  315. xml_parser_consume(parser, 1);
  316. /* Consume tag name
  317. */
  318. return xml_parse_tag_end(parser);
  319. }
  320. /**
  321. * [PRIVATE]
  322. *
  323. * Parses an closing XML tag without attributes
  324. *
  325. * ---( Example )---
  326. * </tag_name>
  327. * ---
  328. */
  329. static struct xml_string* xml_parse_tag_close(struct xml_parser* parser) {
  330. xml_parser_info(parser, "tag_close");
  331. xml_skip_whitespace(parser);
  332. /* Consume `</'
  333. */
  334. if ( ('<' != xml_parser_peek(parser, CURRENT_CHARACTER))
  335. || ('/' != xml_parser_peek(parser, NEXT_CHARACTER))) {
  336. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  337. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_close::expected closing tag `<'");
  338. }
  339. if ('/' != xml_parser_peek(parser, NEXT_CHARACTER)) {
  340. xml_parser_error(parser, NEXT_CHARACTER, "xml_parse_tag_close::expected closing tag `/'");
  341. }
  342. return 0;
  343. }
  344. xml_parser_consume(parser, 2);
  345. /* Consume tag name
  346. */
  347. return xml_parse_tag_end(parser);
  348. }
  349. /**
  350. * [PRIVATE]
  351. *
  352. * Parses a tag's content
  353. *
  354. * ---( Example )---
  355. * this is
  356. * a
  357. * tag {} content
  358. * ---
  359. *
  360. * @warning CDATA etc. is _not_ and will never be supported
  361. */
  362. static struct xml_string* xml_parse_content(struct xml_parser* parser) {
  363. xml_parser_info(parser, "content");
  364. /* Whitespace will be ignored
  365. */
  366. xml_skip_whitespace(parser);
  367. size_t start = parser->position;
  368. size_t length = 0;
  369. /* Consume until `<' is reached
  370. */
  371. while (start + length < parser->length) {
  372. uint8_t current = xml_parser_peek(parser, CURRENT_CHARACTER);
  373. if ('<' == current) {
  374. break;
  375. } else {
  376. xml_parser_consume(parser, 1);
  377. length++;
  378. }
  379. }
  380. /* Next character must be an `<' or we have reached end of file
  381. */
  382. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  383. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_content::expected <");
  384. return 0;
  385. }
  386. /* Ignore tailing whitespace
  387. */
  388. while ((length > 0) && isspace(parser->buffer[start + length - 1])) {
  389. length--;
  390. }
  391. /* Return text
  392. */
  393. struct xml_string* content = malloc(sizeof(struct xml_string));
  394. content->buffer = &parser->buffer[start];
  395. content->length = length;
  396. return content;
  397. }
  398. /**
  399. * [PRIVATE]
  400. *
  401. * Parses an XML fragment node
  402. *
  403. * ---( Example without children )---
  404. * <Node>Text</Node>
  405. * ---
  406. *
  407. * ---( Example with children )---
  408. * <Parent>
  409. * <Child>Text</Child>
  410. * <Child>Text</Child>
  411. * <Test>Content</Test>
  412. * </Parent>
  413. * ---
  414. */
  415. static struct xml_node* xml_parse_node(struct xml_parser* parser) {
  416. xml_parser_info(parser, "node");
  417. /* Setup variables
  418. */
  419. struct xml_string* tag_open = 0;
  420. struct xml_string* tag_close = 0;
  421. struct xml_string* content = 0;
  422. struct xml_node** children = calloc(1, sizeof(struct xml_node*));
  423. children[0] = 0;
  424. /* Parse open tag
  425. */
  426. tag_open = xml_parse_tag_open(parser);
  427. if (!tag_open) {
  428. xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag_open");
  429. goto exit_failure;
  430. }
  431. /* If the content does not start with '<', a text content is assumed
  432. */
  433. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  434. content = xml_parse_content(parser);
  435. if (!content) {
  436. xml_parser_error(parser, 0, "xml_parse_node::content");
  437. goto exit_failure;
  438. }
  439. /* Otherwise children are to be expected
  440. */
  441. } else while ('/' != xml_parser_peek(parser, NEXT_CHARACTER)) {
  442. /* Parse child node
  443. */
  444. struct xml_node* child = xml_parse_node(parser);
  445. if (!child) {
  446. xml_parser_error(parser, NEXT_CHARACTER, "xml_parse_node::child");
  447. goto exit_failure;
  448. }
  449. /* Grow child array :)
  450. */
  451. size_t old_elements = get_zero_terminated_array_elements(children);
  452. size_t new_elements = old_elements + 1;
  453. children = realloc(children, (new_elements + 1) * sizeof(struct xml_node*));
  454. /* Save child
  455. */
  456. children[new_elements - 1] = child;
  457. children[new_elements] = 0;
  458. }
  459. /* Parse close tag
  460. */
  461. tag_close = xml_parse_tag_close(parser);
  462. if (!tag_close) {
  463. xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag_close");
  464. goto exit_failure;
  465. }
  466. /* Close tag has to match open tag
  467. */
  468. if (!xml_string_equals(tag_open, tag_close)) {
  469. xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag missmatch");
  470. goto exit_failure;
  471. }
  472. /* Return parsed node
  473. */
  474. xml_string_free(tag_close);
  475. struct xml_node* node = malloc(sizeof(struct xml_node));
  476. node->name = tag_open;
  477. node->content = content;
  478. node->children = children;
  479. return node;
  480. /* A failure occured, so free all allocalted resources
  481. */
  482. exit_failure:
  483. if (tag_open) {
  484. xml_string_free(tag_open);
  485. }
  486. if (tag_close) {
  487. xml_string_free(tag_close);
  488. }
  489. if (content) {
  490. xml_string_free(content);
  491. }
  492. struct xml_node** it = children;
  493. while (*it) {
  494. xml_node_free(*it);
  495. ++it;
  496. }
  497. free(children);
  498. return 0;
  499. }
  500. /**
  501. * [PUBLIC API]
  502. */
  503. struct xml_document* xml_parse_document(uint8_t* buffer, size_t length) {
  504. /* Initialize parser
  505. */
  506. struct xml_parser parser = {
  507. .buffer = buffer,
  508. .position = 0,
  509. .length = length
  510. };
  511. /* An empty buffer can never contain a valid document
  512. */
  513. if (!length) {
  514. xml_parser_error(&parser, NO_CHARACTER, "xml_parse_document::length equals zero");
  515. return 0;
  516. }
  517. /* Parse the root node
  518. */
  519. struct xml_node* root = xml_parse_node(&parser);
  520. if (!root) {
  521. xml_parser_error(&parser, NO_CHARACTER, "xml_parse_document::parsing document failed");
  522. return 0;
  523. }
  524. /* Return parsed document
  525. */
  526. struct xml_document* document = malloc(sizeof(struct xml_document));
  527. document->buffer.buffer = buffer;
  528. document->buffer.length = length;
  529. document->root = root;
  530. return document;
  531. }
  /**
   * [PUBLIC API]
   *
   * Reads `source` until end of file and parses everything read as one
   * document. The stream is closed in every case, including failures.
   *
   * @param source Stream to read from; ownership passes to this function
   * @return Newly allocated document that references a heap buffer holding
   *     the stream's bytes (released by xml_document_free when `free_buffer`
   *     is set), or 0 if `source` is 0, reading or parsing failed, or memory
   *     ran out
   */
  struct xml_document* xml_open_document(FILE* source) {
      if (!source) {
          return 0;
      }

      /* Prepare buffer
       */
      size_t const read_chunk = 4096;
      size_t document_length = 0;
      size_t buffer_size = read_chunk;

      uint8_t* buffer = malloc(buffer_size * sizeof(uint8_t));
      if (!buffer) {
          fprintf(stderr, "xml_open_document::out of memory\n");
          fclose(source);
          return 0;
      }

      /* Read whole file into buffer
       */
      for (;;) {

          /* Double the buffer whenever less than one chunk of space is
           * left. The old buffer stays valid if realloc fails.
           */
          if (buffer_size - document_length < read_chunk) {
              uint8_t* grown = 0;
              if (buffer_size <= ((size_t)-1) / 2) {
                  grown = realloc(buffer, buffer_size * 2);
              }
              if (!grown) {
                  fprintf(stderr, "xml_open_document::out of memory\n");
                  free(buffer);
                  fclose(source);
                  return 0;
              }
              buffer = grown;
              buffer_size *= 2;
          }

          size_t read = fread(
              &buffer[document_length],
              sizeof(uint8_t), read_chunk,
              source
          );
          document_length += read;

          /* A short read means end of file or a read error; looping on
           * feof() alone would never terminate after an error.
           */
          if (read < read_chunk) {
              break;
          }
      }

      _Bool const read_failed = (0 != ferror(source));
      fclose(source);

      if (read_failed) {
          fprintf(stderr, "xml_open_document::reading failed\n");
          free(buffer);
          return 0;
      }

      /* Try to parse buffer
       */
      struct xml_document* document = xml_parse_document(buffer, document_length);
      if (!document) {
          free(buffer);
          return 0;
      }
      return document;
  }
  568. /**
  569. * [PUBLIC API]
  570. */
  571. void xml_document_free(struct xml_document* document, _Bool free_buffer) {
  572. xml_node_free(document->root);
  573. if (free_buffer) {
  574. free(document->buffer.buffer);
  575. }
  576. free(document);
  577. }
  578. /**
  579. * [PUBLIC API]
  580. */
  581. struct xml_node* xml_document_root(struct xml_document* document) {
  582. return document->root;
  583. }
  584. /**
  585. * [PUBLIC API]
  586. */
  587. struct xml_string* xml_node_name(struct xml_node* node) {
  588. return node->name;
  589. }
  590. /**
  591. * [PUBLIC API]
  592. */
  593. struct xml_string* xml_node_content(struct xml_node* node) {
  594. return node->content;
  595. }
  596. /**
  597. * [PUBLIC API]
  598. *
  599. * @warning O(n)
  600. */
  601. size_t xml_node_children(struct xml_node* node) {
  602. return get_zero_terminated_array_elements(node->children);
  603. }
  604. /**
  605. * [PUBLIC API]
  606. */
  607. struct xml_node* xml_node_child(struct xml_node* node, size_t child) {
  608. if (child >= xml_node_children(node)) {
  609. return 0;
  610. }
  611. return node->children[child];
  612. }
  613. /**
  614. * [PUBLIC API]
  615. */
  616. struct xml_node* xml_easy_child(struct xml_node* node, uint8_t const* child_name, ...) {
  617. /* Find childrens, one by one
  618. */
  619. struct xml_node* current = node;
  620. va_list arguments;
  621. va_start(arguments, child_name);
  622. /* Descent to current.child
  623. */
  624. while (child_name) {
  625. /* Convert child_name to xml_string for easy comparison
  626. */
  627. struct xml_string cn = {
  628. .buffer = child_name,
  629. .length = strlen(child_name)
  630. };
  631. /* Interate through all children
  632. */
  633. struct xml_node* next = 0;
  634. size_t i = 0; for (; i < xml_node_children(current); ++i) {
  635. struct xml_node* child = xml_node_child(current, i);
  636. if (xml_string_equals(xml_node_name(child), &cn)) {
  637. if (!next) {
  638. next = child;
  639. /* Two children with the same name
  640. */
  641. } else {
  642. return 0;
  643. }
  644. }
  645. }
  646. /* No child with that name found
  647. */
  648. if (!next) {
  649. return 0;
  650. }
  651. current = next;
  652. /* Find name of next child
  653. */
  654. child_name = va_arg(arguments, uint8_t const*);
  655. }
  656. va_end(arguments);
  657. /* Return current element
  658. */
  659. return current;
  660. }
  /**
   * [PUBLIC API]
   *
   * @return Copy of the node's name as made by xml_string_clone, or 0 if
   *     `node` is 0
   */
  uint8_t* xml_easy_name(struct xml_node* node) {
      return node ? xml_string_clone(xml_node_name(node)) : 0;
  }
  /**
   * [PUBLIC API]
   *
   * @return Copy of the node's text content as made by xml_string_clone, or
   *     0 if `node` is 0 or the node has no text content
   */
  uint8_t* xml_easy_content(struct xml_node* node) {
      if (!node) {
          return 0;
      }

      /* Nodes that only have children carry no content string */
      struct xml_string* content = xml_node_content(node);
      if (!content) {
          return 0;
      }
      return xml_string_clone(content);
  }
  679. /**
  680. * [PUBLIC API]
  681. */
  682. size_t xml_string_length(struct xml_string* string) {
  683. if (!string) {
  684. return 0;
  685. }
  686. return string->length;
  687. }
  688. /**
  689. * [PUBLIC API]
  690. */
  691. void xml_string_copy(struct xml_string* string, uint8_t* buffer, size_t length) {
  692. if (!string) {
  693. return;
  694. }
  695. #define min(X,Y) ((X) < (Y) ? (X) : (Y))
  696. length = min(length, string->length);
  697. #undef min
  698. memcpy(buffer, string->buffer, length);
  699. }