Simple XML subset parser comparable to glib's Markup parser, but without any dependencies, in one self-contained file. Forked from https://github.com/ooxi/xml.c
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

915 行
17 KiB

  1. /**
  2. * Copyright (c) 2012 ooxi/xml.c
  3. * https://github.com/ooxi/xml.c
  4. *
  5. * This software is provided 'as-is', without any express or implied warranty.
  6. * In no event will the authors be held liable for any damages arising from the
  7. * use of this software.
  8. *
  9. * Permission is granted to anyone to use this software for any purpose,
  10. * including commercial applications, and to alter it and redistribute it
  11. * freely, subject to the following restrictions:
  12. *
  13. * 1. The origin of this software must not be misrepresented; you must not
  14. * claim that you wrote the original software. If you use this software in a
  15. * product, an acknowledgment in the product documentation would be
  16. * appreciated but is not required.
  17. *
  18. * 2. Altered source versions must be plainly marked as such, and must not be
  19. * misrepresented as being the original software.
  20. *
  21. * 3. This notice may not be removed or altered from any source distribution.
  22. */
#ifdef XML_PARSER_VERBOSE
#include <alloca.h>
#endif
#include <ctype.h>
#include <malloc.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "xml.h"
  33. /**
  34. * [OPAQUE API]
  35. *
  36. * UTF-8 text
  37. */
  38. struct xml_string {
  39. uint8_t const* buffer;
  40. size_t length;
  41. };
  42. /**
  43. * [OPAQUE API]
  44. *
  45. * An xml_node will always contain a tag name and a 0-terminated list of
  46. * children. Moreover it may contain text content.
  47. */
  48. struct xml_node {
  49. struct xml_string* name;
  50. struct xml_string* content;
  51. struct xml_node** children;
  52. };
  53. /**
  54. * [OPAQUE API]
  55. *
  56. * An xml_document simply contains the root node and the underlying buffer
  57. */
  58. struct xml_document {
  59. struct {
  60. uint8_t* buffer;
  61. size_t length;
  62. } buffer;
  63. struct xml_node* root;
  64. };
  65. /**
  66. * [PRIVATE]
  67. *
  68. * Parser context
  69. */
  70. struct xml_parser {
  71. uint8_t* buffer;
  72. size_t position;
  73. size_t length;
  74. };
  75. /**
  76. * [PRIVATE]
  77. *
  78. * Character offsets
  79. */
  80. enum xml_parser_offset {
  81. NO_CHARACTER = -1,
  82. CURRENT_CHARACTER = 0,
  83. NEXT_CHARACTER = 1,
  84. };
  85. /**
  86. * [PRIVATE]
  87. *
  88. * @return Number of elements in 0-terminated array
  89. */
  90. static size_t get_zero_terminated_array_elements(struct xml_node** nodes) {
  91. size_t elements = 0;
  92. while (nodes[elements]) {
  93. ++elements;
  94. }
  95. return elements;
  96. }
  97. /**
  98. * [PRIVATE]
  99. *
  100. * @warning No UTF conversions will be attempted
  101. *
  102. * @return true gdw. a == b
  103. */
  104. static _Bool xml_string_equals(struct xml_string* a, struct xml_string* b) {
  105. if (a->length != b->length) {
  106. return false;
  107. }
  108. size_t i = 0; for (; i < a->length; ++i) {
  109. if (a->buffer[i] != b->buffer[i]) {
  110. return false;
  111. }
  112. }
  113. return true;
  114. }
  115. /**
  116. * [PRIVATE]
  117. */
  118. static uint8_t* xml_string_clone(struct xml_string* s) {
  119. if (!s) {
  120. return 0;
  121. }
  122. uint8_t* clone = calloc(s->length + 1, sizeof(uint8_t));
  123. xml_string_copy(s, clone, s->length);
  124. clone[s->length] = 0;
  125. return clone;
  126. }
  127. /**
  128. * [PRIVATE]
  129. *
  130. * Frees the resources allocated by the string
  131. *
  132. * @warning `buffer` must _not_ be freed, since it is a reference to the
  133. * document's buffer
  134. */
  135. static void xml_string_free(struct xml_string* string) {
  136. free(string);
  137. }
  138. /**
  139. * [PRIVATE]
  140. *
  141. * Frees the resources allocated by the node
  142. */
  143. static void xml_node_free(struct xml_node* node) {
  144. xml_string_free(node->name);
  145. if (node->content) {
  146. xml_string_free(node->content);
  147. }
  148. struct xml_node** it = node->children;
  149. while (*it) {
  150. xml_node_free(*it);
  151. ++it;
  152. }
  153. free(node->children);
  154. free(node);
  155. }
  156. /**
  157. * [PRIVATE]
  158. *
  159. * Echos the parsers call stack for debugging purposes
  160. */
  161. #ifdef XML_PARSER_VERBOSE
  162. static void xml_parser_info(struct xml_parser* parser, char const* message) {
  163. fprintf(stdout, "xml_parser_info %s\n", message);
  164. }
  165. #else
  166. #define xml_parser_info(parser, message) {}
  167. #endif
  168. /**
  169. * [PRIVATE]
  170. *
  171. * Echos an error regarding the parser's source to the console
  172. */
  173. static void xml_parser_error(struct xml_parser* parser, enum xml_parser_offset offset, char const* message) {
  174. int row = 0;
  175. int column = 0;
  176. #define min(X,Y) ((X) < (Y) ? (X) : (Y))
  177. #define max(X,Y) ((X) > (Y) ? (X) : (Y))
  178. size_t character = max(0, min(parser->length, parser->position + offset));
  179. #undef min
  180. #undef max
  181. size_t position = 0; for (; position < character; ++position) {
  182. column++;
  183. if ('\n' == parser->buffer[position]) {
  184. row++;
  185. column = 0;
  186. }
  187. }
  188. if (NO_CHARACTER != offset) {
  189. fprintf(stderr, "xml_parser_error at %i:%i (is %c): %s\n",
  190. row + 1, column, parser->buffer[character], message
  191. );
  192. } else {
  193. fprintf(stderr, "xml_parser_error at %i:%i: %s\n",
  194. row + 1, column, message
  195. );
  196. }
  197. }
  198. /**
  199. * [PRIVATE]
  200. *
  201. * Returns the n-th not-whitespace byte in parser and 0 if such a byte does not
  202. * exist
  203. */
  204. static uint8_t xml_parser_peek(struct xml_parser* parser, size_t n) {
  205. size_t position = parser->position;
  206. while (position < parser->length) {
  207. if (!isspace(parser->buffer[position])) {
  208. if (n == 0) {
  209. return parser->buffer[position];
  210. } else {
  211. --n;
  212. }
  213. }
  214. position++;
  215. }
  216. return 0;
  217. }
  218. /**
  219. * [PRIVATE]
  220. *
  221. * Moves the parser's position n bytes. If the new position would be out of
  222. * bounds, it will be converted to the bounds itself
  223. */
  224. static void xml_parser_consume(struct xml_parser* parser, size_t n) {
  225. /* Debug information
  226. */
  227. #ifdef XML_PARSER_VERBOSE
  228. #define min(X,Y) ((X) < (Y) ? (X) : (Y))
  229. char* consumed = alloca((n + 1) * sizeof(char));
  230. memcpy(consumed, &parser->buffer[parser->position], min(n, parser->length - parser->position));
  231. consumed[n] = 0;
  232. #undef min
  233. size_t message_buffer_length = 512;
  234. char* message_buffer = alloca(512 * sizeof(char));
  235. snprintf(message_buffer, message_buffer_length, "Consuming %li bytes \"%s\"", (long)n, consumed);
  236. message_buffer[message_buffer_length - 1] = 0;
  237. xml_parser_info(parser, message_buffer);
  238. #endif
  239. /* Move the position forward
  240. */
  241. parser->position += n;
  242. /* Don't go too far
  243. *
  244. * @warning Valid because parser->length must be greater than 0
  245. */
  246. if (parser->position >= parser->length) {
  247. parser->position = parser->length - 1;
  248. }
  249. }
  250. /**
  251. * [PRIVATE]
  252. *
  253. * Skips to the next non-whitespace character
  254. */
  255. static void xml_skip_whitespace(struct xml_parser* parser) {
  256. xml_parser_info(parser, "whitespace");
  257. while (isspace(parser->buffer[parser->position])) {
  258. if (parser->position + 1 >= parser->length) {
  259. return;
  260. } else {
  261. parser->position++;
  262. }
  263. }
  264. }
  265. /**
  266. * [PRIVATE]
  267. *
  268. * Parses the name out of the an XML tag's ending
  269. *
  270. * ---( Example )---
  271. * tag_name>
  272. * ---
  273. */
  274. static struct xml_string* xml_parse_tag_end(struct xml_parser* parser) {
  275. xml_parser_info(parser, "tag_end");
  276. size_t start = parser->position;
  277. size_t length = 0;
  278. /* Parse until `>' or a whitespace is reached
  279. */
  280. while (start + length < parser->length) {
  281. uint8_t current = xml_parser_peek(parser, CURRENT_CHARACTER);
  282. if (('>' == current) || isspace(current)) {
  283. break;
  284. } else {
  285. xml_parser_consume(parser, 1);
  286. length++;
  287. }
  288. }
  289. /* Consume `>'
  290. */
  291. if ('>' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  292. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_end::expected tag end");
  293. return 0;
  294. }
  295. xml_parser_consume(parser, 1);
  296. /* Return parsed tag name
  297. */
  298. struct xml_string* name = malloc(sizeof(struct xml_string));
  299. name->buffer = &parser->buffer[start];
  300. name->length = length;
  301. return name;
  302. }
  303. /**
  304. * [PRIVATE]
  305. *
  306. * Parses an opening XML tag without attributes
  307. *
  308. * ---( Example )---
  309. * <tag_name>
  310. * ---
  311. */
  312. static struct xml_string* xml_parse_tag_open(struct xml_parser* parser) {
  313. xml_parser_info(parser, "tag_open");
  314. xml_skip_whitespace(parser);
  315. /* Consume `<'
  316. */
  317. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  318. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_open::expected opening tag");
  319. return 0;
  320. }
  321. xml_parser_consume(parser, 1);
  322. /* Consume tag name
  323. */
  324. return xml_parse_tag_end(parser);
  325. }
  326. /**
  327. * [PRIVATE]
  328. *
  329. * Parses an closing XML tag without attributes
  330. *
  331. * ---( Example )---
  332. * </tag_name>
  333. * ---
  334. */
  335. static struct xml_string* xml_parse_tag_close(struct xml_parser* parser) {
  336. xml_parser_info(parser, "tag_close");
  337. xml_skip_whitespace(parser);
  338. /* Consume `</'
  339. */
  340. if ( ('<' != xml_parser_peek(parser, CURRENT_CHARACTER))
  341. || ('/' != xml_parser_peek(parser, NEXT_CHARACTER))) {
  342. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  343. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_close::expected closing tag `<'");
  344. }
  345. if ('/' != xml_parser_peek(parser, NEXT_CHARACTER)) {
  346. xml_parser_error(parser, NEXT_CHARACTER, "xml_parse_tag_close::expected closing tag `/'");
  347. }
  348. return 0;
  349. }
  350. xml_parser_consume(parser, 2);
  351. /* Consume tag name
  352. */
  353. return xml_parse_tag_end(parser);
  354. }
  355. /**
  356. * [PRIVATE]
  357. *
  358. * Parses a tag's content
  359. *
  360. * ---( Example )---
  361. * this is
  362. * a
  363. * tag {} content
  364. * ---
  365. *
  366. * @warning CDATA etc. is _not_ and will never be supported
  367. */
  368. static struct xml_string* xml_parse_content(struct xml_parser* parser) {
  369. xml_parser_info(parser, "content");
  370. /* Whitespace will be ignored
  371. */
  372. xml_skip_whitespace(parser);
  373. size_t start = parser->position;
  374. size_t length = 0;
  375. /* Consume until `<' is reached
  376. */
  377. while (start + length < parser->length) {
  378. uint8_t current = xml_parser_peek(parser, CURRENT_CHARACTER);
  379. if ('<' == current) {
  380. break;
  381. } else {
  382. xml_parser_consume(parser, 1);
  383. length++;
  384. }
  385. }
  386. /* Next character must be an `<' or we have reached end of file
  387. */
  388. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  389. xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_content::expected <");
  390. return 0;
  391. }
  392. /* Ignore tailing whitespace
  393. */
  394. while ((length > 0) && isspace(parser->buffer[start + length - 1])) {
  395. length--;
  396. }
  397. /* Return text
  398. */
  399. struct xml_string* content = malloc(sizeof(struct xml_string));
  400. content->buffer = &parser->buffer[start];
  401. content->length = length;
  402. return content;
  403. }
  404. /**
  405. * [PRIVATE]
  406. *
  407. * Parses an XML fragment node
  408. *
  409. * ---( Example without children )---
  410. * <Node>Text</Node>
  411. * ---
  412. *
  413. * ---( Example with children )---
  414. * <Parent>
  415. * <Child>Text</Child>
  416. * <Child>Text</Child>
  417. * <Test>Content</Test>
  418. * </Parent>
  419. * ---
  420. */
  421. static struct xml_node* xml_parse_node(struct xml_parser* parser) {
  422. xml_parser_info(parser, "node");
  423. /* Setup variables
  424. */
  425. struct xml_string* tag_open = 0;
  426. struct xml_string* tag_close = 0;
  427. struct xml_string* content = 0;
  428. struct xml_node** children = calloc(1, sizeof(struct xml_node*));
  429. children[0] = 0;
  430. /* Parse open tag
  431. */
  432. tag_open = xml_parse_tag_open(parser);
  433. if (!tag_open) {
  434. xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag_open");
  435. goto exit_failure;
  436. }
  437. /* If tag ends with `/' it's self closing, skip content lookup */
  438. if (tag_open->length > 0 && '/' == tag_open->buffer[tag_open->length - 1]) {
  439. /* Drop `/'
  440. */
  441. --tag_open->length;
  442. goto node_creation;
  443. }
  444. /* If the content does not start with '<', a text content is assumed
  445. */
  446. if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) {
  447. content = xml_parse_content(parser);
  448. if (!content) {
  449. xml_parser_error(parser, 0, "xml_parse_node::content");
  450. goto exit_failure;
  451. }
  452. /* Otherwise children are to be expected
  453. */
  454. } else while ('/' != xml_parser_peek(parser, NEXT_CHARACTER)) {
  455. /* Parse child node
  456. */
  457. struct xml_node* child = xml_parse_node(parser);
  458. if (!child) {
  459. xml_parser_error(parser, NEXT_CHARACTER, "xml_parse_node::child");
  460. goto exit_failure;
  461. }
  462. /* Grow child array :)
  463. */
  464. size_t old_elements = get_zero_terminated_array_elements(children);
  465. size_t new_elements = old_elements + 1;
  466. children = realloc(children, (new_elements + 1) * sizeof(struct xml_node*));
  467. /* Save child
  468. */
  469. children[new_elements - 1] = child;
  470. children[new_elements] = 0;
  471. }
  472. /* Parse close tag
  473. */
  474. tag_close = xml_parse_tag_close(parser);
  475. if (!tag_close) {
  476. xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag_close");
  477. goto exit_failure;
  478. }
  479. /* Close tag has to match open tag
  480. */
  481. if (!xml_string_equals(tag_open, tag_close)) {
  482. xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag missmatch");
  483. goto exit_failure;
  484. }
  485. /* Return parsed node
  486. */
  487. xml_string_free(tag_close);
  488. node_creation:;
  489. struct xml_node* node = malloc(sizeof(struct xml_node));
  490. node->name = tag_open;
  491. node->content = content;
  492. node->children = children;
  493. return node;
  494. /* A failure occured, so free all allocalted resources
  495. */
  496. exit_failure:
  497. if (tag_open) {
  498. xml_string_free(tag_open);
  499. }
  500. if (tag_close) {
  501. xml_string_free(tag_close);
  502. }
  503. if (content) {
  504. xml_string_free(content);
  505. }
  506. struct xml_node** it = children;
  507. while (*it) {
  508. xml_node_free(*it);
  509. ++it;
  510. }
  511. free(children);
  512. return 0;
  513. }
  514. /**
  515. * [PUBLIC API]
  516. */
  517. struct xml_document* xml_parse_document(uint8_t* buffer, size_t length) {
  518. /* Initialize parser
  519. */
  520. struct xml_parser parser = {
  521. .buffer = buffer,
  522. .position = 0,
  523. .length = length
  524. };
  525. /* An empty buffer can never contain a valid document
  526. */
  527. if (!length) {
  528. xml_parser_error(&parser, NO_CHARACTER, "xml_parse_document::length equals zero");
  529. return 0;
  530. }
  531. /* Parse the root node
  532. */
  533. struct xml_node* root = xml_parse_node(&parser);
  534. if (!root) {
  535. xml_parser_error(&parser, NO_CHARACTER, "xml_parse_document::parsing document failed");
  536. return 0;
  537. }
  538. /* Return parsed document
  539. */
  540. struct xml_document* document = malloc(sizeof(struct xml_document));
  541. document->buffer.buffer = buffer;
  542. document->buffer.length = length;
  543. document->root = root;
  544. return document;
  545. }
  546. /**
  547. * [PUBLIC API]
  548. */
  549. struct xml_document* xml_open_document(FILE* source) {
  550. /* Prepare buffer
  551. */
  552. size_t const read_chunk = 1; // TODO 4096;
  553. size_t document_length = 0;
  554. size_t buffer_size = 1; // TODO 4069
  555. uint8_t* buffer = malloc(buffer_size * sizeof(uint8_t));
  556. /* Read hole file into buffer
  557. */
  558. while (!feof(source)) {
  559. /* Reallocate buffer
  560. */
  561. if (buffer_size - document_length < read_chunk) {
  562. buffer = realloc(buffer, buffer_size + 2 * read_chunk);
  563. buffer_size += 2 * read_chunk;
  564. }
  565. size_t read = fread(
  566. &buffer[document_length],
  567. sizeof(uint8_t), read_chunk,
  568. source
  569. );
  570. document_length += read;
  571. }
  572. fclose(source);
  573. /* Try to parse buffer
  574. */
  575. struct xml_document* document = xml_parse_document(buffer, document_length);
  576. if (!document) {
  577. free(buffer);
  578. return 0;
  579. }
  580. return document;
  581. }
  582. /**
  583. * [PUBLIC API]
  584. */
  585. void xml_document_free(struct xml_document* document, bool free_buffer) {
  586. xml_node_free(document->root);
  587. if (free_buffer) {
  588. free(document->buffer.buffer);
  589. }
  590. free(document);
  591. }
  592. /**
  593. * [PUBLIC API]
  594. */
  595. struct xml_node* xml_document_root(struct xml_document* document) {
  596. return document->root;
  597. }
  598. /**
  599. * [PUBLIC API]
  600. */
  601. struct xml_string* xml_node_name(struct xml_node* node) {
  602. return node->name;
  603. }
  604. /**
  605. * [PUBLIC API]
  606. */
  607. struct xml_string* xml_node_content(struct xml_node* node) {
  608. return node->content;
  609. }
  610. /**
  611. * [PUBLIC API]
  612. *
  613. * @warning O(n)
  614. */
  615. size_t xml_node_children(struct xml_node* node) {
  616. return get_zero_terminated_array_elements(node->children);
  617. }
  618. /**
  619. * [PUBLIC API]
  620. */
  621. struct xml_node* xml_node_child(struct xml_node* node, size_t child) {
  622. if (child >= xml_node_children(node)) {
  623. return 0;
  624. }
  625. return node->children[child];
  626. }
  627. /**
  628. * [PUBLIC API]
  629. */
  630. struct xml_node* xml_easy_child(struct xml_node* node, uint8_t const* child_name, ...) {
  631. /* Find children, one by one
  632. */
  633. struct xml_node* current = node;
  634. va_list arguments;
  635. va_start(arguments, child_name);
  636. /* Descent to current.child
  637. */
  638. while (child_name) {
  639. /* Convert child_name to xml_string for easy comparison
  640. */
  641. struct xml_string cn = {
  642. .buffer = child_name,
  643. .length = strlen(child_name)
  644. };
  645. /* Interate through all children
  646. */
  647. struct xml_node* next = 0;
  648. size_t i = 0; for (; i < xml_node_children(current); ++i) {
  649. struct xml_node* child = xml_node_child(current, i);
  650. if (xml_string_equals(xml_node_name(child), &cn)) {
  651. if (!next) {
  652. next = child;
  653. /* Two children with the same name
  654. */
  655. } else {
  656. return 0;
  657. }
  658. }
  659. }
  660. /* No child with that name found
  661. */
  662. if (!next) {
  663. return 0;
  664. }
  665. current = next;
  666. /* Find name of next child
  667. */
  668. child_name = va_arg(arguments, uint8_t const*);
  669. }
  670. va_end(arguments);
  671. /* Return current element
  672. */
  673. return current;
  674. }
  675. /**
  676. * [PUBLIC API]
  677. */
  678. uint8_t* xml_easy_name(struct xml_node* node) {
  679. if (!node) {
  680. return 0;
  681. }
  682. return xml_string_clone(xml_node_name(node));
  683. }
  684. /**
  685. * [PUBLIC API]
  686. */
  687. uint8_t* xml_easy_content(struct xml_node* node) {
  688. if (!node) {
  689. return 0;
  690. }
  691. return xml_string_clone(xml_node_content(node));
  692. }
  693. /**
  694. * [PUBLIC API]
  695. */
  696. size_t xml_string_length(struct xml_string* string) {
  697. if (!string) {
  698. return 0;
  699. }
  700. return string->length;
  701. }
  702. /**
  703. * [PUBLIC API]
  704. */
  705. void xml_string_copy(struct xml_string* string, uint8_t* buffer, size_t length) {
  706. if (!string) {
  707. return;
  708. }
  709. #define min(X,Y) ((X) < (Y) ? (X) : (Y))
  710. length = min(length, string->length);
  711. #undef min
  712. memcpy(buffer, string->buffer, length);
  713. }