diff options
Diffstat (limited to 'source/kit/xml.c')
-rw-r--r-- | source/kit/xml.c | 437 |
1 files changed, 437 insertions, 0 deletions
diff --git a/source/kit/xml.c b/source/kit/xml.c new file mode 100644 index 0000000..c62ee82 --- /dev/null +++ b/source/kit/xml.c @@ -0,0 +1,437 @@ +#include "xml.h" + +#include "input_buffer.h" +#include <assert.h> + +typedef struct { + ib_t last; + kit_str_builder_t text; + kit_da_xml_t tags; +} kit_xml_intermediate_t; + +static kit_status_t kit_xml_unescape_(str_builder_t *str) { + assert(str != NULL); + + str_builder_t buf; + DA_INIT(buf, str->size, str->alloc); + buf.size = 0; + + for (i64 i = 0; i < str->size; i++) + if (str->values[i] != '&') + buf.values[buf.size++] = str->values[i]; + else { + i64 n = 1; + while (i + n < str->size && str->values[i + n] != ';') n++; + if (i + n >= str->size) { + DA_DESTROY(buf); + return KIT_ERROR_INTERNAL; + } + if (n == 3 && memcmp(str->values + i, "<", 4) == 0) + buf.values[buf.size++] = '<'; + else if (n == 3 && memcmp(str->values + i, ">", 4) == 0) + buf.values[buf.size++] = '>'; + else if (n == 4 && memcmp(str->values + i, "&", 5) == 0) + buf.values[buf.size++] = '&'; + else if (n == 5 && memcmp(str->values + i, """, 6) == 0) + buf.values[buf.size++] = '"'; + else if (n == 5 && memcmp(str->values + i, "'", 6) == 0) + buf.values[buf.size++] = '\''; + else { + DA_DESTROY(buf); + return KIT_ERROR_INTERNAL; + } + i += n; + } + + DA_DESTROY(*str); + *str = buf; + + return KIT_OK; +} + +static ib_t kit_xml_parse_text_(ib_t begin) { + ib_t text = ib_until(begin, SZ("<")); + ib_t last = ib_copy(text); + + for (;;) { + ib_t comment_open = ib_exact(last, SZ("<!--")); + + if (comment_open.status != KIT_OK) { + ib_destroy(comment_open); + break; + } + + ib_t comment_text = ib_until(comment_open, SZ("-->")); + ib_t comment_close = ib_exact(comment_text, SZ("-->")); + ib_t next_text = ib_until(comment_close, SZ("<")); + + if (next_text.status == KIT_OK && next_text.data.size > 0) { + i64 n = text.data.size; + DA_RESIZE(text.data, n + next_text.data.size); + + assert(text.data.size == n + next_text.data.size); + if (text.data.size != n + next_text.data.size) + next_text.status = KIT_ERROR_BAD_ALLOC; + else + memcpy(text.data.values + n, next_text.data.values, + next_text.data.size); + } + + ib_destroy(last); + last = ib_copy(next_text); + + ib_destroy(comment_open); + ib_destroy(comment_text); + ib_destroy(comment_close); + ib_destroy(next_text); + } + + // move + DA_DESTROY(last.data); + last.data = text.data; + memset(&text.data, 0, sizeof text.data); + + kit_status_t s = kit_xml_unescape_(&last.data); + if (s != KIT_OK) + last.status = s; + + ib_destroy(text); + + return last; +} + +static ib_t kit_xml_parse_string_(ib_t begin) { + ib_t quotes_open = ib_exact(begin, SZ("\"")); + ib_t apostr_open = ib_exact(begin, SZ("'")); + + ib_t open = quotes_open.status == KIT_OK ? quotes_open + : apostr_open; + + ib_t text = ib_until(open, WRAP_STR(open.data)); + ib_t close = ib_exact(text, WRAP_STR(open.data)); + + // move + DA_DESTROY(close.data); + close.data = text.data; + memset(&text.data, 0, sizeof text.data); + + kit_status_t s = kit_xml_unescape_(&close.data); + if (s == KIT_OK) + close.status = s; + + ib_destroy(quotes_open); + ib_destroy(apostr_open); + ib_destroy(text); + + return close; +} + +static kit_xml_intermediate_t kit_xml_parse_buf_( + ib_t begin, kit_allocator_t *alloc) { + kit_xml_intermediate_t res; + memset(&res, 0, sizeof res); + + ib_t last, spaces; + memset(&last, 0, sizeof last); + memset(&spaces, 0, sizeof spaces); + + ib_t tag_text = kit_xml_parse_text_(begin); + last = ib_copy(tag_text); + + DA_INIT(res.tags, 0, alloc); + + for (;;) { + ib_t tagend_open = ib_exact(last, SZ("</")); + ib_destroy(tagend_open); + if (tagend_open.status == KIT_OK) + break; + + ib_t tag_open = ib_exact(last, SZ("<")); + + if (tag_open.status != KIT_OK) { + ib_destroy(tag_open); + break; + } + + xml_t tag; + memset(&tag, 0, sizeof tag); + + ib_t decl_open = ib_exact(tag_open, SZ("?")); + + ib_destroy(last); + + if (decl_open.status == KIT_OK) { + tag.is_declaration = 1; + last = ib_copy(decl_open); + } else + last = ib_copy(tag_open); + + ib_destroy(decl_open); + + spaces = ib_any(last, SZ(" \t\r\n")); + ib_t tag_name = ib_none(spaces, SZ(" \t\r\n/>")); + ib_destroy(spaces); + + DA_INIT(tag.properties, 0, alloc); + + ib_destroy(last); + last = ib_copy(tag_name); + + for (;;) { + spaces = ib_any(last, SZ(" \t\r\n")); + ib_t property = ib_none(spaces, SZ(" \t\r\n=?/>")); + ib_destroy(spaces); + + if (property.status != KIT_OK || property.data.size == 0) { + ib_destroy(property); + break; + } + + spaces = ib_any(property, SZ(" \t\r\n")); + ib_t equals = ib_exact(spaces, SZ("=")); + ib_destroy(spaces); + spaces = ib_any(equals, SZ(" \t\r\n")); + ib_t value = kit_xml_parse_string_(spaces); + ib_destroy(spaces); + + ib_destroy(last); + last = ib_copy(value); + + if (last.status == KIT_OK) { + i64 n = tag.properties.size; + DA_RESIZE(tag.properties, n + 1); + + assert(tag.properties.size == n + 1); + if (tag.properties.size != n + 1) { + last.status = KIT_ERROR_BAD_ALLOC; + DA_DESTROY(tag.properties); + } else { + // move + tag.properties.values[n].name = property.data; + memset(&property.data, 0, sizeof property.data); + + // move + tag.properties.values[n].value = value.data; + memset(&value.data, 0, sizeof value.data); + } + } + + ib_destroy(property); + ib_destroy(equals); + ib_destroy(value); + } + + spaces = ib_any(last, SZ(" \t\r\n")); + + if (tag.is_declaration) { + ib_t tag_decl_close = ib_exact(spaces, SZ("?>")); + ib_destroy(spaces); + + ib_destroy(last); + last = tag_decl_close; + + DA_INIT(tag.text, 0, alloc); + DA_INIT(tag.children, 0, alloc); + } else { + ib_t tag_close = ib_exact(spaces, SZ(">")); + ib_t tag_close_empty = ib_exact(spaces, SZ("/>")); + ib_destroy(spaces); + + if (tag_close.status == KIT_OK) { + kit_xml_intermediate_t im = kit_xml_parse_buf_(tag_close, + alloc); + tag.text = im.text; + tag.children = im.tags; + + tagend_open = ib_exact(im.last, SZ("</")); + ib_destroy(im.last); + spaces = ib_any(tagend_open, SZ(" \t\r\n")); + ib_t tagend_name = ib_exact(spaces, WRAP_STR(tag_name.data)); + ib_destroy(spaces); + spaces = ib_any(tagend_name, SZ(" \t\r\n")); + ib_t tagend_close = ib_exact(spaces, SZ(">")); + ib_destroy(spaces); + ib_destroy(tagend_open); + ib_destroy(tagend_name); + + ib_destroy(last); + last = tagend_close; + + } else if (tag_close_empty.status == KIT_OK) { + ib_destroy(last); + last = ib_copy(tag_close_empty); + + DA_INIT(tag.text, 0, alloc); + DA_INIT(tag.children, 0, alloc); + } else + last.status = KIT_ERROR_INTERNAL; + + ib_destroy(tag_close); + ib_destroy(tag_close_empty); + } + + ib_t tag_tail = kit_xml_parse_text_(last); + + ib_destroy(last); + last = ib_copy(tag_tail); + + if (last.status == KIT_OK) { + i64 n = res.tags.size; + DA_RESIZE(res.tags, n + 1); + + assert(res.tags.size == n + 1); + if (res.tags.size != n + 1) { + last.status = KIT_ERROR_BAD_ALLOC; + xml_destroy(&tag); + } else { + // move + tag.tag = tag_name.data; + memset(&tag_name.data, 0, sizeof tag_name.data); + + // move + tag.tail = tag_tail.data; + memset(&tag_tail.data, 0, sizeof tag_tail.data); + + res.tags.values[n] = tag; + } + } else + xml_destroy(&tag); + + ib_destroy(tag_open); + ib_destroy(tag_name); + ib_destroy(tag_tail); + } + + if (last.status != KIT_OK) { + for (i64 i = 0; i < res.tags.size; i++) + xml_destroy(res.tags.values + i); + DA_DESTROY(res.text); + DA_DESTROY(res.tags); + } else { + // move + res.text = tag_text.data; + memset(&tag_text.data, 0, sizeof tag_text.data); + } + + ib_destroy(tag_text); + + res.last = last; + return res; +} + +kit_xml_parse_result_t kit_xml_parse(kit_is_handle_t is, + kit_allocator_t *alloc) { + ib_t ib = ib_wrap(is, alloc); + kit_xml_intermediate_t im = kit_xml_parse_buf_(ib, alloc); + ib_destroy(ib); + + kit_xml_parse_result_t res; + memset(&res, 0, sizeof res); + + res.status = im.last.status; + ib_destroy(im.last); + + if (res.status != KIT_OK) + return res; + + if (im.text.size == 0 && im.tags.size == 1) { + res.xml = im.tags.values[0]; + DA_DESTROY(im.text); + DA_DESTROY(im.tags); + return res; + } + + DA_INIT(res.xml.tag, 0, alloc); + DA_INIT(res.xml.tail, 0, alloc); + DA_INIT(res.xml.properties, 0, alloc); + + res.xml.text = im.text; + res.xml.children = im.tags; + + return res; +} + +kit_xml_text_t kit_xml_print(kit_xml_t *xml, kit_allocator_t *alloc) { + assert(xml != NULL); + + xml_text_t result; + memset(&result, 0, sizeof result); + + result.status = KIT_ERROR_NOT_IMPLEMENTED; + return result; +} + +static kit_status_t kit_xml_append_text_(str_builder_t *buf, + xml_t *xml) { + assert(buf != NULL); + assert(xml != NULL); + + i64 n = buf->size; + DA_RESIZE(*buf, n + xml->text.size); + + assert(buf->size == n + xml->text.size); + if (buf->size != n + xml->text.size) + return KIT_ERROR_BAD_ALLOC; + + if (xml->text.size > 0) + memcpy(buf->values + n, xml->text.values, xml->text.size); + + for (i64 i = 0; i < xml->children.size; i++) { + kit_status_t s = kit_xml_append_text_(buf, + xml->children.values + i); + if (s != KIT_OK) + return s; + + str_t tail = WRAP_STR(xml->children.values[i].tail); + + if (tail.size <= 0) + continue; + + n = buf->size; + DA_RESIZE(*buf, n + tail.size); + + assert(buf->size == n + tail.size); + if (buf->size != n + tail.size) + return KIT_ERROR_BAD_ALLOC; + + if (tail.size > 0) + memcpy(buf->values + n, tail.values, tail.size); + } + + return KIT_OK; +} + +kit_xml_text_t kit_xml_full_text(kit_xml_t *xml, + kit_allocator_t *alloc) { + kit_xml_text_t res; + res.status = KIT_OK; + DA_INIT(res.text, 0, alloc); + + if (xml != NULL) + res.status = kit_xml_append_text_(&res.text, xml); + else + res.status = KIT_ERROR_INVALID_ARGUMENT; + + return res; +} + +void kit_xml_destroy(kit_xml_t *xml) { + assert(xml != NULL); + if (xml == NULL) + return; + + for (i64 i = 0; i < xml->properties.size; i++) { + DA_DESTROY(xml->properties.values[i].name); + DA_DESTROY(xml->properties.values[i].value); + } + + for (i64 i = 0; i < xml->children.size; i++) + kit_xml_destroy(xml->children.values + i); + + DA_DESTROY(xml->tag); + DA_DESTROY(xml->text); + DA_DESTROY(xml->tail); + + DA_DESTROY(xml->properties); + DA_DESTROY(xml->children); +} |