#include "xml.h"

#include "input_buffer.h"

#include <assert.h>

//  Result of parsing one nesting level of the document.
//
//  last - token at which parsing of this level stopped; its status
//         carries the accumulated error flags.
//  text - unescaped text that precedes the first tag of this level.
//  tags - tags found on this level, in document order.
//
typedef struct {
  ib_token_t    last;
  str_builder_t text;
  kit_da_xml_t  tags;
} kit_xml_intermediate_t;

//  Initialize `dst` with `alloc` and fill it with a copy of `str` in
//  which XML entity and character references are replaced by the
//  characters they stand for.
//
//  Supported references: &lt; &gt; &amp; &quot; &apos;, hexadecimal
//  (&#x41;) and decimal (&#65;) character references whose value is
//  in the range 1..255.
//
//  Returns KIT_OK on success, KIT_ERROR_INTERNAL on invalid
//  arguments, KIT_ERROR_OUT_OF_MEMORY if the destination could not be
//  sized, and KIT_PARSING_FAILED on a malformed or unsupported
//  reference. On KIT_PARSING_FAILED `dst` has already been destroyed.
//
static s32 kit_xml_alloc_and_unescape_(str_builder_t   *dst,
                                       str_t            str,
                                       kit_allocator_t *alloc) {
  assert(dst != NULL);
  assert(str.size == 0 || str.values != NULL);

  if (dst == NULL)
    return KIT_ERROR_INTERNAL;
  if (str.size != 0 && str.values == NULL)
    return KIT_ERROR_INTERNAL;

  //  The unescaped text is never longer than the source, so
  //  reserving str.size elements up front is always sufficient.
  //
  DA_INIT(*dst, str.size, alloc);

  if (dst->size != str.size)
    return KIT_ERROR_OUT_OF_MEMORY;

  dst->size = 0;

  for (i64 i = 0; i < str.size; i++) {
    if (str.values[i] != '&') {
      dst->values[dst->size++] = str.values[i];
      continue;
    }

    //  Find the terminating ';'. After the loop, `n` is the offset of
    //  ';' relative to '&', so the whole reference is n + 1
    //  characters long.
    //
    i64 n = 1;
    while (i + n < str.size && str.values[i + n] != ';') n++;

    if (i + n >= str.size) {
      //  Unterminated reference.
      DA_DESTROY(*dst);
      return KIT_PARSING_FAILED;
    }

    if (n == 3 && memcmp(str.values + i, "&lt;", 4) == 0)
      dst->values[dst->size++] = '<';
    else if (n == 3 && memcmp(str.values + i, "&gt;", 4) == 0)
      dst->values[dst->size++] = '>';
    else if (n == 4 && memcmp(str.values + i, "&amp;", 5) == 0)
      dst->values[dst->size++] = '&';
    else if (n == 5 && memcmp(str.values + i, "&quot;", 6) == 0)
      dst->values[dst->size++] = '"';
    else if (n == 5 && memcmp(str.values + i, "&apos;", 6) == 0)
      dst->values[dst->size++] = '\'';
    else if (n - 3 <= 8 && str.values[i + 1] == '#' &&
             str.values[i + 2] == 'x') {
      //  hex encoding
      //
      //  Digits occupy [i + 3, i + n); there are n - 3 of them, at
      //  most 8, so the value always fits into 64 bits.
      //

      u64 x = 0;

      for (i64 k = 0; k < n - 3; k++) {
        c8 c = str.values[i + 3 + k];
        x <<= 4;
        if (c >= '0' && c <= '9')
          x |= (u64) (c - '0');
        else if (c >= 'a' && c <= 'f')
          x |= (u64) (10 + (c - 'a'));
        else if (c >= 'A' && c <= 'F')
          x |= (u64) (10 + (c - 'A'));
        else {
          x = 0;
          break;
        }
      }

      if (x == 0 || x > 255u) {
        //  TODO
        //  UTF-8 encoding

        DA_DESTROY(*dst);
        return KIT_PARSING_FAILED;
      }

      dst->values[dst->size++] = (c8) x;
    } else if (n - 2 <= 19 && str.values[i + 1] == '#') {
      //  dec encoding
      //
      //  Digits occupy [i + 2, i + n); there are n - 2 of them, at
      //  most 19, so the value cannot overflow 64 bits.
      //

      u64 x = 0;

      for (i64 k = 0; k < n - 2; k++) {
        c8 c = str.values[i + 2 + k];
        x *= 10;
        if (c >= '0' && c <= '9')
          x += (u64) (c - '0');
        else {
          x = 0;
          break;
        }
      }

      if (x == 0 || x > 255u) {
        //  TODO
        //  UTF-8 encoding

        DA_DESTROY(*dst);
        return KIT_PARSING_FAILED;
      }

      dst->values[dst->size++] = (c8) x;
    } else {
      //  Unknown entity.
      DA_DESTROY(*dst);
      return KIT_PARSING_FAILED;
    }

    //  Skip the rest of the reference; the loop increment steps over
    //  the ';'.
    i += n;
  }

  return KIT_OK;
}

//  Read character data starting at `begin` up to the next '<' that
//  does not open a comment, and store the raw (still escaped) text
//  in `dst`. Comments `<!-- ... -->` are skipped; text around them is
//  concatenated.
//
//  `dst` is resized, so its previous contents are replaced.
//  Returns the last token read; its status gets
//  KIT_ERROR_OUT_OF_MEMORY set if `dst` could not be resized.
//
static ib_token_t kit_xml_parse_text_(ib_token_t     begin,
                                      str_builder_t *dst) {
  ib_token_t last = ib_until(begin, SZ("<"));

  DA_RESIZE(*dst, last.size);

  assert(dst->size == last.size);
  if (dst->size != last.size)
    last.status |= KIT_ERROR_OUT_OF_MEMORY;
  else if (last.size > 0)
    memcpy(dst->values, ib_str(last).values, last.size);

  for (;;) {
    ib_token_t comment_open = ib_exact(last, SZ("<!--"));
    if (comment_open.status != KIT_OK)
      break;

    ib_token_t comment_text  = ib_until(comment_open, SZ("-->"));
    ib_token_t comment_close = ib_exact(comment_text, SZ("-->"));
    ib_token_t next_text     = ib_until(comment_close, SZ("<"));

    if (next_text.status == KIT_OK && next_text.size > 0) {
      //  Append the text that follows the comment.
      i64 n = dst->size;
      DA_RESIZE(*dst, n + next_text.size);

      assert(dst->size == n + next_text.size);
      if (dst->size != n + next_text.size)
        next_text.status |= KIT_ERROR_OUT_OF_MEMORY;
      else
        memcpy(dst->values + n, ib_str(next_text).values,
               ib_str(next_text).size);
    }

    last = next_text;
  }

  return last;
}

//  Parse a quoted string (either "..." or '...') starting at `begin`.
//  The token holding the text between the quotes is written to
//  `*value`; the returned token is the closing quote.
//
//  If `value` is NULL, KIT_ERROR_INTERNAL is set on the returned
//  token's status.
//
static ib_token_t kit_xml_parse_string_(ib_token_t  begin,
                                        ib_token_t *value) {
  assert(value != NULL);

  if (value == NULL) {
    begin.status |= KIT_ERROR_INTERNAL;
    return begin;
  }

  ib_token_t quotes_open = ib_exact(begin, SZ("\""));
  ib_token_t apostr_open = ib_exact(begin, SZ("'"));

  ib_token_t open = quotes_open.status == KIT_OK ? quotes_open
                                                 : apostr_open;

  //  The closing delimiter must be the same character as the opening
  //  one.
  *value           = ib_until(open, ib_str(open));
  ib_token_t close = ib_exact(*value, ib_str(open));

  return close;
}

//  Parse one nesting level: leading text followed by any number of
//  sibling tags, stopping at a closing tag `</` or at anything that
//  is not a tag. Nested content is parsed recursively.
//
//  On failure all tags collected on this level are destroyed and the
//  error flags are reported through `last.status` of the result.
//
static kit_xml_intermediate_t kit_xml_parse_buf_(
    ib_token_t begin, kit_allocator_t *alloc) {
  kit_xml_intermediate_t res;
  memset(&res, 0, sizeof res);

  ib_token_t last, spaces;
  memset(&last, 0, sizeof last);
  memset(&spaces, 0, sizeof spaces);

  //  Scratch buffers for raw (escaped) text; unescaped copies are
  //  made when the text is stored.
  str_builder_t tag_text_string;
  str_builder_t tag_tail_string;
  DA_INIT(tag_text_string, 0, alloc);
  DA_INIT(tag_tail_string, 0, alloc);

  ib_token_t tag_text = kit_xml_parse_text_(begin, &tag_text_string);
  last                = tag_text;

  DA_INIT(res.tags, 0, alloc);

  for (;;) {
    //  A closing tag belongs to the parent level.
    ib_token_t tagend_open = ib_exact(last, SZ("</"));
    if (tagend_open.status == KIT_OK)
      break;

    ib_token_t tag_open = ib_exact(last, SZ("<"));
    if (tag_open.status != KIT_OK)
      break;

    xml_t tag;
    memset(&tag, 0, sizeof tag);

    ib_token_t decl_open = ib_exact(tag_open, SZ("?"));
    if (decl_open.status == KIT_OK) {
      tag.is_declaration = 1;
      last               = decl_open;
    } else
      last = tag_open;

    spaces              = ib_any(last, SZ(" \t\r\n"));
    ib_token_t tag_name = ib_none(spaces, SZ(" \t\r\n/>"));

    DA_INIT(tag.properties, 0, alloc);

    last = tag_name;

    //  Properties: name = "value" pairs.
    for (;;) {
      spaces              = ib_any(last, SZ(" \t\r\n"));
      ib_token_t property = ib_none(spaces, SZ(" \t\r\n=?/>"));

      if (property.status != KIT_OK || property.size == 0)
        break;

      spaces            = ib_any(property, SZ(" \t\r\n"));
      ib_token_t equals = ib_exact(spaces, SZ("="));
      spaces            = ib_any(equals, SZ(" \t\r\n"));

      ib_token_t value;
      last = kit_xml_parse_string_(spaces, &value);

      if (last.status == KIT_OK) {
        i64 n = tag.properties.size;
        DA_RESIZE(tag.properties, n + 1);

        assert(tag.properties.size == n + 1);
        if (tag.properties.size != n + 1) {
          last.status |= KIT_ERROR_OUT_OF_MEMORY;
          DA_DESTROY(tag.properties);
        } else {
          last.status |= kit_xml_alloc_and_unescape_(
              &tag.properties.values[n].name, ib_str(property),
              alloc);
          last.status |= kit_xml_alloc_and_unescape_(
              &tag.properties.values[n].value, ib_str(value), alloc);
        }
      }
    }

    if (tag.is_declaration) {
      ib_token_t tag_decl_close = ib_exact(spaces, SZ("?>"));
      last                      = tag_decl_close;

      DA_INIT(tag.text, 0, alloc);
      DA_INIT(tag.children, 0, alloc);
    } else {
      ib_token_t tag_close       = ib_exact(spaces, SZ(">"));
      ib_token_t tag_close_empty = ib_exact(spaces, SZ("/>"));

      if (tag_close.status == KIT_OK) {
        //  Tag with content: parse the nested level, then expect the
        //  matching closing tag.
        kit_xml_intermediate_t im = kit_xml_parse_buf_(tag_close,
                                                       alloc);

        tag.text     = im.text;
        tag.children = im.tags;

        tagend_open = ib_exact(im.last, SZ("</"));
        spaces      = ib_any(tagend_open, SZ(" \t\r\n"));
        ib_token_t tagend_name = ib_exact(spaces, ib_str(tag_name));
        spaces = ib_any(tagend_name, SZ(" \t\r\n"));
        ib_token_t tagend_close = ib_exact(spaces, SZ(">"));

        last = tagend_close;
      } else if (tag_close_empty.status == KIT_OK) {
        //  Self-closing tag.
        last = tag_close_empty;

        DA_INIT(tag.text, 0, alloc);
        DA_INIT(tag.children, 0, alloc);
      } else
        last.status |= KIT_PARSING_FAILED;
    }

    ib_token_t tag_tail = kit_xml_parse_text_(last, &tag_tail_string);
    last                = tag_tail;

    if (last.status == KIT_OK) {
      i64 n = res.tags.size;
      DA_RESIZE(res.tags, n + 1);

      assert(res.tags.size == n + 1);
      if (res.tags.size != n + 1) {
        last.status |= KIT_ERROR_OUT_OF_MEMORY;
        xml_destroy(&tag);
      } else {
        last.status |= kit_xml_alloc_and_unescape_(
            &tag.tag, ib_str(tag_name), alloc);
        last.status |= kit_xml_alloc_and_unescape_(
            &tag.tail, WRAP_STR(tag_tail_string), alloc);

        //  The tag is stored even if unescaping failed; the cleanup
        //  below releases it together with its siblings.
        res.tags.values[n] = tag;
      }
    } else
      xml_destroy(&tag);
  }

  if (last.status == KIT_OK)
    last.status |= kit_xml_alloc_and_unescape_(
        &res.text, WRAP_STR(tag_text_string), alloc);

  //  Cleanup runs after the unescape attempt so that a failure there
  //  does not leak the collected tags.
  if (last.status != KIT_OK) {
    for (i64 i = 0; i < res.tags.size; i++)
      xml_destroy(res.tags.values + i);
    DA_DESTROY(res.text);
    DA_DESTROY(res.tags);
  }

  DA_DESTROY(tag_text_string);
  DA_DESTROY(tag_tail_string);

  res.last = last;
  return res;
}

//  Parse an XML document from the input stream `is`.
//
//  If the document consists of exactly one top-level tag with no
//  leading text, that tag is returned as the root. Otherwise an
//  unnamed root is returned whose text and children are the
//  top-level text and tags.
//
//  `status` of the result is KIT_OK on success; on failure the `xml`
//  member is left zero-initialized.
//
kit_xml_parse_result_t kit_xml_parse(is_handle_t      is,
                                     kit_allocator_t *alloc) {
  input_buffer_t ib = ib_wrap(is, alloc);

  kit_xml_intermediate_t im = kit_xml_parse_buf_(ib_token(&ib),
                                                 alloc);

  kit_xml_parse_result_t res;
  memset(&res, 0, sizeof res);

  res.status = im.last.status;

  if (res.status != KIT_OK) {
    ib_destroy(&ib);
    return res;
  }

  if (im.text.size == 0 && im.tags.size == 1) {
    //  Ownership of the single tag moves to the result; only the
    //  containers are released.
    res.xml = im.tags.values[0];
    DA_DESTROY(im.text);
    DA_DESTROY(im.tags);
    ib_destroy(&ib);
    return res;
  }

  DA_INIT(res.xml.tag, 0, alloc);
  DA_INIT(res.xml.tail, 0, alloc);
  DA_INIT(res.xml.properties, 0, alloc);

  res.xml.text     = im.text;
  res.xml.children = im.tags;

  ib_destroy(&ib);
  return res;
}

//  Serialize an XML tree to text.
//
//  Not implemented yet: always returns a zero-initialized result
//  with status KIT_ERROR_NOT_IMPLEMENTED.
//
kit_xml_text_t kit_xml_print(kit_xml_t *xml, kit_allocator_t *alloc) {
  //  TODO
  //

  assert(xml != NULL);

  xml_text_t result;
  memset(&result, 0, sizeof result);

  result.status = KIT_ERROR_NOT_IMPLEMENTED;
  return result;
}

//  Append the text of `xml` to `buf`: its own text, then, for every
//  child in order, the child's text (recursively) followed by the
//  child's tail.
//
//  Returns KIT_OK, or KIT_ERROR_OUT_OF_MEMORY if `buf` could not be
//  resized.
//
static s32 kit_xml_append_text_(str_builder_t *buf, xml_t *xml) {
  assert(buf != NULL);
  assert(xml != NULL);

  i64 n = buf->size;
  DA_RESIZE(*buf, n + xml->text.size);

  assert(buf->size == n + xml->text.size);
  if (buf->size != n + xml->text.size)
    return KIT_ERROR_OUT_OF_MEMORY;

  if (xml->text.size > 0)
    memcpy(buf->values + n, xml->text.values, xml->text.size);

  for (i64 i = 0; i < xml->children.size; i++) {
    s32 s = kit_xml_append_text_(buf, xml->children.values + i);
    if (s != KIT_OK)
      return s;

    str_t tail = WRAP_STR(xml->children.values[i].tail);

    if (tail.size <= 0)
      continue;

    n = buf->size;
    DA_RESIZE(*buf, n + tail.size);

    assert(buf->size == n + tail.size);
    if (buf->size != n + tail.size)
      return KIT_ERROR_OUT_OF_MEMORY;

    if (tail.size > 0)
      memcpy(buf->values + n, tail.values, tail.size);
  }

  return KIT_OK;
}

//  Collect all text contained in `xml` and its descendants into a
//  newly initialized string builder.
//
//  `status` of the result is KIT_ERROR_INVALID_ARGUMENT if `xml` is
//  NULL, otherwise the status of the text collection.
//
kit_xml_text_t kit_xml_full_text(kit_xml_t       *xml,
                                 kit_allocator_t *alloc) {
  kit_xml_text_t res;
  res.status = KIT_OK;
  DA_INIT(res.text, 0, alloc);

  if (xml != NULL)
    res.status = kit_xml_append_text_(&res.text, xml);
  else
    res.status = KIT_ERROR_INVALID_ARGUMENT;

  return res;
}

//  Returns 1 if `xml` has a property called `name`, 0 otherwise
//  (including when `xml` is NULL).
//
b8 kit_xml_has_property(kit_xml_t *xml, kit_str_t name) {
  assert(xml != NULL);

  if (xml == NULL)
    return 0;

  for (i64 i = 0; i < xml->properties.size; i++)
    if (AR_EQUAL(xml->properties.values[i].name, name))
      return 1;

  return 0;
}

//  Returns the value of the property called `name`.
//
//  The property is expected to exist: a missing property triggers an
//  assertion, and with assertions disabled `str(0, NULL)` is
//  returned. Use kit_xml_has_property to check beforehand.
//
str_t kit_xml_property(kit_xml_t *xml, str_t name) {
  assert(xml != NULL);

  if (xml == NULL)
    return str(0, NULL);

  for (i64 i = 0; i < xml->properties.size; i++)
    if (AR_EQUAL(xml->properties.values[i].name, name))
      return WRAP_STR(xml->properties.values[i].value);

  assert(0);
  return str(0, NULL);
}

//  Release everything owned by `xml`: property names and values,
//  all children (recursively), and the tag's own strings and arrays.
//  Does nothing if `xml` is NULL.
//
void kit_xml_destroy(kit_xml_t *xml) {
  assert(xml != NULL);

  if (xml == NULL)
    return;

  for (i64 i = 0; i < xml->properties.size; i++) {
    DA_DESTROY(xml->properties.values[i].name);
    DA_DESTROY(xml->properties.values[i].value);
  }

  for (i64 i = 0; i < xml->children.size; i++)
    kit_xml_destroy(xml->children.values + i);

  DA_DESTROY(xml->tag);
  DA_DESTROY(xml->text);
  DA_DESTROY(xml->tail);

  DA_DESTROY(xml->properties);
  DA_DESTROY(xml->children);
}