diff options
-rw-r--r-- | source/kit/xml.c | 216 | ||||
-rw-r--r-- | source/kit/xml.h | 13 | ||||
-rw-r--r-- | source/tests/xml.test.c | 106 |
3 files changed, 272 insertions, 63 deletions
diff --git a/source/kit/xml.c b/source/kit/xml.c index d815e9b..1c8e9f6 100644 --- a/source/kit/xml.c +++ b/source/kit/xml.c @@ -9,17 +9,49 @@ typedef struct { kit_da_xml_t tags; } kit_xml_intermediate_t; -static kit_xml_intermediate_t kit_xml_parse_buf_( - ib_t begin, kit_allocator_t *alloc) { - kit_xml_intermediate_t res; - memset(&res, 0, sizeof res); +static kit_status_t kit_xml_unescape_(str_builder_t *str) { + assert(str != NULL); + + str_builder_t buf; + DA_INIT(buf, str->size, str->alloc); + buf.size = 0; + + for (i64 i = 0; i < str->size; i++) + if (str->values[i] != '&') + buf.values[buf.size++] = str->values[i]; + else { + i64 n = 1; + while (i + n < str->size && str->values[i + n] != ';') n++; + if (i + n >= str->size) { + DA_DESTROY(buf); + return KIT_ERROR_INTERNAL; + } + if (n == 3 && memcmp(str->values + i, "&lt;", 4) == 0) + buf.values[buf.size++] = '<'; + else if (n == 3 && memcmp(str->values + i, "&gt;", 4) == 0) + buf.values[buf.size++] = '>'; + else if (n == 4 && memcmp(str->values + i, "&amp;", 5) == 0) + buf.values[buf.size++] = '&'; + else if (n == 5 && memcmp(str->values + i, "&quot;", 6) == 0) + buf.values[buf.size++] = '"'; + else if (n == 5 && memcmp(str->values + i, "&apos;", 6) == 0) + buf.values[buf.size++] = '\''; + else { + DA_DESTROY(buf); + return KIT_ERROR_INTERNAL; + } + i += n; + } - ib_t last, spaces; - memset(&last, 0, sizeof last); - memset(&spaces, 0, sizeof spaces); + DA_DESTROY(*str); + *str = buf; - ib_t tag_text = ib_until(begin, SZ("<")); - last = ib_copy(tag_text); + return KIT_OK; +} + +static ib_t kit_xml_parse_text_(ib_t begin) { + ib_t text = ib_until(begin, SZ("<")); + ib_t last = ib_copy(text); for (;;) { ib_t comment_open = ib_exact(last, SZ("<!--")); @@ -34,13 +66,13 @@ static kit_xml_intermediate_t kit_xml_parse_buf_( ib_t next_text = ib_until(comment_close, SZ("<")); if (next_text.status == KIT_OK && next_text.data.size > 0) { - i64 n = tag_text.data.size; - DA_RESIZE(tag_text.data, n + next_text.data.size); + i64 n = 
text.data.size; + DA_RESIZE(text.data, n + next_text.data.size); - if (tag_text.data.size != n + next_text.data.size) + if (text.data.size != n + next_text.data.size) next_text.status = KIT_ERROR_BAD_ALLOC; else - memcpy(tag_text.data.values + n, next_text.data.values, + memcpy(text.data.values + n, next_text.data.values, next_text.data.size); } @@ -53,6 +85,58 @@ static kit_xml_intermediate_t kit_xml_parse_buf_( ib_destroy(next_text); } + // move + DA_DESTROY(last.data); + last.data = text.data; + memset(&text.data, 0, sizeof text.data); + + kit_status_t s = kit_xml_unescape_(&last.data); + if (s != KIT_OK) + last.status = s; + + ib_destroy(text); + + return last; +} + +static ib_t kit_xml_parse_string_(ib_t begin) { + ib_t quotes_open = ib_exact(begin, SZ("\"")); + ib_t apostr_open = ib_exact(begin, SZ("'")); + + ib_t open = quotes_open.status == KIT_OK ? quotes_open + : apostr_open; + + ib_t text = ib_until(open, WRAP_STR(open.data)); + ib_t close = ib_exact(text, WRAP_STR(open.data)); + + // move + DA_DESTROY(close.data); + close.data = text.data; + memset(&text.data, 0, sizeof text.data); + + kit_status_t s = kit_xml_unescape_(&close.data); + if (s == KIT_OK) + close.status = s; + + ib_destroy(quotes_open); + ib_destroy(apostr_open); + ib_destroy(text); + + return close; +} + +static kit_xml_intermediate_t kit_xml_parse_buf_( + ib_t begin, kit_allocator_t *alloc) { + kit_xml_intermediate_t res; + memset(&res, 0, sizeof res); + + ib_t last, spaces; + memset(&last, 0, sizeof last); + memset(&spaces, 0, sizeof spaces); + + ib_t tag_text = kit_xml_parse_text_(begin); + last = ib_copy(tag_text); + DA_INIT(res.tags, 0, alloc); for (;;) { @@ -105,14 +189,12 @@ static kit_xml_intermediate_t kit_xml_parse_buf_( spaces = ib_any(property, SZ(" \t\r\n")); ib_t equals = ib_exact(spaces, SZ("=")); ib_destroy(spaces); - spaces = ib_any(equals, SZ(" \t\r\n")); - ib_t value_open = ib_exact(spaces, SZ("\"")); + spaces = ib_any(equals, SZ(" \t\r\n")); + ib_t value = 
kit_xml_parse_string_(spaces); ib_destroy(spaces); - ib_t value_text = ib_until(value_open, SZ("\"")); - ib_t value_close = ib_exact(value_text, SZ("\"")); ib_destroy(last); - last = value_close; + last = ib_copy(value); if (last.status == KIT_OK) { i64 n = tag.properties.size; @@ -127,15 +209,14 @@ static kit_xml_intermediate_t kit_xml_parse_buf_( memset(&property.data, 0, sizeof property.data); // move - tag.properties.values[n].value = value_text.data; - memset(&value_text.data, 0, sizeof value_text.data); + tag.properties.values[n].value = value.data; + memset(&value.data, 0, sizeof value.data); } } ib_destroy(property); ib_destroy(equals); - ib_destroy(value_open); - ib_destroy(value_text); + ib_destroy(value); } spaces = ib_any(last, SZ(" \t\r\n")); @@ -187,43 +268,11 @@ static kit_xml_intermediate_t kit_xml_parse_buf_( ib_destroy(tag_close_empty); } - ib_t tag_tail = ib_until(last, SZ("<")); + ib_t tag_tail = kit_xml_parse_text_(last); ib_destroy(last); last = ib_copy(tag_tail); - for (;;) { - ib_t comment_open = ib_exact(last, SZ("<!--")); - - if (comment_open.status != KIT_OK) { - ib_destroy(comment_open); - break; - } - - ib_t comment_text = ib_until(comment_open, SZ("-->")); - ib_t comment_close = ib_exact(comment_text, SZ("-->")); - ib_t next_text = ib_until(comment_close, SZ("<")); - - if (next_text.status == KIT_OK && next_text.data.size > 0) { - i64 n = tag_tail.data.size; - DA_RESIZE(tag_tail.data, n + next_text.data.size); - - if (tag_tail.data.size != n + next_text.data.size) - next_text.status = KIT_ERROR_BAD_ALLOC; - else - memcpy(tag_tail.data.values + n, next_text.data.values, - next_text.data.size); - } - - ib_destroy(last); - last = ib_copy(next_text); - - ib_destroy(comment_open); - ib_destroy(comment_text); - ib_destroy(comment_close); - ib_destroy(next_text); - } - if (last.status == KIT_OK) { i64 n = res.tags.size; DA_RESIZE(res.tags, n + 1); @@ -299,17 +348,68 @@ kit_xml_parse_result_t kit_xml_parse(kit_is_handle_t is, return res; } 
-kit_xml_print_result_t kit_xml_print(kit_xml_t *xml, - kit_allocator_t *alloc) { +kit_xml_text_t kit_xml_print(kit_xml_t *xml, kit_allocator_t *alloc) { assert(xml != NULL); - kit_xml_print_result_t result; + xml_text_t result; memset(&result, 0, sizeof result); result.status = KIT_ERROR_NOT_IMPLEMENTED; return result; } +static kit_status_t kit_xml_append_text_(str_builder_t *buf, + xml_t *xml) { + assert(buf != NULL); + assert(xml != NULL); + + i64 n = buf->size; + DA_RESIZE(*buf, n + xml->text.size); + + assert(buf->size == n + xml->text.size); + if (buf->size != n + xml->text.size) + return KIT_ERROR_BAD_ALLOC; + + memcpy(buf->values + n, xml->text.values, xml->text.size); + + for (i64 i = 0; i < xml->children.size; i++) { + kit_status_t s = kit_xml_append_text_(buf, + xml->children.values + i); + if (s != KIT_OK) + return s; + + str_t tail = WRAP_STR(xml->children.values[i].tail); + + if (tail.size <= 0) + continue; + + n = buf->size; + DA_RESIZE(*buf, n + tail.size); + + assert(buf->size == n + tail.size); + if (buf->size != n + tail.size) + return KIT_ERROR_BAD_ALLOC; + + memcpy(buf->values + n, tail.values, tail.size); + } + + return KIT_OK; +} + +kit_xml_text_t kit_xml_full_text(kit_xml_t *xml, + kit_allocator_t *alloc) { + kit_xml_text_t res; + res.status = KIT_OK; + DA_INIT(res.text, 0, alloc); + + if (xml != NULL) + res.status = kit_xml_append_text_(&res.text, xml); + else + res.status = KIT_ERROR_INVALID_ARGUMENT; + + return res; +} + void kit_xml_destroy(kit_xml_t *xml) { assert(xml != NULL); if (xml == NULL) diff --git a/source/kit/xml.h b/source/kit/xml.h index 00dde52..6e04878 100644 --- a/source/kit/xml.h +++ b/source/kit/xml.h @@ -35,14 +35,15 @@ typedef struct { typedef struct { kit_status_t status; - kit_str_builder_t xml; -} kit_xml_print_result_t; + kit_str_builder_t text; +} kit_xml_text_t; kit_xml_parse_result_t kit_xml_parse(kit_is_handle_t is, kit_allocator_t *alloc); -kit_xml_print_result_t kit_xml_print(kit_xml_t *xml, - 
kit_allocator_t *alloc); -void kit_xml_destroy(kit_xml_t *xml); +kit_xml_text_t kit_xml_print(kit_xml_t *xml, kit_allocator_t *alloc); +kit_xml_text_t kit_xml_full_text(kit_xml_t *xml, + kit_allocator_t *alloc); +void kit_xml_destroy(kit_xml_t *xml); #ifdef __cplusplus } @@ -51,9 +52,11 @@ void kit_xml_destroy(kit_xml_t *xml); #ifndef KIT_DISABLE_SHORT_NAMES # define xml_parse kit_xml_parse # define xml_print kit_xml_print +# define xml_full_text kit_xml_full_text # define xml_destroy kit_xml_destroy # define xml_t kit_xml_t # define xml_parse_result_t kit_xml_parse_result_t +# define xml_text_t kit_xml_text_t #endif #endif diff --git a/source/tests/xml.test.c b/source/tests/xml.test.c index e477acb..2a0b114 100644 --- a/source/tests/xml.test.c +++ b/source/tests/xml.test.c @@ -161,6 +161,26 @@ TEST("xml parse child") { is_destroy(is); } +TEST("xml parse child with text and tail") { + is_handle_t is = IS_WRAP_STRING(SZ("<foo>text<bar /> tail</foo>")); + xml_parse_result_t res = xml_parse(is, NULL); + + REQUIRE_EQ(res.status, KIT_OK); + + if (res.status == KIT_OK) { + REQUIRE(AR_EQUAL(res.xml.tag, SZ("foo"))); + REQUIRE(AR_EQUAL(res.xml.text, SZ("text"))); + REQUIRE_EQ(res.xml.children.size, 1); + if (res.xml.children.size == 1) { + REQUIRE(AR_EQUAL(res.xml.children.values[0].tag, SZ("bar"))); + REQUIRE(AR_EQUAL(res.xml.children.values[0].tail, SZ(" tail"))); + } + xml_destroy(&res.xml); + } + + is_destroy(is); +} + TEST("xml parse declaration") { is_handle_t is = IS_WRAP_STRING(SZ("<?foo ?>")); xml_parse_result_t res = xml_parse(is, NULL); @@ -300,4 +320,90 @@ TEST("xml parse comment tail between text") { is_destroy(is); } +TEST("xml parse escaped text") { + is_handle_t is = IS_WRAP_STRING(SZ("&lt;foo&gt;")); + xml_parse_result_t res = xml_parse(is, NULL); + + REQUIRE_EQ(res.status, KIT_OK); + + if (res.status == KIT_OK) { + REQUIRE_EQ(res.xml.is_declaration, 0); + REQUIRE(AR_EQUAL(res.xml.text, SZ("<foo>"))); + xml_destroy(&res.xml); + } + + is_destroy(is); +} + 
+TEST("xml parse escaped quote property") { + is_handle_t is = IS_WRAP_STRING(SZ("<foo bar=\"&amp;&quot;\" />")); + xml_parse_result_t res = xml_parse(is, NULL); + + REQUIRE_EQ(res.status, KIT_OK); + + if (res.status == KIT_OK) { + REQUIRE_EQ(res.xml.is_declaration, 0); + REQUIRE(AR_EQUAL(res.xml.tag, SZ("foo"))); + REQUIRE_EQ(res.xml.properties.size, 1); + if (res.xml.properties.size == 1) { + REQUIRE(AR_EQUAL(res.xml.properties.values[0].name, SZ("bar"))); + REQUIRE( + AR_EQUAL(res.xml.properties.values[0].value, SZ("&\""))); + } + xml_destroy(&res.xml); + } + + is_destroy(is); +} + +TEST("xml parse escaped apostrophe property") { + is_handle_t is = IS_WRAP_STRING(SZ("<foo bar='&amp;&apos;' />")); + xml_parse_result_t res = xml_parse(is, NULL); + + REQUIRE_EQ(res.status, KIT_OK); + + if (res.status == KIT_OK) { + REQUIRE_EQ(res.xml.is_declaration, 0); + REQUIRE(AR_EQUAL(res.xml.tag, SZ("foo"))); + REQUIRE_EQ(res.xml.properties.size, 1); + if (res.xml.properties.size == 1) { + REQUIRE(AR_EQUAL(res.xml.properties.values[0].name, SZ("bar"))); + REQUIRE(AR_EQUAL(res.xml.properties.values[0].value, SZ("&'"))); + } + xml_destroy(&res.xml); + } + + is_destroy(is); +} + +TEST("xml parse invalid escape") { + is_handle_t is = IS_WRAP_STRING(SZ("&foobar;")); + xml_parse_result_t res = xml_parse(is, NULL); + + REQUIRE_EQ(res.status, KIT_ERROR_INTERNAL); + + if (res.status == KIT_OK) + xml_destroy(&res.xml); + + is_destroy(is); +} + +TEST("xml full text") { + is_handle_t is = IS_WRAP_STRING( + SZ("<tag>foo <a>text</a> bar <b>text</b> tail</tag>")); + xml_parse_result_t res = xml_parse(is, NULL); + + REQUIRE_EQ(res.status, KIT_OK); + + if (res.status == KIT_OK) { + xml_text_t text = xml_full_text(&res.xml, NULL); + REQUIRE_EQ(text.status, KIT_OK); + REQUIRE(AR_EQUAL(text.text, SZ("foo text bar text tail"))); + DA_DESTROY(text.text); + xml_destroy(&res.xml); + } + + is_destroy(is); +} + #undef KIT_TEST_FILE |