From 18c419ffb4e750c3c9ea8570cf18f3099267b1bb Mon Sep 17 00:00:00 2001 From: Mitya Selivanov Date: Tue, 19 Sep 2023 05:34:00 +0200 Subject: Update xml parsing --- source/kit/input_buffer.c | 134 ++++++++++++++++++++++- source/kit/input_buffer.h | 22 +++- source/kit/xml.c | 230 +++++++++++++++++++++++++++++++-------- source/tests/input_buffer.test.c | 8 +- source/tests/xml.test.c | 85 +++++++++++++++ 5 files changed, 423 insertions(+), 56 deletions(-) (limited to 'source') diff --git a/source/kit/input_buffer.c b/source/kit/input_buffer.c index 5081cc4..3ee4959 100644 --- a/source/kit/input_buffer.c +++ b/source/kit/input_buffer.c @@ -87,6 +87,23 @@ kit_ib_t kit_ib_wrap(kit_is_handle_t upstream, return buf; } +kit_ib_t kit_ib_copy(kit_ib_t buf) { + kit_ib_t next; + memset(&next, 0, sizeof next); + + if (buf.status != KIT_OK) { + next.status = buf.status; + return next; + } + + kit_buf_acquire_(buf.internal); + + next.offset = buf.offset; + next.internal = buf.internal; + + return next; +} + kit_ib_t kit_ib_read(kit_ib_t buf, i64 size) { kit_ib_t next; memset(&next, 0, sizeof next); @@ -116,6 +133,118 @@ kit_ib_t kit_ib_read(kit_ib_t buf, i64 size) { return next; } +kit_ib_t kit_ib_any(kit_ib_t buf, kit_str_t data) { + kit_ib_t next; + memset(&next, 0, sizeof next); + + if (buf.status != KIT_OK) { + next.status = buf.status; + return next; + } + + kit_buf_acquire_(buf.internal); + + DA_INIT(next.data, 0, kit_buf_alloc_(buf.internal)); + + i64 size = 0; + + for (;; ++size) { + kit_buf_adjust_(buf.internal, buf.offset + size + 1); + + DA_RESIZE(next.data, size + 1); + + assert(next.data.size == size + 1); + if (next.data.size != size + 1) { + next.status = KIT_ERROR_BAD_ALLOC; + return next; + } + + kit_str_t destination = { .size = 1, + .values = next.data.values + size }; + i64 n = kit_buf_read_(buf.internal, buf.offset + size, + destination); + + if (n != 1) + break; + + i8 found = 0; + + for (i64 i = 0; i < data.size; i++) + if (data.values[i] == destination.values[0]) { + found = 1; + break; + } + + if (!found) + break; + } + + next.offset = buf.offset + size; + next.internal = buf.internal; + + DA_RESIZE(next.data, size); + if (next.data.size != size) + next.status = KIT_ERROR_BAD_ALLOC; + + return next; +} + +kit_ib_t kit_ib_none(kit_ib_t buf, kit_str_t data) { + kit_ib_t next; + memset(&next, 0, sizeof next); + + if (buf.status != KIT_OK) { + next.status = buf.status; + return next; + } + + kit_buf_acquire_(buf.internal); + + DA_INIT(next.data, 0, kit_buf_alloc_(buf.internal)); + + i64 size = 0; + + for (;; ++size) { + kit_buf_adjust_(buf.internal, buf.offset + size + 1); + + DA_RESIZE(next.data, size + 1); + + assert(next.data.size == size + 1); + if (next.data.size != size + 1) { + next.status = KIT_ERROR_BAD_ALLOC; + return next; + } + + kit_str_t destination = { .size = 1, + .values = next.data.values + size }; + i64 n = kit_buf_read_(buf.internal, buf.offset + size, + destination); + + if (n != 1) + break; + + i8 found = 0; + + for (i64 i = 0; i < data.size; i++) + if (data.values[i] == destination.values[0]) { + found = 1; + break; + } + + if (found) + break; + } + + next.offset = buf.offset + size; + next.internal = buf.internal; + + DA_RESIZE(next.data, size); + if (next.data.size != size) + next.status = KIT_ERROR_BAD_ALLOC; + + return next; +} + kit_ib_t kit_ib_exact(kit_ib_t buf, kit_str_t data) { kit_ib_t res = kit_ib_read(buf, data.size); if (!AR_EQUAL(res.data, data)) @@ -177,7 +306,8 @@ kit_ib_t kit_ib_until(kit_ib_t buf, kit_str_t data) { } kit_ib_t kit_ib_while(kit_ib_t buf, - kit_ib_read_condition_fn condition) { + kit_ib_read_condition_fn condition, + void *context) { kit_ib_t next; memset(&next, 0, sizeof next); @@ -209,7 +339,7 @@ kit_ib_t kit_ib_while(kit_ib_t buf, destination); kit_str_t data = { .size = size + 1, .values = next.data.values }; - if (n != 1 || condition == NULL || condition(data) == 0) + if (n != 1 || condition == NULL || condition(data, context) == 0) break; } diff --git a/source/kit/input_buffer.h b/source/kit/input_buffer.h index e34b512..a40ec99 100644 --- a/source/kit/input_buffer.h +++ b/source/kit/input_buffer.h @@ -16,24 +16,38 @@ typedef struct { kit_str_builder_t data; } kit_ib_t; -typedef i8 (*kit_ib_read_condition_fn)(kit_str_t data); +typedef i8 (*kit_ib_read_condition_fn)(kit_str_t data, void *context); kit_ib_t kit_ib_wrap(kit_is_handle_t upstream, kit_allocator_t *alloc); +kit_ib_t kit_ib_copy(kit_ib_t buf); + kit_ib_t kit_ib_read(kit_ib_t buf, i64 size); +kit_ib_t kit_ib_any(kit_ib_t buf, kit_str_t data); + +kit_ib_t kit_ib_none(kit_ib_t buf, kit_str_t data); + kit_ib_t kit_ib_exact(kit_ib_t buf, kit_str_t data); kit_ib_t kit_ib_until(kit_ib_t buf, kit_str_t data); kit_ib_t kit_ib_while(kit_ib_t buf, - kit_ib_read_condition_fn condition); + kit_ib_read_condition_fn condition, + void *context); void kit_ib_destroy(kit_ib_t buf); #define KIT_IB_WRAP(upstream) kit_ib_wrap(upstream, NULL) +#define KIT_IB_SKIP(buf_, proc_, ...) \ + do { \ + kit_ib_t temp_buf_ = (buf_); \ + (buf_) = proc_((buf_), __VA_ARGS__); \ + kit_ib_destroy((temp_buf_)); \ + } while (0) + #ifdef __cplusplus } #endif @@ -42,13 +56,17 @@ void kit_ib_destroy(kit_ib_t buf); # define ib_t kit_ib_t # define ib_read_condition_fn kit_ib_read_condition_fn # define ib_wrap kit_ib_wrap +# define ib_copy kit_ib_copy # define ib_read kit_ib_read +# define ib_any kit_ib_any +# define ib_none kit_ib_none # define ib_exact kit_ib_exact # define ib_until kit_ib_until # define ib_while kit_ib_while # define ib_destroy kit_ib_destroy # define IB_WRAP KIT_IB_WRAP +# define IB_SKIP KIT_IB_SKIP #endif #endif diff --git a/source/kit/xml.c b/source/kit/xml.c index bb16192..2bef6d5 100644 --- a/source/kit/xml.c +++ b/source/kit/xml.c @@ -3,73 +3,197 @@ #include "input_buffer.h" #include -kit_xml_parse_result_t kit_xml_parse(kit_is_handle_t is, - kit_allocator_t *alloc) { - xml_parse_result_t res; +typedef struct { + ib_t last; + kit_xml_t xml; +} kit_xml_intermediate_t; + +static kit_xml_intermediate_t kit_xml_parse_buf_( + ib_t begin, kit_allocator_t *alloc) { + kit_xml_intermediate_t res; memset(&res, 0, sizeof res); - ib_t begin = ib_wrap(is, alloc); - ib_t tag_before = ib_until(begin, SZ("<")); - ib_t tag_open = ib_exact(tag_before, SZ("<")); - ib_t tag_name = ib_until(tag_open, SZ(">")); - ib_t tag_name_empty = ib_until(tag_open, SZ("/")); - -#define return_ \ - ib_destroy(begin); \ - ib_destroy(tag_before); \ - ib_destroy(tag_open); \ - ib_destroy(tag_name); \ - ib_destroy(tag_name_empty); \ - ib_destroy(tag_close); \ + ib_t last, spaces; + memset(&last, 0, sizeof last); + memset(&spaces, 0, sizeof spaces); + + ib_t tag_open = ib_exact(begin, SZ("<")); + ib_t tag_name = ib_none(tag_open, SZ(" \t\r\n/>")); + last = ib_copy(tag_name); + + DA_INIT(res.xml.properties, 0, alloc); + + for (;;) { + spaces = ib_any(last, SZ(" \t\r\n")); + ib_t property = ib_none(spaces, SZ(" \t\r\n=/>")); + ib_destroy(spaces); + + if (property.status != KIT_OK || property.data.size == 0) { + ib_destroy(property); + break; + } + + spaces = ib_any(property, SZ(" \t\r\n")); + ib_t equals = ib_exact(spaces, SZ("=")); + ib_destroy(spaces); + spaces = ib_any(equals, SZ(" \t\r\n")); + ib_t value_open = ib_exact(spaces, SZ("\"")); + ib_t value_text = ib_until(value_open, SZ("\"")); + ib_t value_close = ib_exact(value_text, SZ("\"")); + + if (value_close.status == KIT_OK) { + i64 n = res.xml.properties.size; + DA_RESIZE(res.xml.properties, n + 1); + + if (res.xml.properties.size != n + 1) { + res.last = value_close; + res.last.status = KIT_ERROR_BAD_ALLOC; + + ib_destroy(begin); + ib_destroy(tag_open); + ib_destroy(tag_name); + ib_destroy(last); + ib_destroy(spaces); + ib_destroy(property); + ib_destroy(equals); + ib_destroy(value_open); + ib_destroy(value_text); + + DA_DESTROY(res.xml.properties); + return res; + } + + // move + res.xml.properties.values[n].name = property.data; + memset(&property.data, 0, sizeof property.data); + + // move + res.xml.properties.values[n].value = value_text.data; + memset(&value_text.data, 0, sizeof value_text.data); + } + + ib_destroy(last); + last = ib_copy(value_close); + + ib_destroy(spaces); + ib_destroy(property); + ib_destroy(equals); + ib_destroy(value_open); + ib_destroy(value_text); + ib_destroy(value_close); + + if (value_close.status != KIT_OK) + break; + } + + spaces = ib_any(last, SZ(" \t\r\n")); + ib_t tag_close = ib_exact(spaces, SZ(">")); + ib_t tag_close_empty = ib_exact(spaces, SZ("/>")); + + ib_destroy(last); + ib_destroy(spaces); + +#define return_ \ + ib_destroy(begin); \ + ib_destroy(tag_open); \ + ib_destroy(tag_name); \ + ib_destroy(tag_close); \ + ib_destroy(tag_close_empty); \ return - if (tag_name_empty.offset < tag_name.offset) { - ib_t tag_close = ib_exact(tag_name_empty, SZ("/>")); + if (tag_close_empty.status == KIT_OK) { + ib_t tag_tail = ib_until(tag_close_empty, SZ("<")); + + if (tag_tail.status != KIT_OK) { + res.last = tag_tail; - if (tag_close.status != KIT_OK) { - res.status = KIT_ERROR_INTERNAL; + DA_DESTROY(res.xml.properties); return_ res; } // move - res.xml.tag = tag_name_empty.data; - memset(&tag_name_empty.data, 0, sizeof tag_name_empty.data); + res.xml.tag = tag_name.data; + memset(&tag_name.data, 0, sizeof tag_name.data); - while (res.xml.tag.size > 0 && - res.xml.tag.values[res.xml.tag.size - 1] == ' ') - --res.xml.tag.size; + // move + res.xml.tail = tag_tail.data; + memset(&tag_tail.data, 0, sizeof tag_tail.data); DA_INIT(res.xml.text, 0, alloc); - DA_INIT(res.xml.tail, 0, alloc); - DA_INIT(res.xml.properties, 0, alloc); DA_INIT(res.xml.children, 0, alloc); - res.status = KIT_OK; + res.last = tag_tail; return_ res; } - ib_t tag_close = ib_exact(tag_name, SZ(">")); - ib_t tag_text = ib_until(tag_close, SZ("<")); - ib_t tagend_open = ib_exact(tag_text, SZ("")); + ib_t tag_tail = ib_until(tagend_close, SZ("<")); #undef return_ -#define return_ \ - ib_destroy(begin); \ - ib_destroy(tag_before); \ - ib_destroy(tag_open); \ - ib_destroy(tag_name); \ - ib_destroy(tag_name_empty); \ - ib_destroy(tag_close); \ - ib_destroy(tag_text); \ - ib_destroy(tagend_open); \ - ib_destroy(tagend_name); \ - ib_destroy(tagend_close); \ +#define return_ \ + ib_destroy(begin); \ + ib_destroy(tag_open); \ + ib_destroy(tag_name); \ + ib_destroy(tag_close); \ + ib_destroy(tag_close_empty); \ + ib_destroy(tag_text); \ + ib_destroy(tagend_open); \ + ib_destroy(tagend_name); \ + ib_destroy(tagend_close); \ return - if (tagend_close.status != KIT_OK) { - res.status = KIT_ERROR_INTERNAL; + if (tag_tail.status != KIT_OK) { + res.last = tag_tail; + + DA_DESTROY(res.xml.properties); + DA_DESTROY(res.xml.children); return_ res; } @@ -81,15 +205,25 @@ kit_xml_parse_result_t kit_xml_parse(kit_is_handle_t is, res.xml.text = tag_text.data; memset(&tag_text.data, 0, sizeof tag_text.data); - DA_INIT(res.xml.tail, 0, alloc); - DA_INIT(res.xml.properties, 0, alloc); - DA_INIT(res.xml.children, 0, alloc); + // move + res.xml.tail = tag_tail.data; + memset(&tag_tail.data, 0, sizeof tag_tail.data); - res.status = KIT_OK; + res.last = tag_tail; return_ res; #undef return_ } +kit_xml_parse_result_t kit_xml_parse(kit_is_handle_t is, + kit_allocator_t *alloc) { + kit_xml_intermediate_t im = kit_xml_parse_buf_(ib_wrap(is, alloc), + alloc); + kit_xml_parse_result_t res = { .status = im.last.status, + .xml = im.xml }; + ib_destroy(im.last); + return res; +} + kit_xml_print_result_t kit_xml_print(kit_xml_t *xml, kit_allocator_t *alloc) { assert(xml != NULL); diff --git a/source/tests/input_buffer.test.c b/source/tests/input_buffer.test.c index 237d60d..89717d1 100644 --- a/source/tests/input_buffer.test.c +++ b/source/tests/input_buffer.test.c @@ -56,7 +56,7 @@ TEST("input buffer read twice") { is_destroy(in); } -static i8 is_integer_(str_t const data) { +static i8 is_integer_(str_t const data, void *_) { for (ptrdiff_t i = 0; i < data.size; i++) if (data.values[i] < '0' || data.values[i] > '9') return 0; @@ -69,7 +69,7 @@ TEST("input buffer read integer once") { is_handle_t in = IS_WRAP_STRING(text); ib_t first = IB_WRAP(in); - ib_t second = ib_while(first, is_integer_); + ib_t second = ib_while(first, is_integer_, NULL); REQUIRE(second.status == KIT_OK); REQUIRE(second.data.size == 5); @@ -87,9 +87,9 @@ TEST("input buffer read integer twice") { is_handle_t in = IS_WRAP_STRING(text); ib_t first = IB_WRAP(in); - ib_t second = ib_while(first, is_integer_); + ib_t second = ib_while(first, is_integer_, NULL); ib_t third = ib_read(second, 1); - ib_t fourth = ib_while(third, is_integer_); + ib_t fourth = ib_while(third, is_integer_, NULL); REQUIRE(fourth.status == KIT_OK); REQUIRE(second.data.size == 3); diff --git a/source/tests/xml.test.c b/source/tests/xml.test.c index 7d2151c..52ff7c0 100644 --- a/source/tests/xml.test.c +++ b/source/tests/xml.test.c @@ -58,4 +58,89 @@ TEST("xml parse empty tag") { is_destroy(is); } +TEST("xml parse tail") { + is_handle_t is = IS_WRAP_STRING(SZ(" bar")); + xml_parse_result_t res = xml_parse(is, NULL); + + REQUIRE_EQ(res.status, KIT_OK); + + if (res.status == KIT_OK) { + REQUIRE(AR_EQUAL(res.xml.tag, SZ("foo"))); + REQUIRE(AR_EQUAL(res.xml.tail, SZ(" bar"))); + xml_destroy(&res.xml); + } + + is_destroy(is); +} + +TEST("xml parse empty tail") { + is_handle_t is = IS_WRAP_STRING(SZ(" bar")); + xml_parse_result_t res = xml_parse(is, NULL); + + REQUIRE_EQ(res.status, KIT_OK); + + if (res.status == KIT_OK) { + REQUIRE(AR_EQUAL(res.xml.tag, SZ("foo"))); + REQUIRE(AR_EQUAL(res.xml.tail, SZ(" bar"))); + xml_destroy(&res.xml); + } + + is_destroy(is); +} + +TEST("xml parse property") { + is_handle_t is = IS_WRAP_STRING(SZ("")); + xml_parse_result_t res = xml_parse(is, NULL); + + REQUIRE_EQ(res.status, KIT_OK); + + if (res.status == KIT_OK) { + REQUIRE(AR_EQUAL(res.xml.tag, SZ("foo"))); + REQUIRE_EQ(res.xml.properties.size, 1); + if (res.xml.properties.size == 1) { + REQUIRE(AR_EQUAL(res.xml.properties.values[0].name, SZ("bar"))); + REQUIRE(AR_EQUAL(res.xml.properties.values[0].value, SZ("42"))); + } + xml_destroy(&res.xml); + } + + is_destroy(is); +} + +TEST("xml parse empty property") { + is_handle_t is = IS_WRAP_STRING(SZ("")); + xml_parse_result_t res = xml_parse(is, NULL); + + REQUIRE_EQ(res.status, KIT_OK); + + if (res.status == KIT_OK) { + REQUIRE(AR_EQUAL(res.xml.tag, SZ("foo"))); + REQUIRE_EQ(res.xml.properties.size, 1); + if (res.xml.properties.size == 1) { + REQUIRE(AR_EQUAL(res.xml.properties.values[0].name, SZ("bar"))); + REQUIRE(AR_EQUAL(res.xml.properties.values[0].value, SZ("42"))); + } + xml_destroy(&res.xml); + } + + is_destroy(is); +} + +TEST("xml parse child") { + is_handle_t is = IS_WRAP_STRING(SZ("")); + xml_parse_result_t res = xml_parse(is, NULL); + + REQUIRE_EQ(res.status, KIT_OK); + + if (res.status == KIT_OK) { + REQUIRE(AR_EQUAL(res.xml.tag, SZ("foo"))); + REQUIRE_EQ(res.xml.children.size, 1); + if (res.xml.children.size == 1) + REQUIRE(AR_EQUAL(res.xml.children.values[0].tag, SZ("bar"))); + xml_destroy(&res.xml); + } + + is_destroy(is); +} + #undef KIT_TEST_FILE -- cgit v1.2.3