From 12727806af641970a651b8f969cba33677ae7395 Mon Sep 17 00:00:00 2001 From: David Oberhollenzer Date: Tue, 20 Jun 2023 17:43:21 +0200 Subject: Add a helper to libutil for splitting token separated lines Signed-off-by: David Oberhollenzer --- include/util/parse.h | 37 ++++++++++++++ lib/util/Makemodule.am | 8 ++- lib/util/src/split_line.c | 109 ++++++++++++++++++++++++++++++++++++++++ lib/util/test/split_line.c | 120 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 272 insertions(+), 2 deletions(-) create mode 100644 lib/util/src/split_line.c create mode 100644 lib/util/test/split_line.c diff --git a/include/util/parse.h b/include/util/parse.h index bc635ad..8da07b1 100644 --- a/include/util/parse.h +++ b/include/util/parse.h @@ -15,6 +15,18 @@ enum { ISTREAM_LINE_SKIP_EMPTY = 0x04, }; +enum { + SPLIT_LINE_OK = 0, + SPLIT_LINE_ALLOC = -1, + SPLIT_LINE_UNMATCHED_QUOTE = -2, + SPLIT_LINE_ESCAPE = -3, +}; + +typedef struct { + size_t count; + char *args[]; +} split_line_t; + #ifdef __cplusplus extern "C" { #endif @@ -48,6 +60,31 @@ extern "C" { SQFS_INTERNAL int istream_get_line(sqfs_istream_t *strm, char **out, size_t *line_num, int flags); +/** + * @brief Split a line of special character separated tokens + * + * The underlying string is modified, replacing sequences of separator + * characters with a single null byte and compacting the string. Every + * occurrence of a terminated string is recorded in the returned structure. 
+ * + * @param line A modifiable buffer holding a line + * @param len The maximum length of the string in the buffer to process + * @param sep A string of valid separator characters + * @param out Returns the token list, free this with free() + * + * @return Zero on success, a negative SPLIT_LINE_* error code on failure + */ +SQFS_INTERNAL int split_line(char *line, size_t len, + const char *sep, split_line_t **out); + +/** + * @brief Remove the first N components of a tokenized line + * + * @param sep A successfully split up line + * @param count Number of components to remove from the front + */ +SQFS_INTERNAL void split_line_remove_front(split_line_t *sep, size_t count); + #ifdef __cplusplus } #endif diff --git a/lib/util/Makemodule.am b/lib/util/Makemodule.am index 1ca4802..6386066 100644 --- a/lib/util/Makemodule.am +++ b/lib/util/Makemodule.am @@ -9,7 +9,7 @@ libutil_a_SOURCES = include/util/util.h include/util/str_table.h \ lib/util/src/canonicalize_name.c lib/util/src/filename_sane.c \ lib/util/src/source_date_epoch.c lib/util/src/file_cmp.c \ lib/util/src/hex_decode.c lib/util/src/base64_decode.c \ - lib/util/src/get_line.c + lib/util/src/get_line.c lib/util/src/split_line.c libutil_a_CFLAGS = $(AM_CFLAGS) libutil_a_CPPFLAGS = $(AM_CPPFLAGS) @@ -80,10 +80,14 @@ test_base64_decode_LDADD = libutil.a libcompat.a test_get_line_SOURCES = lib/util/test/get_line.c test_get_line_LDADD = libutil.a libio.a libcompat.a +test_split_line_SOURCES = lib/util/test/split_line.c +test_split_line_LDADD = libutil.a libcompat.a + LIBUTIL_TESTS = \ test_str_table test_rbtree test_xxhash test_threadpool test_ismemzero \ test_canonicalize_name test_filename_sane test_filename_sane_w32 \ - test_sdate_epoch test_hex_decode test_base64_decode test_get_line + test_sdate_epoch test_hex_decode test_base64_decode test_get_line \ + test_split_line check_PROGRAMS += $(LIBUTIL_TESTS) TESTS += $(LIBUTIL_TESTS) diff --git a/lib/util/src/split_line.c b/lib/util/src/split_line.c new file mode 
100644 index 0000000..ede9964 --- /dev/null +++ b/lib/util/src/split_line.c @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* + * split_line.c + * + * Copyright (C) 2023 David Oberhollenzer + */ +#include "config.h" +#include "util/parse.h" + +#include +#include +#include + +static split_line_t *append_arg(split_line_t *in, char *arg) +{ + split_line_t *out = realloc(in, sizeof(*in) + + (in->count + 1) * sizeof(char *)); + + if (out == NULL) { + free(in); + return NULL; + } + + out->args[out->count++] = arg; + return out; +} + +static int is_sep(const char *sep, int c) +{ + return strchr(sep, c) != NULL && c != '\0'; +} + +int split_line(char *line, size_t len, const char *sep, split_line_t **out) +{ + split_line_t *split = calloc(1, sizeof(*split)); + char *src = line, *dst = line; + + if (split == NULL) + return SPLIT_LINE_ALLOC; + + while (len > 0 && is_sep(sep, *src)) { + ++src; + --len; + } + + while (len > 0 && *src != '\0') { + split = append_arg(split, dst); + if (split == NULL) + return SPLIT_LINE_ALLOC; + + if (*src == '"') { + ++src; + --len; + + while (len > 0 && *src != '\0' && *src != '"') { + if (src[0] == '\\') { + if (len < 2) + goto fail_esc; + if (src[1] != '"' && src[1] != '\\') + goto fail_esc; + + *(dst++) = src[1]; + src += 2; + len -= 2; + } else { + *(dst++) = *(src++); + --len; + } + } + + if (len == 0 || *src != '"') + goto fail_quote; + ++src; + --len; + } else { + while (len > 0 && !is_sep(sep, *src) && *src != '\0') { + *(dst++) = *(src++); + --len; + } + } + + while (len > 0 && is_sep(sep, *src)) { + ++src; + --len; + } + + *(dst++) = '\0'; + } + + *out = split; + return SPLIT_LINE_OK; +fail_esc: + free(split); + return SPLIT_LINE_ESCAPE; +fail_quote: + free(split); + return SPLIT_LINE_UNMATCHED_QUOTE; +} + +void split_line_remove_front(split_line_t *split, size_t count) +{ + if (count < split->count) { + for (size_t i = count, j = 0; i < split->count; ++i, ++j) + split->args[j] = split->args[i]; + split->count 
-= count; + } else { + split->count = 0; + } +} diff --git a/lib/util/test/split_line.c b/lib/util/test/split_line.c new file mode 100644 index 0000000..a0c32b8 --- /dev/null +++ b/lib/util/test/split_line.c @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-3.0-or-later */ +/* + * split_line.c + * + * Copyright (C) 2023 David Oberhollenzer + */ +#include "config.h" +#include "util/parse.h" +#include "util/test.h" +#include "compat.h" + +#include +#include + +static const struct { + const char *in; + size_t count; + const char **out; +} split[] = { + { "", 0, NULL }, + { " \t ", 0, NULL }, + { "foo", 1, (const char *[]){ "foo" } }, + { " foo ", 1, (const char *[]){ "foo" } }, + { "foo bar", 2, (const char *[]){ "foo", "bar" } }, + { " foo \t bar ", 2, (const char *[]){ "foo", "bar" } }, + { " foo \t bar baz ", 3, (const char *[]){ "foo", "bar", "baz" } }, + { " foo \t \" bar \" baz ", 3, + (const char *[]){ "foo", " bar ", "baz" } }, + { " foo \t \" \\\"bar \" baz ", 3, + (const char *[]){ "foo", " \"bar ", "baz" } }, + { " foo \t \" bar \\\\\" baz ", 3, + (const char *[]){ "foo", " bar \\", "baz" } }, +}; + +static const struct { + const char *in; + size_t orig; + size_t count; + size_t remain; + const char **out; +} drop_test[] = { + { "foo,bar,baz", 3, 0, 3, (const char *[]){ "foo", "bar", "baz" } }, + { "foo,bar,baz", 3, 1, 2, (const char *[]){ "bar", "baz" } }, + { "foo,bar,baz", 3, 2, 1, (const char *[]){ "baz" } }, + { "foo,bar,baz", 3, 3, 0, NULL }, + { "foo,bar,baz", 3, 4, 0, NULL }, + { "foo,bar,baz", 3, 100, 0, NULL }, +}; + +static void dump_components(split_line_t *sep) +{ + for (size_t i = 0; i < sep->count; ++i) + fprintf(stderr, "\t`%s`\n", sep->args[i]); +} + +int main(int argc, char **argv) +{ + (void)argc; (void)argv; + + for (size_t i = 0; i < sizeof(split) / sizeof(split[0]); ++i) { + split_line_t *sep; + char *copy; + int ret; + + copy = strdup(split[i].in); + TEST_NOT_NULL(copy); + + ret = split_line(copy, strlen(copy), " \t", &sep); + 
TEST_EQUAL_I(ret, 0); + TEST_NOT_NULL(sep); + + fprintf(stderr, "splitting `%s`\n", split[i].in); + dump_components(sep); + + TEST_EQUAL_UI(sep->count, split[i].count); + + for (size_t j = 0; j < sep->count; ++j) { + TEST_STR_EQUAL(sep->args[j], split[i].out[j]); + } + + free(sep); + free(copy); + } + + for (size_t i = 0; i < sizeof(drop_test) / sizeof(drop_test[0]); ++i) { + split_line_t *sep; + char *copy; + int ret; + + copy = strdup(drop_test[i].in); + TEST_NOT_NULL(copy); + + fprintf(stderr, "splitting `%s`\n", drop_test[i].in); + + ret = split_line(copy, strlen(copy), ",", &sep); + TEST_EQUAL_I(ret, 0); + TEST_NOT_NULL(sep); + + dump_components(sep); + + TEST_EQUAL_UI(sep->count, drop_test[i].orig); + + fprintf(stderr, "removing first %u components\n", + (unsigned int)drop_test[i].count); + + split_line_remove_front(sep, drop_test[i].count); + dump_components(sep); + + TEST_EQUAL_UI(sep->count, drop_test[i].remain); + + for (size_t j = 0; j < sep->count; ++j) { + TEST_STR_EQUAL(sep->args[j], drop_test[i].out[j]); + } + + free(sep); + free(copy); + } + + return EXIT_SUCCESS; +} -- cgit v1.2.3