From 2a050759ab621b87d0782cc8235907a1757b46cc Mon Sep 17 00:00:00 2001 From: Robin Haberkorn Date: Wed, 11 Sep 2024 14:30:24 +0200 Subject: fixed searches in single-byte encoded documents * while code is guaranteed to be in valid UTF-8, this cannot be said about the result of string building. * The search pattern can end up with invalid Unicode bytes even when searching on UTF-8 buffers, e.g. if ^EQq inserts garbage. There are currently no checks. * When searching on a raw buffer, it must be possible to search for arbitrary bytes (^EUq). Since teco_pattern2regexp() was always expecting clean UTF-8 input, this would sometimes skip over too many bytes and could even crash. * Instead, teco_pattern2regexp() now takes the target codepage into account. --- src/parser.c | 16 +++++---------- src/search.c | 57 ++++++++++++++++++++++++++++++++++++++---------------- src/string-utils.h | 13 +++++++++++++ tests/testsuite.at | 9 +-------- 4 files changed, 59 insertions(+), 36 deletions(-) diff --git a/src/parser.c b/src/parser.c index 321803a..3c37f81 100644 --- a/src/parser.c +++ b/src/parser.c @@ -158,17 +158,11 @@ gboolean teco_execute_macro(const gchar *macro, gsize macro_len, teco_qreg_table_t *qreg_table_locals, GError **error) { - /* - * Validate UTF-8, but accept null bytes. 
- * NOTE: there is g_utf8_validate_len() in Glib 2.60 - */ - const gchar *p = macro; - while (!g_utf8_validate(p, macro_len - (p - macro), &p) && !*p) - p++; - if (p - macro < macro_len) { - g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT, - "Invalid UTF-8 byte sequence at %" G_GSIZE_FORMAT, - p - macro); + const teco_string_t str = {(gchar *)macro, macro_len}; + + if (!teco_string_validate_utf8(&str)) { + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_CODEPOINT, + "Invalid UTF-8 byte sequence in macro"); return FALSE; } diff --git a/src/search.c b/src/search.c index 43a2936..01c598e 100644 --- a/src/search.c +++ b/src/search.c @@ -136,6 +136,7 @@ typedef enum { * The pointer is modified and always left after * the last character used, so it may point to the * terminating null byte after the call. + * @param codepage The codepage of pattern. * @param escape_default Whether to treat single characters * as classes or not. * @param error A GError. @@ -150,7 +151,7 @@ typedef enum { */ static gchar * teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, - gboolean escape_default, GError **error) + guint codepage, gboolean escape_default, GError **error) { while (pattern->len > 0) { switch (*state) { @@ -170,7 +171,8 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, */ if (!escape_default) return g_strdup(""); - gsize len = g_utf8_next_char(pattern->data) - pattern->data; + gsize len = codepage == SC_CP_UTF8 + ? 
g_utf8_next_char(pattern->data) - pattern->data : 1; gchar *escaped = g_regex_escape_string(pattern->data, len); pattern->data += len; pattern->len -= len; @@ -235,16 +237,26 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, case TECO_SEARCH_STATE_ANYQ: { teco_qreg_t *reg; - - /* FIXME: Once the parser is UTF-8, we need pass a code point here */ + gsize len; + gunichar chr; + + if (codepage == SC_CP_UTF8) { + len = g_utf8_next_char(pattern->data) - pattern->data; + chr = g_utf8_get_char(pattern->data); + } else { + len = 1; + chr = *pattern->data; + } switch (teco_machine_qregspec_input(teco_search_qreg_machine, - *pattern->data, &reg, NULL, error)) { + chr, &reg, NULL, error)) { case TECO_MACHINE_QREGSPEC_ERROR: return NULL; case TECO_MACHINE_QREGSPEC_MORE: /* incomplete, but consume byte */ - break; + pattern->data += len; + pattern->len -= len; + continue; case TECO_MACHINE_QREGSPEC_DONE: teco_machine_qregspec_reset(teco_search_qreg_machine); @@ -253,8 +265,8 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, if (!reg->vtable->get_string(reg, &str.data, &str.len, NULL, error)) return NULL; - pattern->data++; - pattern->len--; + pattern->data += len; + pattern->len -= len; *state = TECO_SEARCH_STATE_START; return g_regex_escape_string(str.data, str.len); } @@ -289,11 +301,11 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, * string argument) are currently not reported as errors. * * @param pattern The pattern to scan through. - * It must always be in UTF-8. * Modifies the pointer to point after the last * successfully scanned character, so it can be * called recursively. It may also point to the * terminating null byte after the call. + * @param codepage The codepage of pattern. * @param single_expr Whether to scan a single pattern * expression or an arbitrary sequence. * @param error A GError. 
@@ -301,7 +313,7 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, * Must be freed with g_free(). */ static gchar * -teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error) +teco_pattern2regexp(teco_string_t *pattern, guint codepage, gboolean single_expr, GError **error) { teco_search_state_t state = TECO_SEARCH_STATE_START; g_auto(teco_string_t) re = {NULL, 0}; @@ -313,7 +325,7 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error * as classes, so we do not convert them to regexp * classes unnecessarily. */ - g_autofree gchar *temp = teco_class2regexp(&state, pattern, FALSE, error); + g_autofree gchar *temp = teco_class2regexp(&state, pattern, codepage, FALSE, error); if (!temp) return NULL; @@ -338,7 +350,8 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error case TECO_CTL_KEY('X'): teco_string_append_c(&re, '.'); break; case TECO_CTL_KEY('N'): state = TECO_SEARCH_STATE_NOT; break; default: { - gsize len = g_utf8_next_char(pattern->data) - pattern->data; + gsize len = codepage == SC_CP_UTF8 + ? 
g_utf8_next_char(pattern->data) - pattern->data : 1; /* the allocation could theoretically be avoided by escaping char-wise */ g_autofree gchar *escaped = g_regex_escape_string(pattern->data, len); teco_string_append(&re, escaped, strlen(escaped)); @@ -351,7 +364,7 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error case TECO_SEARCH_STATE_NOT: { state = TECO_SEARCH_STATE_START; - g_autofree gchar *temp = teco_class2regexp(&state, pattern, TRUE, error); + g_autofree gchar *temp = teco_class2regexp(&state, pattern, codepage, TRUE, error); if (!temp) return NULL; if (!*temp) @@ -387,7 +400,7 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error case TECO_SEARCH_STATE_MANY: { /* consume exactly one pattern element */ - g_autofree gchar *temp = teco_pattern2regexp(pattern, TRUE, error); + g_autofree gchar *temp = teco_pattern2regexp(pattern, codepage, TRUE, error); if (!temp) return NULL; if (!*temp) @@ -413,7 +426,7 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error state = TECO_SEARCH_STATE_START; break; default: { - g_autofree gchar *temp = teco_pattern2regexp(pattern, TRUE, error); + g_autofree gchar *temp = teco_pattern2regexp(pattern, codepage, TRUE, error); if (!temp) return NULL; if (!*temp) @@ -551,9 +564,18 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs GRegexCompileFlags flags = G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_DOTALL; /* this is set in teco_state_search_initial() */ - if (ctx->expectstring.machine.codepage != SC_CP_UTF8) + if (ctx->expectstring.machine.codepage != SC_CP_UTF8) { /* single byte encoding */ flags |= G_REGEX_RAW; + } else if (!teco_string_validate_utf8(str)) { + /* + * While SciTECO code is always guaranteed to be in valid UTF-8, + * the result of string building may not be (e.g. if ^EQq inserts garbage). 
+ */ + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_CODEPOINT, + "Invalid UTF-8 byte sequence in search pattern"); + return FALSE; + } if (teco_current_doc_must_undo()) undo__teco_interface_ssm(SCI_SETSEL, @@ -568,8 +590,9 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs g_autoptr(GRegex) re = NULL; teco_string_t pattern = *str; + g_autofree gchar *re_pattern; /* NOTE: teco_pattern2regexp() modifies str pointer */ - g_autofree gchar *re_pattern = teco_pattern2regexp(&pattern, FALSE, error); + re_pattern = teco_pattern2regexp(&pattern, ctx->expectstring.machine.codepage, FALSE, error); if (!re_pattern) return FALSE; teco_machine_qregspec_reset(teco_search_qreg_machine); diff --git a/src/string-utils.h b/src/string-utils.h index 1b4957f..806140e 100644 --- a/src/string-utils.h +++ b/src/string-utils.h @@ -197,6 +197,19 @@ teco_string_rindex(const teco_string_t *str, gchar chr) const gchar *teco_string_last_occurrence(const teco_string_t *str, const gchar *chars); +/** + * Validate whether string consists exclusively of valid UTF-8, but accept null bytes. 
+ * @note there is g_utf8_validate_len() in Glib 2.60 + */ +static inline gboolean +teco_string_validate_utf8(const teco_string_t *str) +{ + const gchar *p = str->data; + while (!g_utf8_validate(p, str->len - (p - str->data), &p) && !*p) + p++; + return p - str->data == str->len; +} + /** @memberof teco_string_t */ static inline void teco_string_clear(teco_string_t *str) diff --git a/tests/testsuite.at b/tests/testsuite.at index 0733d2a..0c7612a 100644 --- a/tests/testsuite.at +++ b/tests/testsuite.at @@ -85,6 +85,7 @@ AT_CHECK([$SCITECO -e "@EQa//0EE 1U*0EE 0:@EUa/f^@^@/ :Qa-4\"N(0/0)' Ga Z-4\"N(0 AT_CHECK([$SCITECO -e "0EE 129@I// -A-129\"N(0/0)' HXa @EQa// EE\"N(0/0)'"], 0, ignore, ignore) AT_CHECK([$SCITECO -8e "129@:^Ua// 0Qa-129\"N(0/0)'"], 0, ignore, ignore) AT_CHECK([$SCITECO -e "1EE 167Ua @I/^EUa/ .-1\"N(0/0)'"], 0, ignore, ignore) +AT_CHECK([$SCITECO -8e "194Ua Qa@I//J :@S/^EUa/\"F(0/0)'"], 0, ignore, ignore) AT_CLEANUP AT_SETUP([Unicode]) @@ -203,11 +204,3 @@ AT_SKIP_IF([case $host in *-*-*bsd* | *-*-darwin*) true;; *) false;; esac]) AT_CHECK([$SCITECO -e "@^Um{U.a Q.a-100000\"<%.aMm'} 0Mm"], 0, ignore, ignore) AT_XFAIL_IF(true) AT_CLEANUP - -AT_SETUP([Unicode glitches]) -# While TECO code must always be UTF-8, strings after string building -# can be in single-byte encodings as well. -# It must be possible to search for single bytes in single-byte encodings. -AT_CHECK([$SCITECO -8e "164Ua Ga@I//J :@S/^EUa/\"F(0/0)'"], 0, ignore, ignore) -AT_XFAIL_IF(true) -AT_CLEANUP -- cgit v1.2.3