diff options
| author | Robin Haberkorn <rhaberkorn@fmsbw.de> | 2026-06-28 13:44:41 +0200 |
|---|---|---|
| committer | Robin Haberkorn <rhaberkorn@fmsbw.de> | 2026-06-28 13:44:41 +0200 |
| commit | 7bd7bdad687e5f790afda6f0f22444f3a169a6b1 (patch) | |
| tree | 4cafda53c6a7aec2cd49b6b7dd3b2d488e462b0e | |
| parent | 0dfb113b47d958093e6ae086c9695e5be83b24b8 (diff) | |
fixed ^EGq (character class) pattern construct for embedded null bytes and `-`
This was using g_regex_escape_string() which always translates a null byte
to `\0`, which is ambiguous if followed by other digits, so a null byte followed
by a digit would result in a wrong regular expression.
Actually, the same could happen outside of character classes, i.e. `@S/^@1/` was also broken.
Also, g_regex_escape_string() does not escape `-`, so its result cannot be used in character classes.
This is fixed now in a new custom implementation teco_regex_escape().
Once we move to a custom terex lexer, we won't need any of this, of course,
unless we want to provide a regex-escaping string-building construct.
We are now completely free of GRegex.
| -rw-r--r-- | src/search.c | 53 | ||||
| -rw-r--r-- | tests/testsuite.at | 5 |
2 files changed, 55 insertions, 3 deletions
diff --git a/src/search.c b/src/search.c index 601cc55..c156589 100644 --- a/src/search.c +++ b/src/search.c @@ -166,6 +166,53 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error) return TRUE; } +/** + * Escape an arbitrary string, so it can be embedded into an + * Advanced Regular Expression. + * + * This is similar to g_regex_escape_string() but in contrast + * the result can be used within character classes as well. + * + * @param str String to escape. + * @param len Length of string. + * @return Regular expression as a newly allocated null-terminated string. + */ +static gchar * +teco_regex_escape(const gchar *str, gsize len) +{ + /* + * Considering the expected size of strings to escape, + * it's probably not worth to calculate a minimal size. + */ + gchar *escaped = g_malloc(len*4+1); + gchar *p = escaped; + + /* + * This works in UTF-8 as well since none of the escaped characters + * can be a continuation byte. + */ + while (len > 0) { + if (!*str) { + /* there is no shorter __unambiguous__ way */ + *p++ = '\\'; + *p++ = '0'; + *p++ = '0'; + *p++ = '0'; + } else if (strchr("^.*+?([-]{\\$", *str)) { + *p++ = '\\'; + *p++ = *str; + } else { + *p++ = *str; + } + + str++; + len--; + } + *p = '\0'; + + return escaped; +} + typedef enum { TECO_SEARCH_STATE_START, TECO_SEARCH_STATE_CTL, @@ -230,7 +277,7 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, return g_strdup(""); gsize len = codepage == SC_CP_UTF8 ? g_utf8_next_char(pattern->data) - pattern->data : 1; - gchar *escaped = g_regex_escape_string(pattern->data, len); + gchar *escaped = teco_regex_escape(pattern->data, len); pattern->data += len; pattern->len -= len; return escaped; @@ -325,7 +372,7 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, pattern->data += len; pattern->len -= len; *state = TECO_SEARCH_STATE_START; - return g_regex_escape_string(str.data ? : "", str.len); + return teco_regex_escape(str.data ? 
: "", str.len); } break; } @@ -444,7 +491,7 @@ teco_pattern2regexp(teco_string_t *pattern, teco_machine_qregspec_t *qreg_machin gsize len = codepage == SC_CP_UTF8 ? g_utf8_next_char(pattern->data) - pattern->data : 1; /* the allocation could theoretically be avoided by escaping char-wise */ - g_autofree gchar *escaped = g_regex_escape_string(pattern->data, len); + g_autofree gchar *escaped = teco_regex_escape(pattern->data, len); teco_string_append(&re, escaped, strlen(escaped)); pattern->data += len; pattern->len -= len; diff --git a/tests/testsuite.at b/tests/testsuite.at index a97e0f8..98425d1 100644 --- a/tests/testsuite.at +++ b/tests/testsuite.at @@ -515,6 +515,11 @@ AT_SETUP([Search for one of characters in uninitialized Q-Register]) TE_CHECK([[:@S/^EGa/"S(0/0)']], 0, ignore, ignore) AT_CLEANUP +AT_SETUP([Search for class with special characters]) +# Relevant when patterns are internally converted to regular expressions. +TE_CHECK([[![! @I/^@-]B/J ![! @EUc/^@1]A-C/ ::@S/^EM^EGc/"F(0/0)' ^S+3"N(0/0)']], 0, ignore, ignore) +AT_CLEANUP + AT_SETUP([Search accesses wrong Q-Register table]) TE_CHECK([[@^U.#xx/123/ @^Um{:@S/^EG.#xx/$} :Mm Mm]], 1, ignore, ignore) AT_CLEANUP |
