diff options
Diffstat (limited to 'src/search.c')
| -rw-r--r-- | src/search.c | 86 |
1 files changed, 74 insertions, 12 deletions
diff --git a/src/search.c b/src/search.c index 601cc55..ce4a338 100644 --- a/src/search.c +++ b/src/search.c @@ -166,6 +166,53 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error) return TRUE; } +/** + * Escape an arbitrary byte string, so it can be embedded into an + * Advanced Regular Expression. + * + * This is similar to g_regex_escape_string() but in contrast + * the result can be used within character classes as well. + * + * @param str String to escape. It may contain null bytes; each one is written as a backslash followed by 000. + * @param len Length of string in bytes. + * @return Regular expression as a newly allocated (via g_malloc()) null-terminated string. + */ +static gchar * +teco_regex_escape(const gchar *str, gsize len) +{ + /* + * Every input byte yields at most 4 output bytes (plus one terminator byte), so considering + * the expected size of strings to escape, it's probably not worth calculating a minimal size. + */ + gchar *escaped = g_malloc(len*4+1); + gchar *p = escaped; + + /* + * This byte-wise loop works in UTF-8 as well since all of the escaped characters + * are ASCII and can therefore never be a continuation byte. + */ + while (len > 0) { + if (!*str) { + /* null byte becomes backslash + 000: there is no shorter __unambiguous__ way */ + *p++ = '\\'; + *p++ = '0'; + *p++ = '0'; + *p++ = '0'; + } else if (strchr("^.*+?(|[-]{\\$", *str)) { /* NOTE(review): closing parenthesis and closing brace are not in this set - confirm they are literal when unmatched in the regex flavour used */ + *p++ = '\\'; + *p++ = *str; + } else { + *p++ = *str; + } + + str++; + len--; + } + *p = '\0'; + + return escaped; +} + typedef enum { TECO_SEARCH_STATE_START, TECO_SEARCH_STATE_CTL, @@ -230,7 +277,7 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, return g_strdup(""); gsize len = codepage == SC_CP_UTF8 ? g_utf8_next_char(pattern->data) - pattern->data : 1; - gchar *escaped = g_regex_escape_string(pattern->data, len); + gchar *escaped = teco_regex_escape(pattern->data, len); pattern->data += len; pattern->len -= len; return escaped; @@ -325,7 +372,7 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, pattern->data += len; pattern->len -= len; *state = TECO_SEARCH_STATE_START; - return g_regex_escape_string(str.data ? : "", str.len); + return teco_regex_escape(str.data ? 
: "", str.len); } break; } @@ -378,14 +425,29 @@ teco_pattern2regexp(teco_string_t *pattern, teco_machine_qregspec_t *qreg_machin g_auto(teco_string_t) re = {NULL, 0}; do { - /* - * Previous character was caret. - * Make sure it is handled like a control character. - * This is necessary even though we have string building activated, - * to support constructs like ^Q^Q (typed with carets) in order to - * quote pattern matching characters. - */ if (state == TECO_SEARCH_STATE_CTL) { + if (*pattern->data == '~') { + /* rest of pattern is a regular expression */ + teco_string_append(&re, pattern->data+1, pattern->len-1); + /* + * FIXME: In terex, it actually could contain null bytes. + */ + if (teco_string_contains(re, '\0')) { + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED, + "Regular expression must not contain null-byte" + " - use \\0 instead"); + return NULL; + } + return g_steal_pointer(&re.data) ? : g_strdup(""); + } + + /* + * Previous character was caret. + * Make sure it is handled like a control character. + * This is necessary even though we have string building activated, + * to support constructs like ^Q^Q (typed with carets) in order to + * quote pattern matching characters. + */ *pattern->data = TECO_CTL_KEY(g_ascii_toupper(*pattern->data)); state = TECO_SEARCH_STATE_START; } @@ -444,7 +506,7 @@ teco_pattern2regexp(teco_string_t *pattern, teco_machine_qregspec_t *qreg_machin gsize len = codepage == SC_CP_UTF8 ? 
g_utf8_next_char(pattern->data) - pattern->data : 1; /* the allocation could theoretically be avoided by escaping char-wise */ - g_autofree gchar *escaped = g_regex_escape_string(pattern->data, len); + g_autofree gchar *escaped = teco_regex_escape(pattern->data, len); teco_string_append(&re, escaped, strlen(escaped)); pattern->data += len; pattern->len -= len; @@ -736,8 +798,8 @@ teco_do_search_backwards(regex_t *re, gsize from, gsize to, gint *count, GError gsize to_block = to-from; while (to_block > 0) { - gsize from_block = teco_search_block_size > 0 - ? MAX(0, to_block - teco_search_block_size) : 0; + gsize from_block = teco_search_block_size > 0 && to_block >= teco_search_block_size + ? to_block - teco_search_block_size : 0; /* how many bytes have been consumed in the current block */ gsize offset = 0; |
