diff options
Diffstat (limited to 'src/search.c')
-rw-r--r-- | src/search.c | 211 |
1 files changed, 143 insertions, 68 deletions
diff --git a/src/search.c b/src/search.c index 733eab9..0d04895 100644 --- a/src/search.c +++ b/src/search.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2023 Robin Haberkorn + * Copyright (C) 2012-2024 Robin Haberkorn * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -38,11 +38,8 @@ #include "search.h" typedef struct { - /* - * FIXME: Should perhaps all be teco_int_t? - */ - gint dot; - gint from, to; + gssize dot; + gssize from, to; gint count; teco_buffer_t *from_buffer, *to_buffer; @@ -63,6 +60,9 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error) if (ctx->mode > TECO_MODE_NORMAL) return TRUE; + teco_machine_stringbuilding_set_codepage(&ctx->expectstring.machine, + teco_interface_get_codepage()); + if (G_UNLIKELY(!teco_search_qreg_machine)) teco_search_qreg_machine = teco_machine_qregspec_new(TECO_QREG_REQUIRED, ctx->qreg_table_locals, ctx->parent.must_undo); @@ -79,16 +79,16 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error) return FALSE; if (v1 <= v2) { teco_search_parameters.count = 1; - teco_search_parameters.from = (gint)v1; - teco_search_parameters.to = (gint)v2; + teco_search_parameters.from = teco_interface_glyphs2bytes(v1); + teco_search_parameters.to = teco_interface_glyphs2bytes(v2); } else { teco_search_parameters.count = -1; - teco_search_parameters.from = (gint)v2; - teco_search_parameters.to = (gint)v1; + teco_search_parameters.from = teco_interface_glyphs2bytes(v2); + teco_search_parameters.to = teco_interface_glyphs2bytes(v1); } - if (!teco_validate_pos(teco_search_parameters.from) || - !teco_validate_pos(teco_search_parameters.to)) { + if (teco_search_parameters.from < 0 || + teco_search_parameters.to < 0) { /* * FIXME: In derived classes, the command name will * no longer be correct. @@ -114,24 +114,10 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error) return TRUE; } -static const gchar * -teco_regexp_escape_chr(gchar chr) -{ - static gchar escaped[] = {'\\', '\0', '\0', '\0'}; - - if (!chr) { - escaped[1] = 'c'; - escaped[2] = '@'; - return escaped; - } - - escaped[1] = chr; - escaped[2] = '\0'; - return g_ascii_isalnum(chr) ? escaped + 1 : escaped; -} - typedef enum { TECO_SEARCH_STATE_START, + TECO_SEARCH_STATE_CTL, + TECO_SEARCH_STATE_ESCAPE, TECO_SEARCH_STATE_NOT, TECO_SEARCH_STATE_CTL_E, TECO_SEARCH_STATE_ANYQ, @@ -153,6 +139,7 @@ typedef enum { * The pointer is modified and always left after * the last character used, so it may point to the * terminating null byte after the call. + * @param codepage The codepage of pattern. * @param escape_default Whether to treat single characters * as classes or not. * @param error A GError. @@ -161,10 +148,13 @@ typedef enum { * When a non-empty string is returned, the state has always * been reset to TECO_STATE_STATE_START. * Must be freed with g_free(). + * + * @fixme The allocations could be avoided by letting it append + * to the target regexp teco_string_t directly. */ static gchar * teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, - gboolean escape_default, GError **error) + guint codepage, gboolean escape_default, GError **error) { while (pattern->len > 0) { switch (*state) { @@ -184,8 +174,12 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, */ if (!escape_default) return g_strdup(""); - pattern->len--; - return g_strdup(teco_regexp_escape_chr(*pattern->data++)); + gsize len = codepage == SC_CP_UTF8 + ? g_utf8_next_char(pattern->data) - pattern->data : 1; + gchar *escaped = g_regex_escape_string(pattern->data, len); + pattern->data += len; + pattern->len -= len; + return escaped; } break; @@ -246,25 +240,36 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, case TECO_SEARCH_STATE_ANYQ: { teco_qreg_t *reg; - + gsize len; + gunichar chr; + + if (codepage == SC_CP_UTF8) { + len = g_utf8_next_char(pattern->data) - pattern->data; + chr = g_utf8_get_char(pattern->data); + } else { + len = 1; + chr = *pattern->data; + } switch (teco_machine_qregspec_input(teco_search_qreg_machine, - *pattern->data, ®, NULL, error)) { + chr, ®, NULL, error)) { case TECO_MACHINE_QREGSPEC_ERROR: return NULL; case TECO_MACHINE_QREGSPEC_MORE: /* incomplete, but consume byte */ - break; + pattern->data += len; + pattern->len -= len; + continue; case TECO_MACHINE_QREGSPEC_DONE: teco_machine_qregspec_reset(teco_search_qreg_machine); g_auto(teco_string_t) str = {NULL, 0}; - if (!reg->vtable->get_string(reg, &str.data, &str.len, error)) + if (!reg->vtable->get_string(reg, &str.data, &str.len, NULL, error)) return NULL; - pattern->data++; - pattern->len--; + pattern->data += len; + pattern->len -= len; *state = TECO_SEARCH_STATE_START; return g_regex_escape_string(str.data, str.len); } @@ -303,6 +308,7 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, * successfully scanned character, so it can be * called recursively. It may also point to the * terminating null byte after the call. + * @param codepage The codepage of pattern. * @param single_expr Whether to scan a single pattern * expression or an arbitrary sequence. * @param error A GError. @@ -310,19 +316,31 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern, * Must be freed with g_free(). */ static gchar * -teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error) +teco_pattern2regexp(teco_string_t *pattern, guint codepage, gboolean single_expr, GError **error) { teco_search_state_t state = TECO_SEARCH_STATE_START; g_auto(teco_string_t) re = {NULL, 0}; do { /* + * Previous character was caret. + * Make sure it is handled like a control character. + * This is necessary even though we have string building activated, + * to support constructs like ^Q^Q (typed with carets) in order to + * quote pattern matching characters. + */ + if (state == TECO_SEARCH_STATE_CTL) { + *pattern->data = TECO_CTL_KEY(g_ascii_toupper(*pattern->data)); + state = TECO_SEARCH_STATE_START; + } + + /* * First check whether it is a class. * This will not treat individual characters * as classes, so we do not convert them to regexp * classes unnecessarily. */ - g_autofree gchar *temp = teco_class2regexp(&state, pattern, FALSE, error); + g_autofree gchar *temp = teco_class2regexp(&state, pattern, codepage, FALSE, error); if (!temp) return NULL; @@ -344,18 +362,40 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error switch (state) { case TECO_SEARCH_STATE_START: switch (*pattern->data) { - case TECO_CTL_KEY('X'): teco_string_append_c(&re, '.'); break; - case TECO_CTL_KEY('N'): state = TECO_SEARCH_STATE_NOT; break; - default: { - const gchar *escaped = teco_regexp_escape_chr(*pattern->data); - teco_string_append(&re, escaped, strlen(escaped)); - } + case '^': + state = TECO_SEARCH_STATE_CTL; + break; + case TECO_CTL_KEY('Q'): + case TECO_CTL_KEY('R'): + state = TECO_SEARCH_STATE_ESCAPE; + break; + case TECO_CTL_KEY('X'): + teco_string_append_c(&re, '.'); + break; + case TECO_CTL_KEY('N'): + state = TECO_SEARCH_STATE_NOT; + break; + default: + state = TECO_SEARCH_STATE_ESCAPE; + continue; } break; + case TECO_SEARCH_STATE_ESCAPE: { + state = TECO_SEARCH_STATE_START; + gsize len = codepage == SC_CP_UTF8 + ? g_utf8_next_char(pattern->data) - pattern->data : 1; + /* the allocation could theoretically be avoided by escaping char-wise */ + g_autofree gchar *escaped = g_regex_escape_string(pattern->data, len); + teco_string_append(&re, escaped, strlen(escaped)); + pattern->data += len; + pattern->len -= len; + continue; + } + case TECO_SEARCH_STATE_NOT: { state = TECO_SEARCH_STATE_START; - g_autofree gchar *temp = teco_class2regexp(&state, pattern, TRUE, error); + g_autofree gchar *temp = teco_class2regexp(&state, pattern, codepage, TRUE, error); if (!temp) return NULL; if (!*temp) @@ -391,7 +431,7 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error case TECO_SEARCH_STATE_MANY: { /* consume exactly one pattern element */ - g_autofree gchar *temp = teco_pattern2regexp(pattern, TRUE, error); + g_autofree gchar *temp = teco_pattern2regexp(pattern, codepage, TRUE, error); if (!temp) return NULL; if (!*temp) @@ -417,7 +457,7 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error state = TECO_SEARCH_STATE_START; break; default: { - g_autofree gchar *temp = teco_pattern2regexp(pattern, TRUE, error); + g_autofree gchar *temp = teco_pattern2regexp(pattern, codepage, TRUE, error); if (!temp) return NULL; if (!*temp) @@ -454,16 +494,17 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error } static gboolean -teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error) +teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error) { g_autoptr(GMatchInfo) info = NULL; - const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETCHARACTERPOINTER, 0, 0); + /* NOTE: can return NULL pointer for completely new and empty documents */ + const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETRANGEPOINTER, from, to-from) ? : ""; GError *tmp_error = NULL; /* * NOTE: The return boolean does NOT signal whether an error was generated. */ - g_regex_match_full(re, buffer, (gssize)to, from, 0, &info, &tmp_error); + g_regex_match_full(re, buffer, to-from, 0, 0, &info, &tmp_error); if (tmp_error) { g_propagate_error(error, tmp_error); return FALSE; @@ -543,7 +584,7 @@ teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error) if (matched_from >= 0 && matched_to >= 0) /* match success */ - teco_interface_ssm(SCI_SETSEL, matched_from, matched_to); + teco_interface_ssm(SCI_SETSEL, from+matched_from, from+matched_to); return TRUE; } @@ -551,8 +592,22 @@ teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error) static gboolean teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gsize new_chars, GError **error) { - static const GRegexCompileFlags flags = G_REGEX_CASELESS | G_REGEX_MULTILINE | - G_REGEX_DOTALL | G_REGEX_RAW; + /* FIXME: Should G_REGEX_OPTIMIZE be added under certain circumstances? */ + GRegexCompileFlags flags = G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_DOTALL; + + /* this is set in teco_state_search_initial() */ + if (ctx->expectstring.machine.codepage != SC_CP_UTF8) { + /* single byte encoding */ + flags |= G_REGEX_RAW; + } else if (!teco_string_validate_utf8(str)) { + /* + * While SciTECO code is always guaranteed to be in valid UTF-8, + * the result of string building may not (eg. if ^EQq inserts garbage). + */ + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_CODEPOINT, + "Invalid UTF-8 byte sequence in search pattern"); + return FALSE; + } if (teco_current_doc_must_undo()) undo__teco_interface_ssm(SCI_SETSEL, @@ -567,8 +622,9 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs g_autoptr(GRegex) re = NULL; teco_string_t pattern = *str; + g_autofree gchar *re_pattern; /* NOTE: teco_pattern2regexp() modifies str pointer */ - g_autofree gchar *re_pattern = teco_pattern2regexp(&pattern, FALSE, error); + re_pattern = teco_pattern2regexp(&pattern, ctx->expectstring.machine.codepage, FALSE, error); if (!re_pattern) return FALSE; teco_machine_qregspec_reset(teco_search_qreg_machine); @@ -668,13 +724,15 @@ teco_state_search_done(teco_machine_main_t *ctx, const teco_string_t *str, GErro undo__teco_interface_ssm(SCI_SETANCHOR, anchor, 0); if (!search_reg->vtable->undo_set_string(search_reg, error) || - !search_reg->vtable->set_string(search_reg, str->data, str->len, error)) + !search_reg->vtable->set_string(search_reg, str->data, str->len, + teco_default_codepage(), error)) return NULL; teco_interface_ssm(SCI_SETANCHOR, anchor, 0); } else { g_auto(teco_string_t) search_str = {NULL, 0}; - if (!search_reg->vtable->get_string(search_reg, &search_str.data, &search_str.len, error) || + if (!search_reg->vtable->get_string(search_reg, &search_str.data, &search_str.len, + NULL, error) || !teco_state_search_process(ctx, &search_str, search_str.len, error)) return NULL; } @@ -890,12 +948,12 @@ teco_state_search_kill_done(teco_machine_main_t *ctx, const teco_string_t *str, if (teco_is_failure(search_state)) return &teco_state_start; - gint dot = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0); + sptr_t dot = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0); teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0); if (teco_search_parameters.dot < dot) { /* kill forwards */ - gint anchor = teco_interface_ssm(SCI_GETANCHOR, 0, 0); + sptr_t anchor = teco_interface_ssm(SCI_GETANCHOR, 0, 0); if (teco_current_doc_must_undo()) undo__teco_interface_ssm(SCI_GOTOPOS, dot, 0); @@ -903,18 +961,23 @@ teco_state_search_kill_done(teco_machine_main_t *ctx, const teco_string_t *str, teco_interface_ssm(SCI_DELETERANGE, teco_search_parameters.dot, anchor - teco_search_parameters.dot); + + /* NOTE: An undo action is not always created. */ + if (teco_current_doc_must_undo() && + teco_search_parameters.dot != anchor) + undo__teco_interface_ssm(SCI_UNDO, 0, 0); } else { /* kill backwards */ teco_interface_ssm(SCI_DELETERANGE, dot, teco_search_parameters.dot - dot); + + /* NOTE: An undo action is not always created. */ + if (teco_current_doc_must_undo() && + teco_search_parameters.dot != dot) + undo__teco_interface_ssm(SCI_UNDO, 0, 0); } teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0); teco_ring_dirtify(); - /* NOTE: An undo action is not always created. */ - if (teco_current_doc_must_undo() && - teco_search_parameters.dot != dot) - undo__teco_interface_ssm(SCI_UNDO, 0, 0); - return &teco_state_start; } @@ -981,11 +1044,20 @@ teco_state_search_delete_done(teco_machine_main_t *ctx, const teco_string_t *str */ TECO_DEFINE_STATE_SEARCH(teco_state_search_delete); +static gboolean +teco_state_replace_insert_initial(teco_machine_main_t *ctx, GError **error) +{ + if (ctx->mode == TECO_MODE_NORMAL) + teco_machine_stringbuilding_set_codepage(&ctx->expectstring.machine, + teco_interface_get_codepage()); + return TRUE; +} + /* * FIXME: Could be static */ TECO_DEFINE_STATE_INSERT(teco_state_replace_insert, - .initial_cb = NULL + .initial_cb = (teco_state_initial_cb_t)teco_state_replace_insert_initial ); static teco_state_t * @@ -1058,11 +1130,13 @@ teco_state_replace_default_insert_done_overwrite(teco_machine_main_t *ctx, const if (str->len > 0) { if (!replace_reg->vtable->undo_set_string(replace_reg, error) || - !replace_reg->vtable->set_string(replace_reg, str->data, str->len, error)) + !replace_reg->vtable->set_string(replace_reg, str->data, str->len, + teco_default_codepage(), error)) return NULL; } else { g_auto(teco_string_t) replace_str = {NULL, 0}; - if (!replace_reg->vtable->get_string(replace_reg, &replace_str.data, &replace_str.len, error) || + if (!replace_reg->vtable->get_string(replace_reg, &replace_str.data, &replace_str.len, + NULL, error) || (replace_str.len > 0 && !teco_state_insert_process(ctx, &replace_str, replace_str.len, error))) return NULL; } @@ -1089,7 +1163,8 @@ teco_state_replace_default_ignore_done(teco_machine_main_t *ctx, const teco_stri g_assert(replace_reg != NULL); if (!replace_reg->vtable->undo_set_string(replace_reg, error) || - !replace_reg->vtable->set_string(replace_reg, str->data, str->len, error)) + !replace_reg->vtable->set_string(replace_reg, str->data, str->len, + teco_default_codepage(), error)) return NULL; return &teco_state_start; |