diff options
Diffstat (limited to 'src/search.c')
-rw-r--r-- | src/search.c | 126 |
1 files changed, 60 insertions, 66 deletions
diff --git a/src/search.c b/src/search.c index 01c598e..81d2074 100644 --- a/src/search.c +++ b/src/search.c @@ -24,6 +24,13 @@ #include <glib.h> #include <glib/gprintf.h> +/* should always be Henry Spencer's version from contrib/hsrex */ +#define REGEX_STANDALONE +//#define REGEX_WCHAR +#include <regalone.h> +#include <regex.h> +G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(regex_t, regfree); + #include "sciteco.h" #include "string-utils.h" #include "expressions.h" @@ -463,53 +470,38 @@ teco_pattern2regexp(teco_string_t *pattern, guint codepage, gboolean single_expr } static gboolean -teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error) +teco_do_search(regex_t *re, gsize from, gsize to, gint *count, GError **error) { - g_autoptr(GMatchInfo) info = NULL; - const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETCHARACTERPOINTER, 0, 0); - GError *tmp_error = NULL; - - /* - * NOTE: The return boolean does NOT signal whether an error was generated. - */ - g_regex_match_full(re, buffer, (gssize)to, from, 0, &info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } + regmatch_t info = {.rm_so = from, .rm_eo = to}; + /* FIXME: avoid moving the gap here */ + const guchar *buffer = (const guchar *)teco_interface_ssm(SCI_GETCHARACTERPOINTER, 0, 0); gint matched_from = -1, matched_to = -1; if (*count >= 0) { - while (g_match_info_matches(info) && --(*count)) { - /* - * NOTE: The return boolean does NOT signal whether an error was generated. - */ - g_match_info_next(info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } - } - - if (!*count) + gint rc; + while ((rc = re_exec(re, buffer+from, to-from, NULL, 1, &info, REG_NOTEOL | REG_NOTBOL)) == REG_OKAY && --(*count)) + from += info.rm_eo; + if (rc == REG_OKAY) { /* successful */ - g_match_info_fetch_pos(info, 0, - &matched_from, &matched_to); + matched_from = from+info.rm_so; + matched_to = from+info.rm_eo; + } else if (rc != REG_NOMATCH) { + // FIXME: Use regerror() + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED, + "Error executing regular expression"); + return FALSE; + } } else { /* only keep the last `count' matches, in a circular stack */ - typedef struct { - gint from, to; - } teco_range_t; - - gsize matched_size = sizeof(teco_range_t) * -*count; + gsize matched_size = sizeof(regmatch_t) * -*count; /* * matched_size could overflow. * NOTE: Glib 2.48 has g_size_checked_mul() which uses * compiler intrinsics. */ - if (matched_size / sizeof(teco_range_t) != -*count) + if (matched_size / sizeof(regmatch_t) != -*count) /* guaranteed to fail either teco_memory_check() or g_malloc() */ matched_size = G_MAXSIZE; @@ -522,32 +514,29 @@ teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error) if (!teco_memory_check(matched_size, error)) return FALSE; - g_autofree teco_range_t *matched = g_malloc(matched_size); + g_autofree regmatch_t *matched = g_malloc(matched_size); gint matched_total = 0, i = 0; - while (g_match_info_matches(info)) { - g_match_info_fetch_pos(info, 0, - &matched[i].from, &matched[i].to); - - /* - * NOTE: The return boolean does NOT signal whether an error was generated. - */ - g_match_info_next(info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } - + gint rc; + while ((rc = re_exec(re, buffer+from, to-from, NULL, 1, &matched[i], REG_NOTEOL | REG_NOTBOL | REG_STARTEND)) == REG_OKAY) { + matched[i].rm_so += from; + matched[i].rm_eo += from; + from = matched[i].rm_eo; i = ++matched_total % -(*count); } *count = MIN(*count + matched_total, 0); - if (!*count) { - /* successful -> i points to stack bottom */ - matched_from = matched[i].from; - matched_to = matched[i].to; + if (rc != REG_NOMATCH) { + // FIXME: Use regerror() + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED, + "Error executing regular expression"); + return FALSE; } + + /* successful -> i points to stack bottom */ + matched_from = matched[i].rm_so; + matched_to = matched[i].rm_eo; } if (matched_from >= 0 && matched_to >= 0) @@ -560,14 +549,11 @@ teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error) static gboolean teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gsize new_chars, GError **error) { - /* FIXME: Should G_REGEX_OPTIMIZE be added under certain circumstances? */ - GRegexCompileFlags flags = G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_DOTALL; + gint flags = REG_EXTENDED | REG_ICASE; /* this is set in teco_state_search_initial() */ - if (ctx->expectstring.machine.codepage != SC_CP_UTF8) { - /* single byte encoding */ - flags |= G_REGEX_RAW; - } else if (!teco_string_validate_utf8(str)) { + if (ctx->expectstring.machine.codepage == SC_CP_UTF8 && + !teco_string_validate_utf8(str)) { /* * While SciTECO code is always guaranteed to be in valid UTF-8, * the result of string building may not (eg. if ^EQq inserts garbage). @@ -588,7 +574,7 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs !search_reg->vtable->set_integer(search_reg, TECO_FAILURE, error)) return FALSE; - g_autoptr(GRegex) re = NULL; + g_auto(regex_t) re = {0}; teco_string_t pattern = *str; g_autofree gchar *re_pattern; /* NOTE: teco_pattern2regexp() modifies str pointer */ @@ -602,10 +588,18 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs if (!*re_pattern) goto failure; /* - * FIXME: Should we propagate at least some of the errors? + * FIXME: We don't have to escape null characters in re_pattern. */ - re = g_regex_new(re_pattern, flags, 0, NULL); - if (!re) +#if 0 + gint rc = ctx->expectstring.machine.codepage == SC_CP_UTF8 + ? re_wcomp(&re, re_pattern, strlen(re_pattern), flags) + : re_comp(&re, re_pattern, strlen(re_pattern), flags); +#endif + // FIXME: Apparently this is the ASCII-only version, while re_wcomp() is the widechar version + // which expects UTF-32. + // This means that teco_pattern2regexp() would have to return an UTF-32 version. + gint rc = re_comp(&re, re_pattern, strlen(re_pattern), flags); + if (rc) goto failure; if (!teco_qreg_current && @@ -616,7 +610,7 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs gint count = teco_search_parameters.count; - if (!teco_do_search(re, teco_search_parameters.from, teco_search_parameters.to, &count, error)) + if (!teco_do_search(&re, teco_search_parameters.from, teco_search_parameters.to, &count, error)) return FALSE; if (teco_search_parameters.to_buffer && count) { @@ -631,12 +625,12 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs teco_buffer_edit(buffer); if (buffer == teco_search_parameters.to_buffer) { - if (!teco_do_search(re, 0, teco_search_parameters.dot, &count, error)) + if (!teco_do_search(&re, 0, teco_search_parameters.dot, &count, error)) return FALSE; break; } - if (!teco_do_search(re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), + if (!teco_do_search(&re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), &count, error)) return FALSE; } while (count); @@ -646,14 +640,14 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs teco_buffer_edit(buffer); if (buffer == teco_search_parameters.to_buffer) { - if (!teco_do_search(re, teco_search_parameters.dot, + if (!teco_do_search(&re, teco_search_parameters.dot, teco_interface_ssm(SCI_GETLENGTH, 0, 0), &count, error)) return FALSE; break; } - if (!teco_do_search(re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), + if (!teco_do_search(&re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), &count, error)) return FALSE; } while (count); |