From ab0d97147d8c19eabc41b11698dff13cd04d67ae Mon Sep 17 00:00:00 2001 From: Robin Haberkorn Date: Sat, 14 Sep 2024 19:00:01 +0200 Subject: imported Henry Spencer's regex implementation from Tcl Source: github.com/garyhouston/hsrex * This version should be a Thompson NFA, using backtracking only for backreferences, so it should be much safer than PCRE (GRegex). Search times should be linear and there should be no way to cause stack overflows (unless we were to generate backreferences). * Importing the library makes sure we don't add another compile-time dependency. Also, we could implement our own regcomp() which translates directly from TECO patterns. * This is still WIP and currently only works with the ASCII version. The widechar version does not define re_comp() and re_exec(). * Apparently we can't have an ASCII and a widechar version at the same time, so we must build two libtool libraries and somehow mangle the names. * Ideally the widechar version would also work with UTF-8 strings. * An alternative might be to import the Gnulib regex module. How does it choose the encoding anyway? * Or we could just use Oniguruma - but this would add a new external library dependency. 
--- src/search.c | 126 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 60 insertions(+), 66 deletions(-) (limited to 'src/search.c') diff --git a/src/search.c b/src/search.c index 01c598e..81d2074 100644 --- a/src/search.c +++ b/src/search.c @@ -24,6 +24,13 @@ #include #include +/* should always be Henry Spencer's version from contrib/hsrex */ +#define REGEX_STANDALONE +//#define REGEX_WCHAR +#include +#include +G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(regex_t, regfree); + #include "sciteco.h" #include "string-utils.h" #include "expressions.h" @@ -463,53 +470,38 @@ teco_pattern2regexp(teco_string_t *pattern, guint codepage, gboolean single_expr } static gboolean -teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error) +teco_do_search(regex_t *re, gsize from, gsize to, gint *count, GError **error) { - g_autoptr(GMatchInfo) info = NULL; - const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETCHARACTERPOINTER, 0, 0); - GError *tmp_error = NULL; - - /* - * NOTE: The return boolean does NOT signal whether an error was generated. - */ - g_regex_match_full(re, buffer, (gssize)to, from, 0, &info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } + regmatch_t info = {.rm_so = from, .rm_eo = to}; + /* FIXME: avoid moving the gap here */ + const guchar *buffer = (const guchar *)teco_interface_ssm(SCI_GETCHARACTERPOINTER, 0, 0); gint matched_from = -1, matched_to = -1; if (*count >= 0) { - while (g_match_info_matches(info) && --(*count)) { - /* - * NOTE: The return boolean does NOT signal whether an error was generated. 
- */ - g_match_info_next(info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } - } - - if (!*count) + gint rc; + while ((rc = re_exec(re, buffer+from, to-from, NULL, 1, &info, REG_NOTEOL | REG_NOTBOL)) == REG_OKAY && --(*count)) + from += info.rm_eo; + if (rc == REG_OKAY) { /* successful */ - g_match_info_fetch_pos(info, 0, - &matched_from, &matched_to); + matched_from = from+info.rm_so; + matched_to = from+info.rm_eo; + } else if (rc != REG_NOMATCH) { + // FIXME: Use regerror() + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED, + "Error executing regular expression"); + return FALSE; + } } else { /* only keep the last `count' matches, in a circular stack */ - typedef struct { - gint from, to; - } teco_range_t; - - gsize matched_size = sizeof(teco_range_t) * -*count; + gsize matched_size = sizeof(regmatch_t) * -*count; /* * matched_size could overflow. * NOTE: Glib 2.48 has g_size_checked_mul() which uses * compiler intrinsics. */ - if (matched_size / sizeof(teco_range_t) != -*count) + if (matched_size / sizeof(regmatch_t) != -*count) /* guaranteed to fail either teco_memory_check() or g_malloc() */ matched_size = G_MAXSIZE; @@ -522,32 +514,29 @@ teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error) if (!teco_memory_check(matched_size, error)) return FALSE; - g_autofree teco_range_t *matched = g_malloc(matched_size); + g_autofree regmatch_t *matched = g_malloc(matched_size); gint matched_total = 0, i = 0; - while (g_match_info_matches(info)) { - g_match_info_fetch_pos(info, 0, - &matched[i].from, &matched[i].to); - - /* - * NOTE: The return boolean does NOT signal whether an error was generated. 
- */ - g_match_info_next(info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } - + gint rc; + while ((rc = re_exec(re, buffer+from, to-from, NULL, 1, &matched[i], REG_NOTEOL | REG_NOTBOL | REG_STARTEND)) == REG_OKAY) { + matched[i].rm_so += from; + matched[i].rm_eo += from; + from = matched[i].rm_eo; i = ++matched_total % -(*count); } *count = MIN(*count + matched_total, 0); - if (!*count) { - /* successful -> i points to stack bottom */ - matched_from = matched[i].from; - matched_to = matched[i].to; + if (rc != REG_NOMATCH) { + // FIXME: Use regerror() + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED, + "Error executing regular expression"); + return FALSE; } + + /* successful -> i points to stack bottom */ + matched_from = matched[i].rm_so; + matched_to = matched[i].rm_eo; } if (matched_from >= 0 && matched_to >= 0) @@ -560,14 +549,11 @@ teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error) static gboolean teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gsize new_chars, GError **error) { - /* FIXME: Should G_REGEX_OPTIMIZE be added under certain circumstances? */ - GRegexCompileFlags flags = G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_DOTALL; + gint flags = REG_EXTENDED | REG_ICASE; /* this is set in teco_state_search_initial() */ - if (ctx->expectstring.machine.codepage != SC_CP_UTF8) { - /* single byte encoding */ - flags |= G_REGEX_RAW; - } else if (!teco_string_validate_utf8(str)) { + if (ctx->expectstring.machine.codepage == SC_CP_UTF8 && + !teco_string_validate_utf8(str)) { /* * While SciTECO code is always guaranteed to be in valid UTF-8, * the result of string building may not (eg. if ^EQq inserts garbage). 
@@ -588,7 +574,7 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs !search_reg->vtable->set_integer(search_reg, TECO_FAILURE, error)) return FALSE; - g_autoptr(GRegex) re = NULL; + g_auto(regex_t) re = {0}; teco_string_t pattern = *str; g_autofree gchar *re_pattern; /* NOTE: teco_pattern2regexp() modifies str pointer */ @@ -602,10 +588,18 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs if (!*re_pattern) goto failure; /* - * FIXME: Should we propagate at least some of the errors? + * FIXME: We don't have to escape null characters in re_pattern. */ - re = g_regex_new(re_pattern, flags, 0, NULL); - if (!re) +#if 0 + gint rc = ctx->expectstring.machine.codepage == SC_CP_UTF8 + ? re_wcomp(&re, re_pattern, strlen(re_pattern), flags) + : re_comp(&re, re_pattern, strlen(re_pattern), flags); +#endif + // FIXME: Apparently this is the ASCII-only version, while re_wcomp() is the widechar version + // which expects UTF-32. + // This means that teco_pattern2regexp() would have to return an UTF-32 version. 
+ gint rc = re_comp(&re, re_pattern, strlen(re_pattern), flags); + if (rc) goto failure; if (!teco_qreg_current && @@ -616,7 +610,7 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs gint count = teco_search_parameters.count; - if (!teco_do_search(re, teco_search_parameters.from, teco_search_parameters.to, &count, error)) + if (!teco_do_search(&re, teco_search_parameters.from, teco_search_parameters.to, &count, error)) return FALSE; if (teco_search_parameters.to_buffer && count) { @@ -631,12 +625,12 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs teco_buffer_edit(buffer); if (buffer == teco_search_parameters.to_buffer) { - if (!teco_do_search(re, 0, teco_search_parameters.dot, &count, error)) + if (!teco_do_search(&re, 0, teco_search_parameters.dot, &count, error)) return FALSE; break; } - if (!teco_do_search(re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), + if (!teco_do_search(&re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), &count, error)) return FALSE; } while (count); @@ -646,14 +640,14 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs teco_buffer_edit(buffer); if (buffer == teco_search_parameters.to_buffer) { - if (!teco_do_search(re, teco_search_parameters.dot, + if (!teco_do_search(&re, teco_search_parameters.dot, teco_interface_ssm(SCI_GETLENGTH, 0, 0), &count, error)) return FALSE; break; } - if (!teco_do_search(re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), + if (!teco_do_search(&re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), &count, error)) return FALSE; } while (count); -- cgit v1.2.3