diff options
Diffstat (limited to 'src/search.c')
| -rw-r--r-- | src/search.c | 254 |
1 files changed, 129 insertions, 125 deletions
diff --git a/src/search.c b/src/search.c index 90975c2..601cc55 100644 --- a/src/search.c +++ b/src/search.c @@ -25,6 +25,10 @@ #include <glib.h> #include <glib/gprintf.h> +/* should always be from contrib/terex */ +#include <regalone.h> +#include <regex.h> + #include "sciteco.h" #include "string-utils.h" #include "expressions.h" @@ -57,6 +61,18 @@ TECO_DEFINE_UNDO_SCALAR(teco_search_parameters_t); */ static teco_search_parameters_t teco_search_parameters; +G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(regex_t, tere_free); + +/* not in error.h since we don't want to draw in the terex headers */ +static inline void +teco_error_regex_set(GError **error, gint rc, const regex_t *re) +{ + gchar buf[1024]; + tere_error(rc, re, buf, sizeof(buf)); + g_set_error(error, TECO_ERROR, TECO_ERROR_REGEX, + "Error executing regular expression: %s", buf); +} + /*$ "^X" "search mode" * mode^X -- Set or get search mode flag * -^X @@ -551,24 +567,21 @@ TECO_DEFINE_UNDO_OBJECT_OWN(ranges, teco_range_t *, g_free); /** * Extract the ranges of the given GMatchInfo. * - * @param match_info The result of g_regex_match(). + * @param match_info The result of re_exec(). + * @param count Number of matches (subpatterns). * @param offset The beginning of the match operation in bytes. * Match results will be relative to this offset. - * @param count Where to store the number of ranges (subpatterns). * @returns Ranges (subpatterns) in absolute byte positions. * They \b must still be converted to glyph positions afterwards. */ static teco_range_t * -teco_get_ranges(const GMatchInfo *match_info, gsize offset, guint *count) +teco_get_ranges(const regmatch_t *match_info, guint count, gsize offset) { - *count = g_match_info_get_match_count(match_info); - teco_range_t *ranges = g_new(teco_range_t, *count); - - for (gint i = 0; i < *count; i++) { - gint from, to; - g_match_info_fetch_pos(match_info, i, &from, &to); - ranges[i].from = offset+MAX(from, 0); - ranges[i].to = offset+MAX(to, 0); + teco_range_t *ranges = g_new(teco_range_t, count); + + for (gint i = 0; i < count; i++) { + ranges[i].from = offset+match_info[i].rm_so; + ranges[i].to = offset+match_info[i].rm_eo; } return ranges; @@ -613,32 +626,26 @@ G_DEFINE_AUTOPTR_CLEANUP_FUNC(teco_matches_t, teco_matches_free); * @return FALSE if an error occurred */ static gboolean -teco_do_search_forward(GRegex *re, gsize from, gsize to, gint *count, GError **error) +teco_do_search_forward(regex_t *re, gsize from, gsize to, gint *count, GError **error) { - g_autoptr(GMatchInfo) info = NULL; /* NOTE: can return NULL pointer for completely new and empty documents */ const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETRANGEPOINTER, from, to-from) ? : ""; - GError *tmp_error = NULL; + + g_assert(*count > 0); /* - * NOTE: The return boolean does NOT signal whether an error was generated. + * FIXME: Repeated allocation could be avoided when scanning over buffer boundaries. + * If it's worth it... */ - g_regex_match_full(re, buffer, to-from, 0, 0, &info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } + g_autofree regmatch_t *info = g_new(regmatch_t, 1+re->re_nsub); - g_assert(*count > 0); - while (g_match_info_matches(info) && --(*count)) { - /* - * NOTE: The return boolean does NOT signal whether an error was generated. - */ - g_match_info_next(info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } + static const gint eflags = REG_NOTEOL | REG_NOTBOL; + + gint rc; + while ((rc = tere_exec(re, (const chr *)buffer, to-from, NULL, + 1+re->re_nsub, info, eflags)) == REG_OKAY && --(*count)) { + buffer += info[0].rm_eo; + from += info[0].rm_eo; /* * FIXME: A single pathological match could already be excessively slow. @@ -649,22 +656,21 @@ teco_do_search_forward(GRegex *re, gsize from, gsize to, gint *count, GError **e } } - if (!*count) { + if (rc == REG_OKAY) { /* successful */ - teco_undo_guint(teco_ranges_count); - teco_undo_ranges_own(teco_ranges) = teco_get_ranges(info, from, &teco_ranges_count); + g_assert(*count == 0); + teco_undo_guint(teco_ranges_count) = 1+re->re_nsub; + teco_undo_ranges_own(teco_ranges) = teco_get_ranges(info, teco_ranges_count, from); + } else if (rc != REG_NOMATCH) { + teco_error_regex_set(error, rc, re); + return FALSE; } return TRUE; } -/** - * Block size for backwards scanning or 0 - * - * @bug Block-wise matching is currently broken, - * so we disable this by default - see below. - */ -gsize teco_search_block_size = 0; //4*1024; +/** block size for backwards scanning or 0 */ +gsize teco_search_block_size = 4*1024; /** * Search backwards, in blocks of teco_search_block_size @@ -681,7 +687,7 @@ gsize teco_search_block_size = 0; //4*1024; * @see teco_do_search_forward */ static gboolean -teco_do_search_backwards(GRegex *re, gsize from, gsize to, gint *count, GError **error) +teco_do_search_backwards(regex_t *re, gsize from, gsize to, gint *count, GError **error) { /* * NOTE: can return NULL pointer for completely new and empty documents. @@ -692,6 +698,7 @@ teco_do_search_backwards(GRegex *re, gsize from, gsize to, gint *count, GError * const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETRANGEPOINTER, from, to-from) ? : ""; g_assert(*count < 0); + guint matched_num = -*count; gsize total_size = sizeof(teco_matches_t) + sizeof(teco_match_t[matched_num]); @@ -713,98 +720,83 @@ teco_do_search_backwards(GRegex *re, gsize from, gsize to, gint *count, GError * if (!teco_memory_check(total_size, error)) return FALSE; + /* + * FIXME: The `matched` and `info` allocations are repeated when scanning + * over buffer boundaries and could be avoided by sharing them between + * teco_do_search() calls. If it's worth it... + */ g_autoptr(teco_matches_t) matched = g_malloc0(total_size); matched->count = matched_num; + g_autofree regmatch_t *info = g_new(regmatch_t, 1+re->re_nsub); + gint matched_total = 0; gint i = 0; /* ring buffer pointer into the `matched->matches` array */ gsize to_block = to-from; while (to_block > 0) { - g_autoptr(GMatchInfo) info = NULL; - gsize from_block = teco_search_block_size > 0 ? MAX(0, to_block - teco_search_block_size) : 0; - /* - * FIXME: DEC TECO search semantics could actually demand - * allowing matches to extend beyond the [from,to] range. - */ - GRegexMatchFlags flags = to_block != to-from ? G_REGEX_MATCH_PARTIAL_HARD : 0; + /* how many bytes have been consumed in the current block */ + gsize offset = 0; - GError *tmp_error = NULL; + static const gint eflags = REG_NOTEOL | REG_NOTBOL; + /* for partial matches - mandatory when using REG_EXPECT */ + rm_detail_t details; - /* - * NOTE: The return boolean does NOT signal whether an error was generated. - * FIXME: Why isn't it possible to specify a start_position != 0? - */ - g_regex_match_full(re, buffer+from_block, to_block-from_block, 0, - flags, &info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } + gint rc; for (;;) { + /* + * FIXME: A single pathological match could already be excessively slow. + */ if (G_UNLIKELY(teco_interface_is_interrupted())) { teco_error_interrupted_set(error); return FALSE; } - if (g_match_info_matches(info)) { - g_free(matched->matches[i].ranges); - matched->matches[i].ranges = teco_get_ranges(info, from+from_block, - &matched->matches[i].num_ranges); - i = ++matched_total % matched_num; - } else if (G_UNLIKELY(g_match_info_is_partial_match(info))) { - /* - * Match may fall on the block boundary, - * so retry matching the rest of the document. - * This is the only case where we have to rescan - * the same memory more than once. - * - * FIXME FIXME FIXME: We cannot retrieve the position here - * since g_match_info_fetch_pos() treats partial matches as errors. - * This is a confirmed glib bug and fast backwards searches - * will continue to be broken until we switch to a custom regexp - * engine. - */ - gint partial_start, partial_end; - G_GNUC_UNUSED gboolean rc; - rc = g_match_info_fetch_pos(info, 0, &partial_start, &partial_end); - //g_assert(rc == TRUE); - if (!rc) - /* make sure that test case fails */ - abort(); - g_assert(partial_end == to_block-from_block); - - g_autoptr(GMatchInfo) partial_info = NULL; - - g_regex_match_full(re, buffer+partial_start, to-from-partial_start, 0, - G_REGEX_MATCH_ANCHORED, &partial_info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } + rc = tere_exec(re, (const chr *)buffer+from_block+offset, to_block-from_block-offset, + &details, 1+re->re_nsub, info, eflags); + if (rc != REG_OKAY) + break; - if (g_match_info_matches(partial_info)) { - g_free(matched->matches[i].ranges); - matched->matches[i].ranges = teco_get_ranges(partial_info, from+partial_start, - &matched->matches[i].num_ranges); - i = ++matched_total % matched_num; - } + /* normal full match */ + g_free(matched->matches[i].ranges); + matched->matches[i].num_ranges = 1+re->re_nsub; + matched->matches[i].ranges = teco_get_ranges(info, matched->matches[i].num_ranges, + from+from_block+offset); + i = ++matched_total % matched_num; - /* there might still be other matches within the current block */ - } else { - break; - } + offset += info[0].rm_eo; + } + + if (rc != REG_NOMATCH) { + teco_error_regex_set(error, rc, re); + return FALSE; + } + if (G_UNLIKELY(to_block != to-from && + details.rm_extend.rm_eo == to_block-from_block-offset)) { /* - * NOTE: The return boolean does NOT signal whether an error was generated. + * Match may fall on the block boundary, + * so retry matching the rest of the document. + * This is the only case where we have to rescan + * the same memory more than once. */ - g_match_info_next(info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); + gsize partial_start = from_block+offset+details.rm_extend.rm_so; + + rc = tere_exec(re, (const chr *)buffer+partial_start, to-from-partial_start, + &details, 1+re->re_nsub, info, eflags | REG_ANCHORED); + + if (rc == REG_OKAY) { + g_free(matched->matches[i].ranges); + matched->matches[i].num_ranges = 1+re->re_nsub; + matched->matches[i].ranges = teco_get_ranges(info, matched->matches[i].num_ranges, + from+partial_start); + i = ++matched_total % matched_num; + } else if (rc != REG_NOMATCH) { + teco_error_regex_set(error, rc, re); return FALSE; } } @@ -831,6 +823,7 @@ teco_do_search_backwards(GRegex *re, gsize from, gsize to, gint *count, GError * matched_num -= matched_total; i = 0; + /* try previous block */ to_block = from_block; } @@ -840,7 +833,7 @@ teco_do_search_backwards(GRegex *re, gsize from, gsize to, gint *count, GError * } static gboolean -teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error) +teco_do_search(regex_t *re, gsize from, gsize to, gint *count, GError **error) { gboolean rc = *count >= 0 ? teco_do_search_forward(re, from, to, count, error) : teco_do_search_backwards(re, from, to, count, error); @@ -871,8 +864,7 @@ teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error) static gboolean teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new_chars, GError **error) { - /* FIXME: Should G_REGEX_OPTIMIZE be added under certain circumstances? */ - GRegexCompileFlags flags = G_REGEX_MULTILINE | G_REGEX_DOTALL; + gint cflags = REG_ADVANCED; teco_qreg_t *reg = teco_qreg_table_find(ctx->qreg_table_locals, "\x18", 1); /* ^X */ g_assert(reg != NULL); @@ -880,15 +872,24 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new if (!reg->vtable->get_integer(reg, &search_mode, error)) return FALSE; if (teco_is_failure(search_mode)) - flags |= G_REGEX_CASELESS; + cflags |= REG_ICASE; if (ctx->flags.modifier_colon == 2) - flags |= G_REGEX_ANCHORED; + cflags |= REG_BOSONLY; /* anchored */ + + gint count = teco_search_parameters.count; + + /* + * Backwards searches require partial match information. + * Fortunately, it appears to be almost for free. + */ + if (count < 0) + cflags |= REG_EXPECT; /* this is set in teco_state_search_initial() */ if (ctx->expectstring.machine.codepage != SC_CP_UTF8) { /* single byte encoding */ - flags |= G_REGEX_RAW; + cflags |= REG_RAW; } else if (!teco_string_validate_utf8(str)) { /* * While SciTECO code is always guaranteed to be in valid UTF-8, @@ -913,7 +914,6 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new g_autoptr(teco_machine_qregspec_t) qreg_machine; qreg_machine = teco_machine_qregspec_new(TECO_QREG_REQUIRED, ctx->qreg_table_locals, FALSE); - g_autoptr(GRegex) re = NULL; g_autofree gchar *re_pattern; /* NOTE: teco_pattern2regexp() modifies str pointer */ re_pattern = teco_pattern2regexp(&str, qreg_machine, @@ -923,13 +923,19 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new #ifdef DEBUG g_printf("REGEXP: %s\n", re_pattern); #endif + + g_auto(regex_t) re; + memset(&re, 0, sizeof(re)); + if (!*re_pattern) goto failure; + /* - * FIXME: Should we propagate at least some of the errors? + * FIXME: No need to escape null-chars in re_pattern. + * Actually no need to generate a regexp for TECO patterns. */ - re = g_regex_new(re_pattern, flags, 0, NULL); - if (!re) + gint rc = tere_comp(&re, (chr *)re_pattern, strlen(re_pattern), cflags); + if (rc != REG_OKAY) goto failure; if (!teco_qreg_current && @@ -938,9 +944,7 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new teco_buffer_edit(teco_search_parameters.from_buffer); } - gint count = teco_search_parameters.count; - - if (!teco_do_search(re, teco_search_parameters.from, teco_search_parameters.to, &count, error)) + if (!teco_do_search(&re, teco_search_parameters.from, teco_search_parameters.to, &count, error)) return FALSE; if (teco_search_parameters.to_buffer && count) { @@ -956,12 +960,12 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new teco_buffer_edit(buffer); if (buffer == teco_search_parameters.to_buffer) { - if (!teco_do_search(re, 0, teco_search_parameters.dot, &count, error)) + if (!teco_do_search(&re, 0, teco_search_parameters.dot, &count, error)) return FALSE; break; } - if (!teco_do_search(re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), + if (!teco_do_search(&re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), &count, error)) return FALSE; } while (count); @@ -972,14 +976,14 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new teco_buffer_edit(buffer); if (buffer == teco_search_parameters.to_buffer) { - if (!teco_do_search(re, teco_search_parameters.dot, + if (!teco_do_search(&re, teco_search_parameters.dot, teco_interface_ssm(SCI_GETLENGTH, 0, 0), &count, error)) return FALSE; break; } - if (!teco_do_search(re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), + if (!teco_do_search(&re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), &count, error)) return FALSE; } while (count); |
