From aa7b0bb1445feeefafdcf47fd639b10500b45c03 Mon Sep 17 00:00:00 2001 From: Robin Haberkorn Date: Sun, 31 May 2026 21:19:24 +0200 Subject: implemented but disabled block-wise backwards search algorithm * The block-wise search algorithm allows for efficient backwards searches on large files. * On the downside the results are not entirely symmetric to forward searches. It therefore makes sense to still support the old correct but possibly slow algorithm. Since the old algorithm is just a special case of the new one (with a single block stretching the entire search range), you can configure the block size using `8EJ`. * Unfortunately, the new block-wise algorithm won't work due to a bug in GRegex (only in the glib wrapper code). It is therefore disabled for the time being by default and will probably only be enabled once we switch to a new regexp engine. See https://gitlab.gnome.org/GNOME/glib/-/merge_requests/5199 --- src/search.c | 303 +++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 233 insertions(+), 70 deletions(-) (limited to 'src/search.c') diff --git a/src/search.c b/src/search.c index 89bf688..92be012 100644 --- a/src/search.c +++ b/src/search.c @@ -20,6 +20,7 @@ #endif #include +#include #include #include @@ -573,8 +574,46 @@ teco_get_ranges(const GMatchInfo *match_info, gsize offset, guint *count) return ranges; } +/* only keep the last `count' matches, in a circular stack */ +typedef struct { + guint num_ranges; + teco_range_t *ranges; +} teco_match_t; + +typedef struct { + guint count; + teco_match_t matches[]; +} teco_matches_t; + +static inline void +teco_matches_free(teco_matches_t *p) +{ + for (int i = 0; i < p->count; i++) + g_free(p->matches[i].ranges); + g_free(p); +} + +G_DEFINE_AUTOPTR_CLEANUP_FUNC(teco_matches_t, teco_matches_free); + +/** + * Search forwards in the current buffer. + * + * If `*count == 0` after the call, the n-th match was found in + * the current document and teco_ranges/teco_ranges_count is + * set accordingly but must still be converted into glyph + * offsets. + * + * @param re Compiled regular expression. + * @param from Beginning of the range to search (in bytes). + * @param to End of the range to search (in bytes). + * @param count Pointer to an integer (n-th match to find). + * This is updated to reflect how many matches are still outstanding. + * A negative value indicates backwards searches. + * @param error A GError + * @return FALSE if an error occurred + */ static gboolean -teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error) +teco_do_search_forward(GRegex *re, gsize from, gsize to, gint *count, GError **error) { g_autoptr(GMatchInfo) info = NULL; /* NOTE: can return NULL pointer for completely new and empty documents */ @@ -590,68 +629,175 @@ teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error) return FALSE; } - guint num_ranges = 0; - teco_range_t *matched_ranges = NULL; - - if (*count >= 0) { - while (g_match_info_matches(info) && --(*count)) { - /* - * NOTE: The return boolean does NOT signal whether an error was generated. - */ - g_match_info_next(info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } + g_assert(*count > 0); + while (g_match_info_matches(info) && --(*count)) { + /* + * NOTE: The return boolean does NOT signal whether an error was generated. + */ + g_match_info_next(info, &tmp_error); + if (tmp_error) { + g_propagate_error(error, tmp_error); + return FALSE; + } - if (G_UNLIKELY(teco_interface_is_interrupted())) { - teco_error_interrupted_set(error); - return FALSE; - } + /* + * FIXME: A single pathological match could already be excessively slow. + */ + if (G_UNLIKELY(teco_interface_is_interrupted())) { + teco_error_interrupted_set(error); + return FALSE; } + } - if (!*count) - /* successful */ - matched_ranges = teco_get_ranges(info, from, &num_ranges); - } else { - /* only keep the last `count' matches, in a circular stack */ - typedef struct { - guint num_ranges; - teco_range_t *ranges; - } teco_match_t; + if (!*count) { + /* successful */ + teco_undo_guint(teco_ranges_count); + teco_undo_ranges_own(teco_ranges) = teco_get_ranges(info, from, &teco_ranges_count); + } + + return TRUE; +} - guint matched_num = -*count; - gsize matched_size = sizeof(teco_match_t[matched_num]); +/** + * Block size for backwards scanning or 0 + * + * @bug Block-wise matching is currently broken, + * so we disable this by default - see below. + */ +gsize teco_search_block_size = 0; //4*1024; +/** + * Search backwards, in blocks of teco_search_block_size + * (within each block a forward scan is performed). + * + * This cannot always guarantee longest matches if a match is + * found at the beginning of a block. + * Only if teco_search_block_size is 0 the entire range + * if forward scanned, which ensures longest matches + * entirely symmetrical to forward searches. + * For large block sizes (or 0) even trivial backwards searches + * can be painstakingly slow on large files, though. + * + * @see teco_do_search_forward + */ +static gboolean +teco_do_search_backwards(GRegex *re, gsize from, gsize to, gint *count, GError **error) +{ + /* + * NOTE: can return NULL pointer for completely new and empty documents. + * + * Theoretically we could also get range pointers on demand which would + * be more likely to preserve the buffer gap. + */ + const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETRANGEPOINTER, from, to-from) ? : ""; + + g_assert(*count < 0); + guint matched_num = -*count; + gsize total_size = sizeof(teco_matches_t) + sizeof(teco_match_t[matched_num]); + + /* + * matched_size could overflow. + * NOTE: Glib 2.48 has g_size_checked_mul() which uses + * compiler intrinsics. + */ + if ((gsize)matched_num > (G_MAXSIZE - sizeof(teco_matches_t))/sizeof(teco_match_t)) + /* guaranteed to fail either teco_memory_check() or g_malloc() */ + total_size = G_MAXSIZE; + + /* + * NOTE: It's theoretically possible that the allocation of the `matched` + * array causes an OOM if (-count) is large enough and regular + * memory limiting in teco_machine_main_step() wouldn't help. + * That's why we exceptionally have to check before allocating. + */ + if (!teco_memory_check(total_size, error)) + return FALSE; + + g_autoptr(teco_matches_t) matched = g_malloc0(total_size); + matched->count = matched_num; + + gint matched_total = 0; + gint i = 0; /* ring buffer pointer into the `matched->matches` array */ + + gsize to_block = to-from; + + while (to_block > 0) { + g_autoptr(GMatchInfo) info = NULL; + + gsize from_block = teco_search_block_size > 0 + ? MAX(0, to_block - teco_search_block_size) : 0; /* - * matched_size could overflow. - * NOTE: Glib 2.48 has g_size_checked_mul() which uses - * compiler intrinsics. + * FIXME: DEC TECO search semantics could actually demand + * allowing matches to extend beyond the [from,to] range. */ - if (matched_size / sizeof(teco_match_t) != matched_num) - /* guaranteed to fail either teco_memory_check() or g_malloc() */ - matched_size = G_MAXSIZE; + GRegexMatchFlags flags = to_block != to-from ? G_REGEX_MATCH_PARTIAL_HARD : 0; + + GError *tmp_error = NULL; /* - * NOTE: It's theoretically possible that the allocation of the `matched` - * array causes an OOM if (-count) is large enough and regular - * memory limiting in teco_machine_main_step() wouldn't help. - * That's why we exceptionally have to check before allocating. + * NOTE: The return boolean does NOT signal whether an error was generated. + * FIXME: Why isn't it possible to specify a start_position != 0? */ - if (!teco_memory_check(matched_size, error)) + g_regex_match_full(re, buffer+from_block, to_block-from_block, 0, + flags, &info, &tmp_error); + if (tmp_error) { + g_propagate_error(error, tmp_error); return FALSE; + } - /* - * NOTE: This needs to be deep-freed, which does not currently - * happen automatically. - */ - g_autofree teco_match_t *matched = g_malloc0(matched_size); + for (;;) { + if (G_UNLIKELY(teco_interface_is_interrupted())) { + teco_error_interrupted_set(error); + return FALSE; + } - gint matched_total = 0, i = 0; + if (g_match_info_is_partial_match(info)) { + /* + * Match may fall on the block boundary, + * so retry matching the rest of the document. + * This is the only case where we have to rescan + * the same memory more than once. + * + * FIXME FIXME FIXME: We cannot retrieve the position here + * since g_match_info_fetch_pos() treats partial matches as errors. + * This is a confirmed glib bug and fast backwards searches + * will continue to be broken until we switch to a custom regexp + * engine. + */ + gint partial_start, partial_end; + G_GNUC_UNUSED gboolean rc; + rc = g_match_info_fetch_pos(info, 0, &partial_start, &partial_end); + //g_assert(rc == TRUE); + if (!rc) + abort(); + + if (partial_end == to_block-from_block) { + g_autoptr(GMatchInfo) partial_info = NULL; + + g_regex_match_full(re, buffer+partial_start, to-from-partial_start, 0, + G_REGEX_MATCH_ANCHORED, &partial_info, &tmp_error); + if (tmp_error) { + g_propagate_error(error, tmp_error); + return FALSE; + } + + if (g_match_info_matches(partial_info)) { + g_free(matched->matches[i].ranges); + matched->matches[i].ranges = teco_get_ranges(partial_info, from+partial_start, + &matched->matches[i].num_ranges); + i = ++matched_total % matched_num; + } + } - while (g_match_info_matches(info)) { - g_free(matched[i].ranges); - matched[i].ranges = teco_get_ranges(info, from, &matched[i].num_ranges); + /* there might still be other matches within the current block */ + } else if (g_match_info_matches(info)) { + g_free(matched->matches[i].ranges); + matched->matches[i].ranges = teco_get_ranges(info, from+from_block, + &matched->matches[i].num_ranges); + i = ++matched_total % matched_num; + } else { + break; + } /* * NOTE: The return boolean does NOT signal whether an error was generated. @@ -659,37 +805,50 @@ teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error) g_match_info_next(info, &tmp_error); if (tmp_error) { g_propagate_error(error, tmp_error); - for (int i = 0; i < matched_num; i++) - g_free(matched[i].ranges); return FALSE; } - - if (G_UNLIKELY(teco_interface_is_interrupted())) { - teco_error_interrupted_set(error); - for (int i = 0; i < matched_num; i++) - g_free(matched[i].ranges); - return FALSE; - } - - i = ++matched_total % -(*count); } - *count = MIN(*count + matched_total, 0); - if (!*count) { + if (matched_total >= matched_num) { /* successful -> i points to stack bottom */ - num_ranges = matched[i].num_ranges; - matched_ranges = matched[i].ranges; - matched[i].ranges = NULL; + teco_undo_guint(teco_ranges_count) = matched->matches[i].num_ranges; + teco_undo_ranges_own(teco_ranges) = matched->matches[i].ranges; + matched->matches[i].ranges = NULL; + + *count = 0; + return TRUE; } - for (gint i = 0; i < matched_num; i++) - g_free(matched[i].ranges); + /* + * We got less matches than the required number. + * We know that the result won't be any of the existing + * matches, so we can discard them and shrink our + * circular buffer. + * This avoids merging the result lists of two blocks. + * Elements are freed when being overwritten just like + * when the buffer overflows. + */ + matched_num -= matched_total; + i = 0; + + to_block = from_block; } - if (matched_ranges) { + *count += matched_total; + g_assert(*count < 0); + return TRUE; +} + +static gboolean +teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error) +{ + gboolean rc = *count >= 0 ? teco_do_search_forward(re, from, to, count, error) + : teco_do_search_backwards(re, from, to, count, error); + if (!rc) + return FALSE; + + if (!*count) { /* match success */ - teco_undo_ranges_own(teco_ranges) = matched_ranges; - teco_undo_guint(teco_ranges_count) = num_ranges; g_assert(teco_ranges_count > 0); teco_interface_ssm(SCI_SETSEL, teco_ranges[0].from, teco_ranges[0].to); @@ -697,6 +856,8 @@ teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error) /* * teco_get_ranges() returned byte positions, * while everything else expects glyph offsets. + * They are fixed up only now to avoid unnecessary + * possibly costly teco_interface_bytes2glyphs() calls. */ for (guint i = 0; i < teco_ranges_count; i++) { teco_ranges[i].from = teco_interface_bytes2glyphs(teco_ranges[i].from); @@ -789,6 +950,7 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new teco_ring_undo_edit(); if (count > 0) { + /* search forward */ do { buffer = teco_buffer_next(buffer) ? : teco_ring_first(); teco_buffer_edit(buffer); @@ -804,6 +966,7 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new return FALSE; } while (count); } else /* count < 0 */ { + /* search backwards */ do { buffer = teco_buffer_prev(buffer) ? : teco_ring_last(); teco_buffer_edit(buffer); -- cgit v1.2.3