diff options
| -rw-r--r-- | src/core-commands.c | 31 | ||||
| -rw-r--r-- | src/search.c | 303 | ||||
| -rw-r--r-- | src/search.h | 2 | ||||
| -rw-r--r-- | tests/testsuite.at | 17 |
4 files changed, 276 insertions, 77 deletions
diff --git a/src/core-commands.c b/src/core-commands.c index 63d8dbc..5ca508c 100644 --- a/src/core-commands.c +++ b/src/core-commands.c @@ -2110,6 +2110,7 @@ teco_state_ecommand_flags(teco_machine_main_t *ctx, GError **error) * of buffers in the ring. * (\fBread-only\fP) * .IP 2: + * .SCITECO_TOPIC "memory limit" * The current memory limit in bytes. * This limit helps to prevent dangerous out-of-memory * conditions (e.g. resulting from infinite loops) by @@ -2134,7 +2135,7 @@ teco_state_ecommand_flags(teco_machine_main_t *ctx, GError **error) * requirements are too large for the new limit \(em if * this happens you may have to clear your command-line * first. - * Memory limiting is enabled by default. + * Memory limiting is enabled by default (default: 500MB). * .IP 3: * .SCITECO_TOPIC palette * This \fBwrite-only\fP property allows disabling use of @@ -2167,6 +2168,7 @@ teco_state_ecommand_flags(teco_machine_main_t *ctx, GError **error) * You are recommended to add \(lq0,3EJ\(rq to all color * schemes that rely on exact RGB values. * .IP 4: + * .SCITECO_TOPIC caretx * The column after the last horizontal movement. * This is only used by \fBfnkeys.tes\fP and is similar to the Scintilla-internal * setting \fBSCI_CHOOSECARETX\fP. @@ -2208,6 +2210,22 @@ teco_state_ecommand_flags(teco_machine_main_t *ctx, GError **error) * work both on white on black and black on white terminals. * Use -1 to disable default foreground or background colors. * It has no effect when using the GTK interface. + * .IP 8: + * .SCITECO_TOPIC "block size" + * Block size during backwards searches (e.g. \(lq-S\(rq). + * When searching backwards \*(ST will perform forward searches + * in within blocks before the search range's end. + * If not enough matches could be produced, it moves back + * another block. + * The smaller the block size the more likely that a match + * will not be the leftmost longest, but the less matching overhead. + * A negative or 0 value (\(lq0,8EJ\(rq) is the maximum block size and + * ensures forward searches over the entire search range. + * Only this setting guarantees leftmost longest matches that + * are entirely symmetric to forward searches, but can be + * unpractically slow on huge files. + * The default is 0. + * \# FIXME: Feature is currently broken! * . * .IP -1: * Type of the last mouse event (\fBread-only\fP). @@ -2263,7 +2281,8 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error) EJ_CARETX, EJ_CMDLINE_HEIGHT, EJ_RECOVERY_INTERVAL, - EJ_DEFAULT_COLORS + EJ_DEFAULT_COLORS, + EJ_SEARCH_BLOCK_SIZE }; static teco_int_t caret_x = 0; @@ -2328,6 +2347,10 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error) break; } + case EJ_SEARCH_BLOCK_SIZE: + teco_undo_gsize(teco_search_block_size) = MAX(0, value); + break; + default: g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED, "Cannot set property %" TECO_INT_FORMAT " " @@ -2391,6 +2414,10 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error) teco_expressions_push(teco_ring_recovery_interval); break; + case EJ_SEARCH_BLOCK_SIZE: + teco_expressions_push(teco_search_block_size); + break; + default: g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED, "Invalid property %" TECO_INT_FORMAT " " diff --git a/src/search.c b/src/search.c index 89bf688..92be012 100644 --- a/src/search.c +++ b/src/search.c @@ -20,6 +20,7 @@ #endif #include <string.h> +#include <stdlib.h> #include <glib.h> #include <glib/gprintf.h> @@ -573,8 +574,46 @@ teco_get_ranges(const GMatchInfo *match_info, gsize offset, guint *count) return ranges; } +/* only keep the last `count' matches, in a circular stack */ +typedef struct { + guint num_ranges; + teco_range_t *ranges; +} teco_match_t; + +typedef struct { + guint count; + teco_match_t matches[]; +} teco_matches_t; + +static inline void +teco_matches_free(teco_matches_t *p) +{ + for (int i = 0; i < p->count; i++) + g_free(p->matches[i].ranges); + g_free(p); +} + +G_DEFINE_AUTOPTR_CLEANUP_FUNC(teco_matches_t, teco_matches_free); + +/** + * Search forwards in the current buffer. + * + * If `*count == 0` after the call, the n-th match was found in + * the current document and teco_ranges/teco_ranges_count is + * set accordingly but must still be converted into glyph + * offsets. + * + * @param re Compiled regular expression. + * @param from Beginning of the range to search (in bytes). + * @param to End of the range to search (in bytes). + * @param count Pointer to an integer (n-th match to find). + * This is updated to reflect how many matches are still outstanding. + * A negative value indicates backwards searches. + * @param error A GError + * @return FALSE if an error occurred + */ static gboolean -teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error) +teco_do_search_forward(GRegex *re, gsize from, gsize to, gint *count, GError **error) { g_autoptr(GMatchInfo) info = NULL; /* NOTE: can return NULL pointer for completely new and empty documents */ @@ -590,68 +629,175 @@ teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error) return FALSE; } - guint num_ranges = 0; - teco_range_t *matched_ranges = NULL; - - if (*count >= 0) { - while (g_match_info_matches(info) && --(*count)) { - /* - * NOTE: The return boolean does NOT signal whether an error was generated. - */ - g_match_info_next(info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } + g_assert(*count > 0); + while (g_match_info_matches(info) && --(*count)) { + /* + * NOTE: The return boolean does NOT signal whether an error was generated. + */ + g_match_info_next(info, &tmp_error); + if (tmp_error) { + g_propagate_error(error, tmp_error); + return FALSE; + } - if (G_UNLIKELY(teco_interface_is_interrupted())) { - teco_error_interrupted_set(error); - return FALSE; - } + /* + * FIXME: A single pathological match could already be excessively slow. + */ + if (G_UNLIKELY(teco_interface_is_interrupted())) { + teco_error_interrupted_set(error); + return FALSE; } + } - if (!*count) - /* successful */ - matched_ranges = teco_get_ranges(info, from, &num_ranges); - } else { - /* only keep the last `count' matches, in a circular stack */ - typedef struct { - guint num_ranges; - teco_range_t *ranges; - } teco_match_t; + if (!*count) { + /* successful */ + teco_undo_guint(teco_ranges_count); + teco_undo_ranges_own(teco_ranges) = teco_get_ranges(info, from, &teco_ranges_count); + } + + return TRUE; +} - guint matched_num = -*count; - gsize matched_size = sizeof(teco_match_t[matched_num]); +/** + * Block size for backwards scanning or 0 + * + * @bug Block-wise matching is currently broken, + * so we disable this by default - see below. + */ +gsize teco_search_block_size = 0; //4*1024; +/** + * Search backwards, in blocks of teco_search_block_size + * (within each block a forward scan is performed). + * + * This cannot always guarantee longest matches if a match is + * found at the beginning of a block. + * Only if teco_search_block_size is 0 the entire range + * if forward scanned, which ensures longest matches + * entirely symmetrical to forward searches. + * For large block sizes (or 0) even trivial backwards searches + * can be painstakingly slow on large files, though. + * + * @see teco_do_search_forward + */ +static gboolean +teco_do_search_backwards(GRegex *re, gsize from, gsize to, gint *count, GError **error) +{ + /* + * NOTE: can return NULL pointer for completely new and empty documents. + * + * Theoretically we could also get range pointers on demand which would + * be more likely to preserve the buffer gap. + */ + const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETRANGEPOINTER, from, to-from) ? : ""; + + g_assert(*count < 0); + guint matched_num = -*count; + gsize total_size = sizeof(teco_matches_t) + sizeof(teco_match_t[matched_num]); + + /* + * matched_size could overflow. + * NOTE: Glib 2.48 has g_size_checked_mul() which uses + * compiler intrinsics. + */ + if ((gsize)matched_num > (G_MAXSIZE - sizeof(teco_matches_t))/sizeof(teco_match_t)) + /* guaranteed to fail either teco_memory_check() or g_malloc() */ + total_size = G_MAXSIZE; + + /* + * NOTE: It's theoretically possible that the allocation of the `matched` + * array causes an OOM if (-count) is large enough and regular + * memory limiting in teco_machine_main_step() wouldn't help. + * That's why we exceptionally have to check before allocating. + */ + if (!teco_memory_check(total_size, error)) + return FALSE; + + g_autoptr(teco_matches_t) matched = g_malloc0(total_size); + matched->count = matched_num; + + gint matched_total = 0; + gint i = 0; /* ring buffer pointer into the `matched->matches` array */ + + gsize to_block = to-from; + + while (to_block > 0) { + g_autoptr(GMatchInfo) info = NULL; + + gsize from_block = teco_search_block_size > 0 + ? MAX(0, to_block - teco_search_block_size) : 0; /* - * matched_size could overflow. - * NOTE: Glib 2.48 has g_size_checked_mul() which uses - * compiler intrinsics. + * FIXME: DEC TECO search semantics could actually demand + * allowing matches to extend beyond the [from,to] range. */ - if (matched_size / sizeof(teco_match_t) != matched_num) - /* guaranteed to fail either teco_memory_check() or g_malloc() */ - matched_size = G_MAXSIZE; + GRegexMatchFlags flags = to_block != to-from ? G_REGEX_MATCH_PARTIAL_HARD : 0; + + GError *tmp_error = NULL; /* - * NOTE: It's theoretically possible that the allocation of the `matched` - * array causes an OOM if (-count) is large enough and regular - * memory limiting in teco_machine_main_step() wouldn't help. - * That's why we exceptionally have to check before allocating. + * NOTE: The return boolean does NOT signal whether an error was generated. + * FIXME: Why isn't it possible to specify a start_position != 0? */ - if (!teco_memory_check(matched_size, error)) + g_regex_match_full(re, buffer+from_block, to_block-from_block, 0, + flags, &info, &tmp_error); + if (tmp_error) { + g_propagate_error(error, tmp_error); return FALSE; + } - /* - * NOTE: This needs to be deep-freed, which does not currently - * happen automatically. - */ - g_autofree teco_match_t *matched = g_malloc0(matched_size); + for (;;) { + if (G_UNLIKELY(teco_interface_is_interrupted())) { + teco_error_interrupted_set(error); + return FALSE; + } - gint matched_total = 0, i = 0; + if (g_match_info_is_partial_match(info)) { + /* + * Match may fall on the block boundary, + * so retry matching the rest of the document. + * This is the only case where we have to rescan + * the same memory more than once. + * + * FIXME FIXME FIXME: We cannot retrieve the position here + * since g_match_info_fetch_pos() treats partial matches as errors. + * This is a confirmed glib bug and fast backwards searches + * will continue to be broken until we switch to a custom regexp + * engine. + */ + gint partial_start, partial_end; + G_GNUC_UNUSED gboolean rc; + rc = g_match_info_fetch_pos(info, 0, &partial_start, &partial_end); + //g_assert(rc == TRUE); + if (!rc) + abort(); + + if (partial_end == to_block-from_block) { + g_autoptr(GMatchInfo) partial_info = NULL; + + g_regex_match_full(re, buffer+partial_start, to-from-partial_start, 0, + G_REGEX_MATCH_ANCHORED, &partial_info, &tmp_error); + if (tmp_error) { + g_propagate_error(error, tmp_error); + return FALSE; + } + + if (g_match_info_matches(partial_info)) { + g_free(matched->matches[i].ranges); + matched->matches[i].ranges = teco_get_ranges(partial_info, from+partial_start, + &matched->matches[i].num_ranges); + i = ++matched_total % matched_num; + } + } - while (g_match_info_matches(info)) { - g_free(matched[i].ranges); - matched[i].ranges = teco_get_ranges(info, from, &matched[i].num_ranges); + /* there might still be other matches within the current block */ + } else if (g_match_info_matches(info)) { + g_free(matched->matches[i].ranges); + matched->matches[i].ranges = teco_get_ranges(info, from+from_block, + &matched->matches[i].num_ranges); + i = ++matched_total % matched_num; + } else { + break; + } /* * NOTE: The return boolean does NOT signal whether an error was generated. @@ -659,37 +805,50 @@ teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error) g_match_info_next(info, &tmp_error); if (tmp_error) { g_propagate_error(error, tmp_error); - for (int i = 0; i < matched_num; i++) - g_free(matched[i].ranges); return FALSE; } - - if (G_UNLIKELY(teco_interface_is_interrupted())) { - teco_error_interrupted_set(error); - for (int i = 0; i < matched_num; i++) - g_free(matched[i].ranges); - return FALSE; - } - - i = ++matched_total % -(*count); } - *count = MIN(*count + matched_total, 0); - if (!*count) { + if (matched_total >= matched_num) { /* successful -> i points to stack bottom */ - num_ranges = matched[i].num_ranges; - matched_ranges = matched[i].ranges; - matched[i].ranges = NULL; + teco_undo_guint(teco_ranges_count) = matched->matches[i].num_ranges; + teco_undo_ranges_own(teco_ranges) = matched->matches[i].ranges; + matched->matches[i].ranges = NULL; + + *count = 0; + return TRUE; } - for (gint i = 0; i < matched_num; i++) - g_free(matched[i].ranges); + /* + * We got less matches than the required number. + * We know that the result won't be any of the existing + * matches, so we can discard them and shrink our + * circular buffer. + * This avoids merging the result lists of two blocks. + * Elements are freed when being overwritten just like + * when the buffer overflows. + */ + matched_num -= matched_total; + i = 0; + + to_block = from_block; } - if (matched_ranges) { + *count += matched_total; + g_assert(*count < 0); + return TRUE; +} + +static gboolean +teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error) +{ + gboolean rc = *count >= 0 ? teco_do_search_forward(re, from, to, count, error) + : teco_do_search_backwards(re, from, to, count, error); + if (!rc) + return FALSE; + + if (!*count) { /* match success */ - teco_undo_ranges_own(teco_ranges) = matched_ranges; - teco_undo_guint(teco_ranges_count) = num_ranges; g_assert(teco_ranges_count > 0); teco_interface_ssm(SCI_SETSEL, teco_ranges[0].from, teco_ranges[0].to); @@ -697,6 +856,8 @@ teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error) /* * teco_get_ranges() returned byte positions, * while everything else expects glyph offsets. + * They are fixed up only now to avoid unnecessary + * possibly costly teco_interface_bytes2glyphs() calls. */ for (guint i = 0; i < teco_ranges_count; i++) { teco_ranges[i].from = teco_interface_bytes2glyphs(teco_ranges[i].from); @@ -789,6 +950,7 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new teco_ring_undo_edit(); if (count > 0) { + /* search forward */ do { buffer = teco_buffer_next(buffer) ? : teco_ring_first(); teco_buffer_edit(buffer); @@ -804,6 +966,7 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new return FALSE; } while (count); } else /* count < 0 */ { + /* search backwards */ do { buffer = teco_buffer_prev(buffer) ? : teco_ring_last(); teco_buffer_edit(buffer); diff --git a/src/search.h b/src/search.h index 9bd62f7..dc6413e 100644 --- a/src/search.h +++ b/src/search.h @@ -20,6 +20,8 @@ #include "parser.h" +extern gsize teco_search_block_size; + void teco_state_control_search_mode(teco_machine_main_t *ctx, GError **error); extern teco_state_t teco_state_search; diff --git a/tests/testsuite.at b/tests/testsuite.at index 74fab24..fc8ab37 100644 --- a/tests/testsuite.at +++ b/tests/testsuite.at @@ -670,11 +670,10 @@ AT_CLEANUP #AT_XFAIL_IF(true) #AT_CLEANUP -AT_SETUP([Recursion overflow]) -# Should no longer dump core. -# It could fail because the memory limit is exceeed, -# but not in this case since we limit the recursion. -TE_CHECK([[@^Um{U.a Q.a-100000"<%.aMm'} 0Mm]], 0, ignore, ignore) +AT_SETUP([Block-wise backwards search]) +# Crashes are caused by a glib bug when a match falls on block boundaries. +# See teco_do_search_backwards() +TE_CHECK([[2,8EJ @I/ABCD/ -:@S/BC/"F(0/0)' .-3"N(0/0)' ^S+2"N(0/0)']], 0, ignore, ignore) AT_XFAIL_IF(true) AT_CLEANUP @@ -684,6 +683,14 @@ TE_CHECK([[@I/ /J :@S/^ES^X/"S(0/0)']], 0, ignore, ignore) AT_XFAIL_IF(true) AT_CLEANUP +AT_SETUP([Recursion overflow]) +# Should no longer dump core. +# It could fail because the memory limit is exceeed, +# but not in this case since we limit the recursion. +TE_CHECK([[@^Um{U.a Q.a-100000"<%.aMm'} 0Mm]], 0, ignore, ignore) +AT_XFAIL_IF(true) +AT_CLEANUP + AT_SETUP([Rub out from empty string argument]) # Should rub out the modifiers as well. # Will currently fail because it tries to execute `:@{`. |
