aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRobin Haberkorn <rhaberkorn@fmsbw.de>2026-05-31 21:19:24 +0200
committerRobin Haberkorn <rhaberkorn@fmsbw.de>2026-05-31 21:19:24 +0200
commitaa7b0bb1445feeefafdcf47fd639b10500b45c03 (patch)
tree529351a0ee2862448680b0189d7ee72768f45357
parent957f24cf63261424288a8794f61cfdf5e5606fad (diff)
implemented but disabled block-wise backwards search algorithm
* The block-wise search algorithm allows for efficient backwards searches on large files. * On the downside the results are not entirely symmetric to forward searches. It therefore makes sense to still support the old correct but possibly slow algorithm. Since the old algorithm is just a special case of the new one (with a single block stretching the entire search range), you can configure the block size using `8EJ`. * Unfortunately, the new block-wise algorithm won't work due to a bug in GRegex (only in the glib wrapper code). It is therefore disabled for the time being by default and will probably only be enabled once we switch to a new regexp engine. See https://gitlab.gnome.org/GNOME/glib/-/merge_requests/5199
-rw-r--r--src/core-commands.c31
-rw-r--r--src/search.c303
-rw-r--r--src/search.h2
-rw-r--r--tests/testsuite.at17
4 files changed, 276 insertions, 77 deletions
diff --git a/src/core-commands.c b/src/core-commands.c
index 63d8dbc..5ca508c 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -2110,6 +2110,7 @@ teco_state_ecommand_flags(teco_machine_main_t *ctx, GError **error)
* of buffers in the ring.
* (\fBread-only\fP)
* .IP 2:
+ * .SCITECO_TOPIC "memory limit"
* The current memory limit in bytes.
* This limit helps to prevent dangerous out-of-memory
* conditions (e.g. resulting from infinite loops) by
@@ -2134,7 +2135,7 @@ teco_state_ecommand_flags(teco_machine_main_t *ctx, GError **error)
* requirements are too large for the new limit \(em if
* this happens you may have to clear your command-line
* first.
- * Memory limiting is enabled by default.
+ * Memory limiting is enabled by default (default: 500MB).
* .IP 3:
* .SCITECO_TOPIC palette
* This \fBwrite-only\fP property allows disabling use of
@@ -2167,6 +2168,7 @@ teco_state_ecommand_flags(teco_machine_main_t *ctx, GError **error)
* You are recommended to add \(lq0,3EJ\(rq to all color
* schemes that rely on exact RGB values.
* .IP 4:
+ * .SCITECO_TOPIC caretx
* The column after the last horizontal movement.
* This is only used by \fBfnkeys.tes\fP and is similar to the Scintilla-internal
* setting \fBSCI_CHOOSECARETX\fP.
@@ -2208,6 +2210,22 @@ teco_state_ecommand_flags(teco_machine_main_t *ctx, GError **error)
* work both on white on black and black on white terminals.
* Use -1 to disable default foreground or background colors.
* It has no effect when using the GTK interface.
+ * .IP 8:
+ * .SCITECO_TOPIC "block size"
+ * Block size during backwards searches (e.g. \(lq-S\(rq).
+ * When searching backwards \*(ST will perform forward searches
+ * in within blocks before the search range's end.
+ * If not enough matches could be produced, it moves back
+ * another block.
+ * The smaller the block size the more likely that a match
+ * will not be the leftmost longest, but the less matching overhead.
+ * A negative or 0 value (\(lq0,8EJ\(rq) is the maximum block size and
+ * ensures forward searches over the entire search range.
+ * Only this setting guarantees leftmost longest matches that
+ * are entirely symmetric to forward searches, but can be
+ * unpractically slow on huge files.
+ * The default is 0.
+ * \# FIXME: Feature is currently broken!
* .
* .IP -1:
* Type of the last mouse event (\fBread-only\fP).
@@ -2263,7 +2281,8 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error)
EJ_CARETX,
EJ_CMDLINE_HEIGHT,
EJ_RECOVERY_INTERVAL,
- EJ_DEFAULT_COLORS
+ EJ_DEFAULT_COLORS,
+ EJ_SEARCH_BLOCK_SIZE
};
static teco_int_t caret_x = 0;
@@ -2328,6 +2347,10 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error)
break;
}
+ case EJ_SEARCH_BLOCK_SIZE:
+ teco_undo_gsize(teco_search_block_size) = MAX(0, value);
+ break;
+
default:
g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED,
"Cannot set property %" TECO_INT_FORMAT " "
@@ -2391,6 +2414,10 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error)
teco_expressions_push(teco_ring_recovery_interval);
break;
+ case EJ_SEARCH_BLOCK_SIZE:
+ teco_expressions_push(teco_search_block_size);
+ break;
+
default:
g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED,
"Invalid property %" TECO_INT_FORMAT " "
diff --git a/src/search.c b/src/search.c
index 89bf688..92be012 100644
--- a/src/search.c
+++ b/src/search.c
@@ -20,6 +20,7 @@
#endif
#include <string.h>
+#include <stdlib.h>
#include <glib.h>
#include <glib/gprintf.h>
@@ -573,8 +574,46 @@ teco_get_ranges(const GMatchInfo *match_info, gsize offset, guint *count)
return ranges;
}
+/* only keep the last `count' matches, in a circular stack */
+typedef struct {
+ guint num_ranges;
+ teco_range_t *ranges;
+} teco_match_t;
+
+typedef struct {
+ guint count;
+ teco_match_t matches[];
+} teco_matches_t;
+
+static inline void
+teco_matches_free(teco_matches_t *p)
+{
+ for (int i = 0; i < p->count; i++)
+ g_free(p->matches[i].ranges);
+ g_free(p);
+}
+
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(teco_matches_t, teco_matches_free);
+
+/**
+ * Search forwards in the current buffer.
+ *
+ * If `*count == 0` after the call, the n-th match was found in
+ * the current document and teco_ranges/teco_ranges_count is
+ * set accordingly but must still be converted into glyph
+ * offsets.
+ *
+ * @param re Compiled regular expression.
+ * @param from Beginning of the range to search (in bytes).
+ * @param to End of the range to search (in bytes).
+ * @param count Pointer to an integer (n-th match to find).
+ * This is updated to reflect how many matches are still outstanding.
+ * A negative value indicates backwards searches.
+ * @param error A GError
+ * @return FALSE if an error occurred
+ */
static gboolean
-teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error)
+teco_do_search_forward(GRegex *re, gsize from, gsize to, gint *count, GError **error)
{
g_autoptr(GMatchInfo) info = NULL;
/* NOTE: can return NULL pointer for completely new and empty documents */
@@ -590,68 +629,175 @@ teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error)
return FALSE;
}
- guint num_ranges = 0;
- teco_range_t *matched_ranges = NULL;
-
- if (*count >= 0) {
- while (g_match_info_matches(info) && --(*count)) {
- /*
- * NOTE: The return boolean does NOT signal whether an error was generated.
- */
- g_match_info_next(info, &tmp_error);
- if (tmp_error) {
- g_propagate_error(error, tmp_error);
- return FALSE;
- }
+ g_assert(*count > 0);
+ while (g_match_info_matches(info) && --(*count)) {
+ /*
+ * NOTE: The return boolean does NOT signal whether an error was generated.
+ */
+ g_match_info_next(info, &tmp_error);
+ if (tmp_error) {
+ g_propagate_error(error, tmp_error);
+ return FALSE;
+ }
- if (G_UNLIKELY(teco_interface_is_interrupted())) {
- teco_error_interrupted_set(error);
- return FALSE;
- }
+ /*
+ * FIXME: A single pathological match could already be excessively slow.
+ */
+ if (G_UNLIKELY(teco_interface_is_interrupted())) {
+ teco_error_interrupted_set(error);
+ return FALSE;
}
+ }
- if (!*count)
- /* successful */
- matched_ranges = teco_get_ranges(info, from, &num_ranges);
- } else {
- /* only keep the last `count' matches, in a circular stack */
- typedef struct {
- guint num_ranges;
- teco_range_t *ranges;
- } teco_match_t;
+ if (!*count) {
+ /* successful */
+ teco_undo_guint(teco_ranges_count);
+ teco_undo_ranges_own(teco_ranges) = teco_get_ranges(info, from, &teco_ranges_count);
+ }
+
+ return TRUE;
+}
- guint matched_num = -*count;
- gsize matched_size = sizeof(teco_match_t[matched_num]);
+/**
+ * Block size for backwards scanning or 0
+ *
+ * @bug Block-wise matching is currently broken,
+ * so we disable this by default - see below.
+ */
+gsize teco_search_block_size = 0; //4*1024;
+/**
+ * Search backwards, in blocks of teco_search_block_size
+ * (within each block a forward scan is performed).
+ *
+ * This cannot always guarantee longest matches if a match is
+ * found at the beginning of a block.
+ * Only if teco_search_block_size is 0 the entire range
+ * if forward scanned, which ensures longest matches
+ * entirely symmetrical to forward searches.
+ * For large block sizes (or 0) even trivial backwards searches
+ * can be painstakingly slow on large files, though.
+ *
+ * @see teco_do_search_forward
+ */
+static gboolean
+teco_do_search_backwards(GRegex *re, gsize from, gsize to, gint *count, GError **error)
+{
+ /*
+ * NOTE: can return NULL pointer for completely new and empty documents.
+ *
+ * Theoretically we could also get range pointers on demand which would
+ * be more likely to preserve the buffer gap.
+ */
+ const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETRANGEPOINTER, from, to-from) ? : "";
+
+ g_assert(*count < 0);
+ guint matched_num = -*count;
+ gsize total_size = sizeof(teco_matches_t) + sizeof(teco_match_t[matched_num]);
+
+ /*
+ * matched_size could overflow.
+ * NOTE: Glib 2.48 has g_size_checked_mul() which uses
+ * compiler intrinsics.
+ */
+ if ((gsize)matched_num > (G_MAXSIZE - sizeof(teco_matches_t))/sizeof(teco_match_t))
+ /* guaranteed to fail either teco_memory_check() or g_malloc() */
+ total_size = G_MAXSIZE;
+
+ /*
+ * NOTE: It's theoretically possible that the allocation of the `matched`
+ * array causes an OOM if (-count) is large enough and regular
+ * memory limiting in teco_machine_main_step() wouldn't help.
+ * That's why we exceptionally have to check before allocating.
+ */
+ if (!teco_memory_check(total_size, error))
+ return FALSE;
+
+ g_autoptr(teco_matches_t) matched = g_malloc0(total_size);
+ matched->count = matched_num;
+
+ gint matched_total = 0;
+ gint i = 0; /* ring buffer pointer into the `matched->matches` array */
+
+ gsize to_block = to-from;
+
+ while (to_block > 0) {
+ g_autoptr(GMatchInfo) info = NULL;
+
+ gsize from_block = teco_search_block_size > 0
+ ? MAX(0, to_block - teco_search_block_size) : 0;
/*
- * matched_size could overflow.
- * NOTE: Glib 2.48 has g_size_checked_mul() which uses
- * compiler intrinsics.
+ * FIXME: DEC TECO search semantics could actually demand
+ * allowing matches to extend beyond the [from,to] range.
*/
- if (matched_size / sizeof(teco_match_t) != matched_num)
- /* guaranteed to fail either teco_memory_check() or g_malloc() */
- matched_size = G_MAXSIZE;
+ GRegexMatchFlags flags = to_block != to-from ? G_REGEX_MATCH_PARTIAL_HARD : 0;
+
+ GError *tmp_error = NULL;
/*
- * NOTE: It's theoretically possible that the allocation of the `matched`
- * array causes an OOM if (-count) is large enough and regular
- * memory limiting in teco_machine_main_step() wouldn't help.
- * That's why we exceptionally have to check before allocating.
+ * NOTE: The return boolean does NOT signal whether an error was generated.
+ * FIXME: Why isn't it possible to specify a start_position != 0?
*/
- if (!teco_memory_check(matched_size, error))
+ g_regex_match_full(re, buffer+from_block, to_block-from_block, 0,
+ flags, &info, &tmp_error);
+ if (tmp_error) {
+ g_propagate_error(error, tmp_error);
return FALSE;
+ }
- /*
- * NOTE: This needs to be deep-freed, which does not currently
- * happen automatically.
- */
- g_autofree teco_match_t *matched = g_malloc0(matched_size);
+ for (;;) {
+ if (G_UNLIKELY(teco_interface_is_interrupted())) {
+ teco_error_interrupted_set(error);
+ return FALSE;
+ }
- gint matched_total = 0, i = 0;
+ if (g_match_info_is_partial_match(info)) {
+ /*
+ * Match may fall on the block boundary,
+ * so retry matching the rest of the document.
+ * This is the only case where we have to rescan
+ * the same memory more than once.
+ *
+ * FIXME FIXME FIXME: We cannot retrieve the position here
+ * since g_match_info_fetch_pos() treats partial matches as errors.
+ * This is a confirmed glib bug and fast backwards searches
+ * will continue to be broken until we switch to a custom regexp
+ * engine.
+ */
+ gint partial_start, partial_end;
+ G_GNUC_UNUSED gboolean rc;
+ rc = g_match_info_fetch_pos(info, 0, &partial_start, &partial_end);
+ //g_assert(rc == TRUE);
+ if (!rc)
+ abort();
+
+ if (partial_end == to_block-from_block) {
+ g_autoptr(GMatchInfo) partial_info = NULL;
+
+ g_regex_match_full(re, buffer+partial_start, to-from-partial_start, 0,
+ G_REGEX_MATCH_ANCHORED, &partial_info, &tmp_error);
+ if (tmp_error) {
+ g_propagate_error(error, tmp_error);
+ return FALSE;
+ }
+
+ if (g_match_info_matches(partial_info)) {
+ g_free(matched->matches[i].ranges);
+ matched->matches[i].ranges = teco_get_ranges(partial_info, from+partial_start,
+ &matched->matches[i].num_ranges);
+ i = ++matched_total % matched_num;
+ }
+ }
- while (g_match_info_matches(info)) {
- g_free(matched[i].ranges);
- matched[i].ranges = teco_get_ranges(info, from, &matched[i].num_ranges);
+ /* there might still be other matches within the current block */
+ } else if (g_match_info_matches(info)) {
+ g_free(matched->matches[i].ranges);
+ matched->matches[i].ranges = teco_get_ranges(info, from+from_block,
+ &matched->matches[i].num_ranges);
+ i = ++matched_total % matched_num;
+ } else {
+ break;
+ }
/*
* NOTE: The return boolean does NOT signal whether an error was generated.
@@ -659,37 +805,50 @@ teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error)
g_match_info_next(info, &tmp_error);
if (tmp_error) {
g_propagate_error(error, tmp_error);
- for (int i = 0; i < matched_num; i++)
- g_free(matched[i].ranges);
return FALSE;
}
-
- if (G_UNLIKELY(teco_interface_is_interrupted())) {
- teco_error_interrupted_set(error);
- for (int i = 0; i < matched_num; i++)
- g_free(matched[i].ranges);
- return FALSE;
- }
-
- i = ++matched_total % -(*count);
}
- *count = MIN(*count + matched_total, 0);
- if (!*count) {
+ if (matched_total >= matched_num) {
/* successful -> i points to stack bottom */
- num_ranges = matched[i].num_ranges;
- matched_ranges = matched[i].ranges;
- matched[i].ranges = NULL;
+ teco_undo_guint(teco_ranges_count) = matched->matches[i].num_ranges;
+ teco_undo_ranges_own(teco_ranges) = matched->matches[i].ranges;
+ matched->matches[i].ranges = NULL;
+
+ *count = 0;
+ return TRUE;
}
- for (gint i = 0; i < matched_num; i++)
- g_free(matched[i].ranges);
+ /*
+ * We got less matches than the required number.
+ * We know that the result won't be any of the existing
+ * matches, so we can discard them and shrink our
+ * circular buffer.
+ * This avoids merging the result lists of two blocks.
+ * Elements are freed when being overwritten just like
+ * when the buffer overflows.
+ */
+ matched_num -= matched_total;
+ i = 0;
+
+ to_block = from_block;
}
- if (matched_ranges) {
+ *count += matched_total;
+ g_assert(*count < 0);
+ return TRUE;
+}
+
+static gboolean
+teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error)
+{
+ gboolean rc = *count >= 0 ? teco_do_search_forward(re, from, to, count, error)
+ : teco_do_search_backwards(re, from, to, count, error);
+ if (!rc)
+ return FALSE;
+
+ if (!*count) {
/* match success */
- teco_undo_ranges_own(teco_ranges) = matched_ranges;
- teco_undo_guint(teco_ranges_count) = num_ranges;
g_assert(teco_ranges_count > 0);
teco_interface_ssm(SCI_SETSEL, teco_ranges[0].from, teco_ranges[0].to);
@@ -697,6 +856,8 @@ teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error)
/*
* teco_get_ranges() returned byte positions,
* while everything else expects glyph offsets.
+ * They are fixed up only now to avoid unnecessary
+ * possibly costly teco_interface_bytes2glyphs() calls.
*/
for (guint i = 0; i < teco_ranges_count; i++) {
teco_ranges[i].from = teco_interface_bytes2glyphs(teco_ranges[i].from);
@@ -789,6 +950,7 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new
teco_ring_undo_edit();
if (count > 0) {
+ /* search forward */
do {
buffer = teco_buffer_next(buffer) ? : teco_ring_first();
teco_buffer_edit(buffer);
@@ -804,6 +966,7 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new
return FALSE;
} while (count);
} else /* count < 0 */ {
+ /* search backwards */
do {
buffer = teco_buffer_prev(buffer) ? : teco_ring_last();
teco_buffer_edit(buffer);
diff --git a/src/search.h b/src/search.h
index 9bd62f7..dc6413e 100644
--- a/src/search.h
+++ b/src/search.h
@@ -20,6 +20,8 @@
#include "parser.h"
+extern gsize teco_search_block_size;
+
void teco_state_control_search_mode(teco_machine_main_t *ctx, GError **error);
extern teco_state_t teco_state_search;
diff --git a/tests/testsuite.at b/tests/testsuite.at
index 74fab24..fc8ab37 100644
--- a/tests/testsuite.at
+++ b/tests/testsuite.at
@@ -670,11 +670,10 @@ AT_CLEANUP
#AT_XFAIL_IF(true)
#AT_CLEANUP
-AT_SETUP([Recursion overflow])
-# Should no longer dump core.
-# It could fail because the memory limit is exceeed,
-# but not in this case since we limit the recursion.
-TE_CHECK([[@^Um{U.a Q.a-100000"<%.aMm'} 0Mm]], 0, ignore, ignore)
+AT_SETUP([Block-wise backwards search])
+# Crashes are caused by a glib bug when a match falls on block boundaries.
+# See teco_do_search_backwards()
+TE_CHECK([[2,8EJ @I/ABCD/ -:@S/BC/"F(0/0)' .-3"N(0/0)' ^S+2"N(0/0)']], 0, ignore, ignore)
AT_XFAIL_IF(true)
AT_CLEANUP
@@ -684,6 +683,14 @@ TE_CHECK([[@I/ /J :@S/^ES^X/"S(0/0)']], 0, ignore, ignore)
AT_XFAIL_IF(true)
AT_CLEANUP
+AT_SETUP([Recursion overflow])
+# Should no longer dump core.
+# It could fail because the memory limit is exceeed,
+# but not in this case since we limit the recursion.
+TE_CHECK([[@^Um{U.a Q.a-100000"<%.aMm'} 0Mm]], 0, ignore, ignore)
+AT_XFAIL_IF(true)
+AT_CLEANUP
+
AT_SETUP([Rub out from empty string argument])
# Should rub out the modifiers as well.
# Will currently fail because it tries to execute `:@{`.