aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/search.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/search.c')
-rw-r--r--src/search.c303
1 files changed, 233 insertions, 70 deletions
diff --git a/src/search.c b/src/search.c
index 89bf688..92be012 100644
--- a/src/search.c
+++ b/src/search.c
@@ -20,6 +20,7 @@
#endif
#include <string.h>
+#include <stdlib.h>
#include <glib.h>
#include <glib/gprintf.h>
@@ -573,8 +574,46 @@ teco_get_ranges(const GMatchInfo *match_info, gsize offset, guint *count)
return ranges;
}
+/* only keep the last `count' matches, in a circular stack */
+typedef struct {
+ guint num_ranges;
+ teco_range_t *ranges;
+} teco_match_t;
+
+typedef struct {
+ guint count;
+ teco_match_t matches[];
+} teco_matches_t;
+
+static inline void
+teco_matches_free(teco_matches_t *p)
+{
+ for (int i = 0; i < p->count; i++)
+ g_free(p->matches[i].ranges);
+ g_free(p);
+}
+
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(teco_matches_t, teco_matches_free);
+
+/**
+ * Search forwards in the current buffer.
+ *
+ * If `*count == 0` after the call, the n-th match was found in
+ * the current document and teco_ranges/teco_ranges_count is
+ * set accordingly but must still be converted into glyph
+ * offsets.
+ *
+ * @param re Compiled regular expression.
+ * @param from Beginning of the range to search (in bytes).
+ * @param to End of the range to search (in bytes).
+ * @param count Pointer to an integer (n-th match to find).
+ * This is updated to reflect how many matches are still outstanding.
+ * A negative value indicates backwards searches.
+ * @param error A GError
+ * @return FALSE if an error occurred
+ */
static gboolean
-teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error)
+teco_do_search_forward(GRegex *re, gsize from, gsize to, gint *count, GError **error)
{
g_autoptr(GMatchInfo) info = NULL;
/* NOTE: can return NULL pointer for completely new and empty documents */
@@ -590,68 +629,175 @@ teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error)
return FALSE;
}
- guint num_ranges = 0;
- teco_range_t *matched_ranges = NULL;
-
- if (*count >= 0) {
- while (g_match_info_matches(info) && --(*count)) {
- /*
- * NOTE: The return boolean does NOT signal whether an error was generated.
- */
- g_match_info_next(info, &tmp_error);
- if (tmp_error) {
- g_propagate_error(error, tmp_error);
- return FALSE;
- }
+ g_assert(*count > 0);
+ while (g_match_info_matches(info) && --(*count)) {
+ /*
+ * NOTE: The return boolean does NOT signal whether an error was generated.
+ */
+ g_match_info_next(info, &tmp_error);
+ if (tmp_error) {
+ g_propagate_error(error, tmp_error);
+ return FALSE;
+ }
- if (G_UNLIKELY(teco_interface_is_interrupted())) {
- teco_error_interrupted_set(error);
- return FALSE;
- }
+ /*
+ * FIXME: A single pathological match could already be excessively slow.
+ */
+ if (G_UNLIKELY(teco_interface_is_interrupted())) {
+ teco_error_interrupted_set(error);
+ return FALSE;
}
+ }
- if (!*count)
- /* successful */
- matched_ranges = teco_get_ranges(info, from, &num_ranges);
- } else {
- /* only keep the last `count' matches, in a circular stack */
- typedef struct {
- guint num_ranges;
- teco_range_t *ranges;
- } teco_match_t;
+ if (!*count) {
+ /* successful */
+ teco_undo_guint(teco_ranges_count);
+ teco_undo_ranges_own(teco_ranges) = teco_get_ranges(info, from, &teco_ranges_count);
+ }
+
+ return TRUE;
+}
- guint matched_num = -*count;
- gsize matched_size = sizeof(teco_match_t[matched_num]);
+/**
+ * Block size for backwards scanning or 0
+ *
+ * @bug Block-wise matching is currently broken,
+ * so we disable this by default - see below.
+ */
+gsize teco_search_block_size = 0; //4*1024;
+/**
+ * Search backwards, in blocks of teco_search_block_size
+ * (within each block a forward scan is performed).
+ *
+ * This cannot always guarantee longest matches if a match is
+ * found at the beginning of a block.
+ * Only if teco_search_block_size is 0 the entire range
+ * if forward scanned, which ensures longest matches
+ * entirely symmetrical to forward searches.
+ * For large block sizes (or 0) even trivial backwards searches
+ * can be painstakingly slow on large files, though.
+ *
+ * @see teco_do_search_forward
+ */
+static gboolean
+teco_do_search_backwards(GRegex *re, gsize from, gsize to, gint *count, GError **error)
+{
+ /*
+ * NOTE: can return NULL pointer for completely new and empty documents.
+ *
+ * Theoretically we could also get range pointers on demand which would
+ * be more likely to preserve the buffer gap.
+ */
+ const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETRANGEPOINTER, from, to-from) ? : "";
+
+ g_assert(*count < 0);
+ guint matched_num = -*count;
+ gsize total_size = sizeof(teco_matches_t) + sizeof(teco_match_t[matched_num]);
+
+ /*
+ * matched_size could overflow.
+ * NOTE: Glib 2.48 has g_size_checked_mul() which uses
+ * compiler intrinsics.
+ */
+ if ((gsize)matched_num > (G_MAXSIZE - sizeof(teco_matches_t))/sizeof(teco_match_t))
+ /* guaranteed to fail either teco_memory_check() or g_malloc() */
+ total_size = G_MAXSIZE;
+
+ /*
+ * NOTE: It's theoretically possible that the allocation of the `matched`
+ * array causes an OOM if (-count) is large enough and regular
+ * memory limiting in teco_machine_main_step() wouldn't help.
+ * That's why we exceptionally have to check before allocating.
+ */
+ if (!teco_memory_check(total_size, error))
+ return FALSE;
+
+ g_autoptr(teco_matches_t) matched = g_malloc0(total_size);
+ matched->count = matched_num;
+
+ gint matched_total = 0;
+ gint i = 0; /* ring buffer pointer into the `matched->matches` array */
+
+ gsize to_block = to-from;
+
+ while (to_block > 0) {
+ g_autoptr(GMatchInfo) info = NULL;
+
+ gsize from_block = teco_search_block_size > 0
+ ? MAX(0, to_block - teco_search_block_size) : 0;
/*
- * matched_size could overflow.
- * NOTE: Glib 2.48 has g_size_checked_mul() which uses
- * compiler intrinsics.
+ * FIXME: DEC TECO search semantics could actually demand
+ * allowing matches to extend beyond the [from,to] range.
*/
- if (matched_size / sizeof(teco_match_t) != matched_num)
- /* guaranteed to fail either teco_memory_check() or g_malloc() */
- matched_size = G_MAXSIZE;
+ GRegexMatchFlags flags = to_block != to-from ? G_REGEX_MATCH_PARTIAL_HARD : 0;
+
+ GError *tmp_error = NULL;
/*
- * NOTE: It's theoretically possible that the allocation of the `matched`
- * array causes an OOM if (-count) is large enough and regular
- * memory limiting in teco_machine_main_step() wouldn't help.
- * That's why we exceptionally have to check before allocating.
+ * NOTE: The return boolean does NOT signal whether an error was generated.
+ * FIXME: Why isn't it possible to specify a start_position != 0?
*/
- if (!teco_memory_check(matched_size, error))
+ g_regex_match_full(re, buffer+from_block, to_block-from_block, 0,
+ flags, &info, &tmp_error);
+ if (tmp_error) {
+ g_propagate_error(error, tmp_error);
return FALSE;
+ }
- /*
- * NOTE: This needs to be deep-freed, which does not currently
- * happen automatically.
- */
- g_autofree teco_match_t *matched = g_malloc0(matched_size);
+ for (;;) {
+ if (G_UNLIKELY(teco_interface_is_interrupted())) {
+ teco_error_interrupted_set(error);
+ return FALSE;
+ }
- gint matched_total = 0, i = 0;
+ if (g_match_info_is_partial_match(info)) {
+ /*
+ * Match may fall on the block boundary,
+ * so retry matching the rest of the document.
+ * This is the only case where we have to rescan
+ * the same memory more than once.
+ *
+ * FIXME FIXME FIXME: We cannot retrieve the position here
+ * since g_match_info_fetch_pos() treats partial matches as errors.
+ * This is a confirmed glib bug and fast backwards searches
+ * will continue to be broken until we switch to a custom regexp
+ * engine.
+ */
+ gint partial_start, partial_end;
+ G_GNUC_UNUSED gboolean rc;
+ rc = g_match_info_fetch_pos(info, 0, &partial_start, &partial_end);
+ //g_assert(rc == TRUE);
+ if (!rc)
+ abort();
+
+ if (partial_end == to_block-from_block) {
+ g_autoptr(GMatchInfo) partial_info = NULL;
+
+ g_regex_match_full(re, buffer+partial_start, to-from-partial_start, 0,
+ G_REGEX_MATCH_ANCHORED, &partial_info, &tmp_error);
+ if (tmp_error) {
+ g_propagate_error(error, tmp_error);
+ return FALSE;
+ }
+
+ if (g_match_info_matches(partial_info)) {
+ g_free(matched->matches[i].ranges);
+ matched->matches[i].ranges = teco_get_ranges(partial_info, from+partial_start,
+ &matched->matches[i].num_ranges);
+ i = ++matched_total % matched_num;
+ }
+ }
- while (g_match_info_matches(info)) {
- g_free(matched[i].ranges);
- matched[i].ranges = teco_get_ranges(info, from, &matched[i].num_ranges);
+ /* there might still be other matches within the current block */
+ } else if (g_match_info_matches(info)) {
+ g_free(matched->matches[i].ranges);
+ matched->matches[i].ranges = teco_get_ranges(info, from+from_block,
+ &matched->matches[i].num_ranges);
+ i = ++matched_total % matched_num;
+ } else {
+ break;
+ }
/*
* NOTE: The return boolean does NOT signal whether an error was generated.
@@ -659,37 +805,50 @@ teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error)
g_match_info_next(info, &tmp_error);
if (tmp_error) {
g_propagate_error(error, tmp_error);
- for (int i = 0; i < matched_num; i++)
- g_free(matched[i].ranges);
return FALSE;
}
-
- if (G_UNLIKELY(teco_interface_is_interrupted())) {
- teco_error_interrupted_set(error);
- for (int i = 0; i < matched_num; i++)
- g_free(matched[i].ranges);
- return FALSE;
- }
-
- i = ++matched_total % -(*count);
}
- *count = MIN(*count + matched_total, 0);
- if (!*count) {
+ if (matched_total >= matched_num) {
/* successful -> i points to stack bottom */
- num_ranges = matched[i].num_ranges;
- matched_ranges = matched[i].ranges;
- matched[i].ranges = NULL;
+ teco_undo_guint(teco_ranges_count) = matched->matches[i].num_ranges;
+ teco_undo_ranges_own(teco_ranges) = matched->matches[i].ranges;
+ matched->matches[i].ranges = NULL;
+
+ *count = 0;
+ return TRUE;
}
- for (gint i = 0; i < matched_num; i++)
- g_free(matched[i].ranges);
+ /*
+ * We got less matches than the required number.
+ * We know that the result won't be any of the existing
+ * matches, so we can discard them and shrink our
+ * circular buffer.
+ * This avoids merging the result lists of two blocks.
+ * Elements are freed when being overwritten just like
+ * when the buffer overflows.
+ */
+ matched_num -= matched_total;
+ i = 0;
+
+ to_block = from_block;
}
- if (matched_ranges) {
+ *count += matched_total;
+ g_assert(*count < 0);
+ return TRUE;
+}
+
+static gboolean
+teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error)
+{
+ gboolean rc = *count >= 0 ? teco_do_search_forward(re, from, to, count, error)
+ : teco_do_search_backwards(re, from, to, count, error);
+ if (!rc)
+ return FALSE;
+
+ if (!*count) {
/* match success */
- teco_undo_ranges_own(teco_ranges) = matched_ranges;
- teco_undo_guint(teco_ranges_count) = num_ranges;
g_assert(teco_ranges_count > 0);
teco_interface_ssm(SCI_SETSEL, teco_ranges[0].from, teco_ranges[0].to);
@@ -697,6 +856,8 @@ teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error)
/*
* teco_get_ranges() returned byte positions,
* while everything else expects glyph offsets.
+ * They are fixed up only now to avoid unnecessary
+ * possibly costly teco_interface_bytes2glyphs() calls.
*/
for (guint i = 0; i < teco_ranges_count; i++) {
teco_ranges[i].from = teco_interface_bytes2glyphs(teco_ranges[i].from);
@@ -789,6 +950,7 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new
teco_ring_undo_edit();
if (count > 0) {
+ /* search forward */
do {
buffer = teco_buffer_next(buffer) ? : teco_ring_first();
teco_buffer_edit(buffer);
@@ -804,6 +966,7 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new
return FALSE;
} while (count);
} else /* count < 0 */ {
+ /* search backwards */
do {
buffer = teco_buffer_prev(buffer) ? : teco_ring_last();
teco_buffer_edit(buffer);