aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/search.c
diff options
context:
space:
mode:
authorRobin Haberkorn <rhaberkorn@fmsbw.de>2026-06-28 00:39:51 +0200
committerRobin Haberkorn <rhaberkorn@fmsbw.de>2026-06-28 00:39:51 +0200
commit4fe5bc6f3867096965270c90f2e1e5df77b8825f (patch)
tree07823673c598cf4289ea0ae769c32924e1fcce10 /src/search.c
parentc5cb45fab6d4a63a4fcff2cf7f6801dae2ac4db2 (diff)
terex is the new regular expression engine now and replaces PCRE (GRegex)
* terex is based on Henry Spencer's regular expression engine for Tcl. It is a hybrid NFA/DFA design which has better worst-time runtimes than the backtracking PCRE. Memory usage is also limited and can no longer increase catastrophically. * It should no longer be possible to crash SciTECO with pathological searches. * Since it reliably supports partial matches (REG_EXPECT) we can now enable the new backwards-search algorithm by default. This used to be broken because of a glib bug, which I already fixed. It would however take a long time until this ends up on the majority of glib installations. * Regexp executions can still be quite slow if you are looking for a pattern at the end of a huge file, which can hang the editor, but this can now at least theoretically be solved by adding hooks into terex to poll for interruptions. * We can now also get rid of a TECO-pattern to regexp translation step by directly generating terex tokens (TODO). * Performance-wise terex appears to be slower than PCRE for simple forward searches even when linking everything with optimzations (FIXME). * Having a stand-alone regular expression engine is also a huge step in getting rid of glib. See also: https://git.fmsbw.de/terex/about/
Diffstat (limited to 'src/search.c')
-rw-r--r--src/search.c254
1 files changed, 129 insertions, 125 deletions
diff --git a/src/search.c b/src/search.c
index 90975c2..601cc55 100644
--- a/src/search.c
+++ b/src/search.c
@@ -25,6 +25,10 @@
#include <glib.h>
#include <glib/gprintf.h>
+/* should always be from contrib/terex */
+#include <regalone.h>
+#include <regex.h>
+
#include "sciteco.h"
#include "string-utils.h"
#include "expressions.h"
@@ -57,6 +61,18 @@ TECO_DEFINE_UNDO_SCALAR(teco_search_parameters_t);
*/
static teco_search_parameters_t teco_search_parameters;
+G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(regex_t, tere_free);
+
+/* not in error.h since we don't want to draw in the terex headers */
+static inline void
+teco_error_regex_set(GError **error, gint rc, const regex_t *re)
+{
+ gchar buf[1024];
+ tere_error(rc, re, buf, sizeof(buf));
+ g_set_error(error, TECO_ERROR, TECO_ERROR_REGEX,
+ "Error executing regular expression: %s", buf);
+}
+
/*$ "^X" "search mode"
* mode^X -- Set or get search mode flag
* -^X
@@ -551,24 +567,21 @@ TECO_DEFINE_UNDO_OBJECT_OWN(ranges, teco_range_t *, g_free);
/**
* Extract the ranges of the given GMatchInfo.
*
- * @param match_info The result of g_regex_match().
+ * @param match_info The result of re_exec().
+ * @param count Number of matches (subpatterns).
* @param offset The beginning of the match operation in bytes.
* Match results will be relative to this offset.
- * @param count Where to store the number of ranges (subpatterns).
* @returns Ranges (subpatterns) in absolute byte positions.
* They \b must still be converted to glyph positions afterwards.
*/
static teco_range_t *
-teco_get_ranges(const GMatchInfo *match_info, gsize offset, guint *count)
+teco_get_ranges(const regmatch_t *match_info, guint count, gsize offset)
{
- *count = g_match_info_get_match_count(match_info);
- teco_range_t *ranges = g_new(teco_range_t, *count);
-
- for (gint i = 0; i < *count; i++) {
- gint from, to;
- g_match_info_fetch_pos(match_info, i, &from, &to);
- ranges[i].from = offset+MAX(from, 0);
- ranges[i].to = offset+MAX(to, 0);
+ teco_range_t *ranges = g_new(teco_range_t, count);
+
+ for (gint i = 0; i < count; i++) {
+ ranges[i].from = offset+match_info[i].rm_so;
+ ranges[i].to = offset+match_info[i].rm_eo;
}
return ranges;
@@ -613,32 +626,26 @@ G_DEFINE_AUTOPTR_CLEANUP_FUNC(teco_matches_t, teco_matches_free);
* @return FALSE if an error occurred
*/
static gboolean
-teco_do_search_forward(GRegex *re, gsize from, gsize to, gint *count, GError **error)
+teco_do_search_forward(regex_t *re, gsize from, gsize to, gint *count, GError **error)
{
- g_autoptr(GMatchInfo) info = NULL;
/* NOTE: can return NULL pointer for completely new and empty documents */
const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETRANGEPOINTER, from, to-from) ? : "";
- GError *tmp_error = NULL;
+
+ g_assert(*count > 0);
/*
- * NOTE: The return boolean does NOT signal whether an error was generated.
+ * FIXME: Repeated allocation could be avoided when scanning over buffer boundaries.
+ * If it's worth it...
*/
- g_regex_match_full(re, buffer, to-from, 0, 0, &info, &tmp_error);
- if (tmp_error) {
- g_propagate_error(error, tmp_error);
- return FALSE;
- }
+ g_autofree regmatch_t *info = g_new(regmatch_t, 1+re->re_nsub);
- g_assert(*count > 0);
- while (g_match_info_matches(info) && --(*count)) {
- /*
- * NOTE: The return boolean does NOT signal whether an error was generated.
- */
- g_match_info_next(info, &tmp_error);
- if (tmp_error) {
- g_propagate_error(error, tmp_error);
- return FALSE;
- }
+ static const gint eflags = REG_NOTEOL | REG_NOTBOL;
+
+ gint rc;
+ while ((rc = tere_exec(re, (const chr *)buffer, to-from, NULL,
+ 1+re->re_nsub, info, eflags)) == REG_OKAY && --(*count)) {
+ buffer += info[0].rm_eo;
+ from += info[0].rm_eo;
/*
* FIXME: A single pathological match could already be excessively slow.
@@ -649,22 +656,21 @@ teco_do_search_forward(GRegex *re, gsize from, gsize to, gint *count, GError **e
}
}
- if (!*count) {
+ if (rc == REG_OKAY) {
/* successful */
- teco_undo_guint(teco_ranges_count);
- teco_undo_ranges_own(teco_ranges) = teco_get_ranges(info, from, &teco_ranges_count);
+ g_assert(*count == 0);
+ teco_undo_guint(teco_ranges_count) = 1+re->re_nsub;
+ teco_undo_ranges_own(teco_ranges) = teco_get_ranges(info, teco_ranges_count, from);
+ } else if (rc != REG_NOMATCH) {
+ teco_error_regex_set(error, rc, re);
+ return FALSE;
}
return TRUE;
}
-/**
- * Block size for backwards scanning or 0
- *
- * @bug Block-wise matching is currently broken,
- * so we disable this by default - see below.
- */
-gsize teco_search_block_size = 0; //4*1024;
+/** block size for backwards scanning or 0 */
+gsize teco_search_block_size = 4*1024;
/**
* Search backwards, in blocks of teco_search_block_size
@@ -681,7 +687,7 @@ gsize teco_search_block_size = 0; //4*1024;
* @see teco_do_search_forward
*/
static gboolean
-teco_do_search_backwards(GRegex *re, gsize from, gsize to, gint *count, GError **error)
+teco_do_search_backwards(regex_t *re, gsize from, gsize to, gint *count, GError **error)
{
/*
* NOTE: can return NULL pointer for completely new and empty documents.
@@ -692,6 +698,7 @@ teco_do_search_backwards(GRegex *re, gsize from, gsize to, gint *count, GError *
const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETRANGEPOINTER, from, to-from) ? : "";
g_assert(*count < 0);
+
guint matched_num = -*count;
gsize total_size = sizeof(teco_matches_t) + sizeof(teco_match_t[matched_num]);
@@ -713,98 +720,83 @@ teco_do_search_backwards(GRegex *re, gsize from, gsize to, gint *count, GError *
if (!teco_memory_check(total_size, error))
return FALSE;
+ /*
+ * FIXME: The `matched` and `info` allocations are repeated when scanning
+ * over buffer boundaries and could be avoided by sharing them between
+ * teco_do_search() calls. If it's worth it...
+ */
g_autoptr(teco_matches_t) matched = g_malloc0(total_size);
matched->count = matched_num;
+ g_autofree regmatch_t *info = g_new(regmatch_t, 1+re->re_nsub);
+
gint matched_total = 0;
gint i = 0; /* ring buffer pointer into the `matched->matches` array */
gsize to_block = to-from;
while (to_block > 0) {
- g_autoptr(GMatchInfo) info = NULL;
-
gsize from_block = teco_search_block_size > 0
? MAX(0, to_block - teco_search_block_size) : 0;
- /*
- * FIXME: DEC TECO search semantics could actually demand
- * allowing matches to extend beyond the [from,to] range.
- */
- GRegexMatchFlags flags = to_block != to-from ? G_REGEX_MATCH_PARTIAL_HARD : 0;
+ /* how many bytes have been consumed in the current block */
+ gsize offset = 0;
- GError *tmp_error = NULL;
+ static const gint eflags = REG_NOTEOL | REG_NOTBOL;
+ /* for partial matches - mandatory when using REG_EXPECT */
+ rm_detail_t details;
- /*
- * NOTE: The return boolean does NOT signal whether an error was generated.
- * FIXME: Why isn't it possible to specify a start_position != 0?
- */
- g_regex_match_full(re, buffer+from_block, to_block-from_block, 0,
- flags, &info, &tmp_error);
- if (tmp_error) {
- g_propagate_error(error, tmp_error);
- return FALSE;
- }
+ gint rc;
for (;;) {
+ /*
+ * FIXME: A single pathological match could already be excessively slow.
+ */
if (G_UNLIKELY(teco_interface_is_interrupted())) {
teco_error_interrupted_set(error);
return FALSE;
}
- if (g_match_info_matches(info)) {
- g_free(matched->matches[i].ranges);
- matched->matches[i].ranges = teco_get_ranges(info, from+from_block,
- &matched->matches[i].num_ranges);
- i = ++matched_total % matched_num;
- } else if (G_UNLIKELY(g_match_info_is_partial_match(info))) {
- /*
- * Match may fall on the block boundary,
- * so retry matching the rest of the document.
- * This is the only case where we have to rescan
- * the same memory more than once.
- *
- * FIXME FIXME FIXME: We cannot retrieve the position here
- * since g_match_info_fetch_pos() treats partial matches as errors.
- * This is a confirmed glib bug and fast backwards searches
- * will continue to be broken until we switch to a custom regexp
- * engine.
- */
- gint partial_start, partial_end;
- G_GNUC_UNUSED gboolean rc;
- rc = g_match_info_fetch_pos(info, 0, &partial_start, &partial_end);
- //g_assert(rc == TRUE);
- if (!rc)
- /* make sure that test case fails */
- abort();
- g_assert(partial_end == to_block-from_block);
-
- g_autoptr(GMatchInfo) partial_info = NULL;
-
- g_regex_match_full(re, buffer+partial_start, to-from-partial_start, 0,
- G_REGEX_MATCH_ANCHORED, &partial_info, &tmp_error);
- if (tmp_error) {
- g_propagate_error(error, tmp_error);
- return FALSE;
- }
+ rc = tere_exec(re, (const chr *)buffer+from_block+offset, to_block-from_block-offset,
+ &details, 1+re->re_nsub, info, eflags);
+ if (rc != REG_OKAY)
+ break;
- if (g_match_info_matches(partial_info)) {
- g_free(matched->matches[i].ranges);
- matched->matches[i].ranges = teco_get_ranges(partial_info, from+partial_start,
- &matched->matches[i].num_ranges);
- i = ++matched_total % matched_num;
- }
+ /* normal full match */
+ g_free(matched->matches[i].ranges);
+ matched->matches[i].num_ranges = 1+re->re_nsub;
+ matched->matches[i].ranges = teco_get_ranges(info, matched->matches[i].num_ranges,
+ from+from_block+offset);
+ i = ++matched_total % matched_num;
- /* there might still be other matches within the current block */
- } else {
- break;
- }
+ offset += info[0].rm_eo;
+ }
+
+ if (rc != REG_NOMATCH) {
+ teco_error_regex_set(error, rc, re);
+ return FALSE;
+ }
+ if (G_UNLIKELY(to_block != to-from &&
+ details.rm_extend.rm_eo == to_block-from_block-offset)) {
/*
- * NOTE: The return boolean does NOT signal whether an error was generated.
+ * Match may fall on the block boundary,
+ * so retry matching the rest of the document.
+ * This is the only case where we have to rescan
+ * the same memory more than once.
*/
- g_match_info_next(info, &tmp_error);
- if (tmp_error) {
- g_propagate_error(error, tmp_error);
+ gsize partial_start = from_block+offset+details.rm_extend.rm_so;
+
+ rc = tere_exec(re, (const chr *)buffer+partial_start, to-from-partial_start,
+ &details, 1+re->re_nsub, info, eflags | REG_ANCHORED);
+
+ if (rc == REG_OKAY) {
+ g_free(matched->matches[i].ranges);
+ matched->matches[i].num_ranges = 1+re->re_nsub;
+ matched->matches[i].ranges = teco_get_ranges(info, matched->matches[i].num_ranges,
+ from+partial_start);
+ i = ++matched_total % matched_num;
+ } else if (rc != REG_NOMATCH) {
+ teco_error_regex_set(error, rc, re);
return FALSE;
}
}
@@ -831,6 +823,7 @@ teco_do_search_backwards(GRegex *re, gsize from, gsize to, gint *count, GError *
matched_num -= matched_total;
i = 0;
+ /* try previous block */
to_block = from_block;
}
@@ -840,7 +833,7 @@ teco_do_search_backwards(GRegex *re, gsize from, gsize to, gint *count, GError *
}
static gboolean
-teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error)
+teco_do_search(regex_t *re, gsize from, gsize to, gint *count, GError **error)
{
gboolean rc = *count >= 0 ? teco_do_search_forward(re, from, to, count, error)
: teco_do_search_backwards(re, from, to, count, error);
@@ -871,8 +864,7 @@ teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error)
static gboolean
teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new_chars, GError **error)
{
- /* FIXME: Should G_REGEX_OPTIMIZE be added under certain circumstances? */
- GRegexCompileFlags flags = G_REGEX_MULTILINE | G_REGEX_DOTALL;
+ gint cflags = REG_ADVANCED;
teco_qreg_t *reg = teco_qreg_table_find(ctx->qreg_table_locals, "\x18", 1); /* ^X */
g_assert(reg != NULL);
@@ -880,15 +872,24 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new
if (!reg->vtable->get_integer(reg, &search_mode, error))
return FALSE;
if (teco_is_failure(search_mode))
- flags |= G_REGEX_CASELESS;
+ cflags |= REG_ICASE;
if (ctx->flags.modifier_colon == 2)
- flags |= G_REGEX_ANCHORED;
+ cflags |= REG_BOSONLY; /* anchored */
+
+ gint count = teco_search_parameters.count;
+
+ /*
+ * Backwards searches require partial match information.
+ * Fortunately, it appears to be almost for free.
+ */
+ if (count < 0)
+ cflags |= REG_EXPECT;
/* this is set in teco_state_search_initial() */
if (ctx->expectstring.machine.codepage != SC_CP_UTF8) {
/* single byte encoding */
- flags |= G_REGEX_RAW;
+ cflags |= REG_RAW;
} else if (!teco_string_validate_utf8(str)) {
/*
* While SciTECO code is always guaranteed to be in valid UTF-8,
@@ -913,7 +914,6 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new
g_autoptr(teco_machine_qregspec_t) qreg_machine;
qreg_machine = teco_machine_qregspec_new(TECO_QREG_REQUIRED, ctx->qreg_table_locals, FALSE);
- g_autoptr(GRegex) re = NULL;
g_autofree gchar *re_pattern;
/* NOTE: teco_pattern2regexp() modifies str pointer */
re_pattern = teco_pattern2regexp(&str, qreg_machine,
@@ -923,13 +923,19 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new
#ifdef DEBUG
g_printf("REGEXP: %s\n", re_pattern);
#endif
+
+ g_auto(regex_t) re;
+ memset(&re, 0, sizeof(re));
+
if (!*re_pattern)
goto failure;
+
/*
- * FIXME: Should we propagate at least some of the errors?
+ * FIXME: No need to escape null-chars in re_pattern.
+ * Actually no need to generate a regexp for TECO patterns.
*/
- re = g_regex_new(re_pattern, flags, 0, NULL);
- if (!re)
+ gint rc = tere_comp(&re, (chr *)re_pattern, strlen(re_pattern), cflags);
+ if (rc != REG_OKAY)
goto failure;
if (!teco_qreg_current &&
@@ -938,9 +944,7 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new
teco_buffer_edit(teco_search_parameters.from_buffer);
}
- gint count = teco_search_parameters.count;
-
- if (!teco_do_search(re, teco_search_parameters.from, teco_search_parameters.to, &count, error))
+ if (!teco_do_search(&re, teco_search_parameters.from, teco_search_parameters.to, &count, error))
return FALSE;
if (teco_search_parameters.to_buffer && count) {
@@ -956,12 +960,12 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new
teco_buffer_edit(buffer);
if (buffer == teco_search_parameters.to_buffer) {
- if (!teco_do_search(re, 0, teco_search_parameters.dot, &count, error))
+ if (!teco_do_search(&re, 0, teco_search_parameters.dot, &count, error))
return FALSE;
break;
}
- if (!teco_do_search(re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0),
+ if (!teco_do_search(&re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0),
&count, error))
return FALSE;
} while (count);
@@ -972,14 +976,14 @@ teco_state_search_process(teco_machine_main_t *ctx, teco_string_t str, gsize new
teco_buffer_edit(buffer);
if (buffer == teco_search_parameters.to_buffer) {
- if (!teco_do_search(re, teco_search_parameters.dot,
+ if (!teco_do_search(&re, teco_search_parameters.dot,
teco_interface_ssm(SCI_GETLENGTH, 0, 0),
&count, error))
return FALSE;
break;
}
- if (!teco_do_search(re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0),
+ if (!teco_do_search(&re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0),
&count, error))
return FALSE;
} while (count);