aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/search.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/search.c')
-rw-r--r--src/search.c211
1 files changed, 143 insertions, 68 deletions
diff --git a/src/search.c b/src/search.c
index 733eab9..0d04895 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2012-2023 Robin Haberkorn
+ * Copyright (C) 2012-2024 Robin Haberkorn
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -38,11 +38,8 @@
#include "search.h"
typedef struct {
- /*
- * FIXME: Should perhaps all be teco_int_t?
- */
- gint dot;
- gint from, to;
+ gssize dot;
+ gssize from, to;
gint count;
teco_buffer_t *from_buffer, *to_buffer;
@@ -63,6 +60,9 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error)
if (ctx->mode > TECO_MODE_NORMAL)
return TRUE;
+ teco_machine_stringbuilding_set_codepage(&ctx->expectstring.machine,
+ teco_interface_get_codepage());
+
if (G_UNLIKELY(!teco_search_qreg_machine))
teco_search_qreg_machine = teco_machine_qregspec_new(TECO_QREG_REQUIRED, ctx->qreg_table_locals,
ctx->parent.must_undo);
@@ -79,16 +79,16 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error)
return FALSE;
if (v1 <= v2) {
teco_search_parameters.count = 1;
- teco_search_parameters.from = (gint)v1;
- teco_search_parameters.to = (gint)v2;
+ teco_search_parameters.from = teco_interface_glyphs2bytes(v1);
+ teco_search_parameters.to = teco_interface_glyphs2bytes(v2);
} else {
teco_search_parameters.count = -1;
- teco_search_parameters.from = (gint)v2;
- teco_search_parameters.to = (gint)v1;
+ teco_search_parameters.from = teco_interface_glyphs2bytes(v2);
+ teco_search_parameters.to = teco_interface_glyphs2bytes(v1);
}
- if (!teco_validate_pos(teco_search_parameters.from) ||
- !teco_validate_pos(teco_search_parameters.to)) {
+ if (teco_search_parameters.from < 0 ||
+ teco_search_parameters.to < 0) {
/*
* FIXME: In derived classes, the command name will
* no longer be correct.
@@ -114,24 +114,10 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error)
return TRUE;
}
-static const gchar *
-teco_regexp_escape_chr(gchar chr)
-{
- static gchar escaped[] = {'\\', '\0', '\0', '\0'};
-
- if (!chr) {
- escaped[1] = 'c';
- escaped[2] = '@';
- return escaped;
- }
-
- escaped[1] = chr;
- escaped[2] = '\0';
- return g_ascii_isalnum(chr) ? escaped + 1 : escaped;
-}
-
typedef enum {
TECO_SEARCH_STATE_START,
+ TECO_SEARCH_STATE_CTL,
+ TECO_SEARCH_STATE_ESCAPE,
TECO_SEARCH_STATE_NOT,
TECO_SEARCH_STATE_CTL_E,
TECO_SEARCH_STATE_ANYQ,
@@ -153,6 +139,7 @@ typedef enum {
* The pointer is modified and always left after
* the last character used, so it may point to the
* terminating null byte after the call.
+ * @param codepage The codepage of pattern.
* @param escape_default Whether to treat single characters
* as classes or not.
* @param error A GError.
@@ -161,10 +148,13 @@ typedef enum {
* When a non-empty string is returned, the state has always
* been reset to TECO_SEARCH_STATE_START.
* Must be freed with g_free().
+ *
+ * @fixme The allocations could be avoided by letting it append
+ * to the target regexp teco_string_t directly.
*/
static gchar *
teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
- gboolean escape_default, GError **error)
+ guint codepage, gboolean escape_default, GError **error)
{
while (pattern->len > 0) {
switch (*state) {
@@ -184,8 +174,12 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
*/
if (!escape_default)
return g_strdup("");
- pattern->len--;
- return g_strdup(teco_regexp_escape_chr(*pattern->data++));
+ gsize len = codepage == SC_CP_UTF8
+ ? g_utf8_next_char(pattern->data) - pattern->data : 1;
+ gchar *escaped = g_regex_escape_string(pattern->data, len);
+ pattern->data += len;
+ pattern->len -= len;
+ return escaped;
}
break;
@@ -246,25 +240,36 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
case TECO_SEARCH_STATE_ANYQ: {
teco_qreg_t *reg;
-
+ gsize len;
+ gunichar chr;
+
+ if (codepage == SC_CP_UTF8) {
+ len = g_utf8_next_char(pattern->data) - pattern->data;
+ chr = g_utf8_get_char(pattern->data);
+ } else {
+ len = 1;
+ chr = *pattern->data;
+ }
switch (teco_machine_qregspec_input(teco_search_qreg_machine,
- *pattern->data, &reg, NULL, error)) {
+ chr, &reg, NULL, error)) {
case TECO_MACHINE_QREGSPEC_ERROR:
return NULL;
case TECO_MACHINE_QREGSPEC_MORE:
/* incomplete, but consume the character (len bytes) */
- break;
+ pattern->data += len;
+ pattern->len -= len;
+ continue;
case TECO_MACHINE_QREGSPEC_DONE:
teco_machine_qregspec_reset(teco_search_qreg_machine);
g_auto(teco_string_t) str = {NULL, 0};
- if (!reg->vtable->get_string(reg, &str.data, &str.len, error))
+ if (!reg->vtable->get_string(reg, &str.data, &str.len, NULL, error))
return NULL;
- pattern->data++;
- pattern->len--;
+ pattern->data += len;
+ pattern->len -= len;
*state = TECO_SEARCH_STATE_START;
return g_regex_escape_string(str.data, str.len);
}
@@ -303,6 +308,7 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
* successfully scanned character, so it can be
* called recursively. It may also point to the
* terminating null byte after the call.
+ * @param codepage The codepage of pattern.
* @param single_expr Whether to scan a single pattern
* expression or an arbitrary sequence.
* @param error A GError.
@@ -310,19 +316,31 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
* Must be freed with g_free().
*/
static gchar *
-teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error)
+teco_pattern2regexp(teco_string_t *pattern, guint codepage, gboolean single_expr, GError **error)
{
teco_search_state_t state = TECO_SEARCH_STATE_START;
g_auto(teco_string_t) re = {NULL, 0};
do {
/*
+ * Previous character was caret.
+ * Make sure it is handled like a control character.
+ * This is necessary even though we have string building activated,
+ * to support constructs like ^Q^Q (typed with carets) in order to
+ * quote pattern matching characters.
+ */
+ if (state == TECO_SEARCH_STATE_CTL) {
+ *pattern->data = TECO_CTL_KEY(g_ascii_toupper(*pattern->data));
+ state = TECO_SEARCH_STATE_START;
+ }
+
+ /*
* First check whether it is a class.
* This will not treat individual characters
* as classes, so we do not convert them to regexp
* classes unnecessarily.
*/
- g_autofree gchar *temp = teco_class2regexp(&state, pattern, FALSE, error);
+ g_autofree gchar *temp = teco_class2regexp(&state, pattern, codepage, FALSE, error);
if (!temp)
return NULL;
@@ -344,18 +362,40 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error
switch (state) {
case TECO_SEARCH_STATE_START:
switch (*pattern->data) {
- case TECO_CTL_KEY('X'): teco_string_append_c(&re, '.'); break;
- case TECO_CTL_KEY('N'): state = TECO_SEARCH_STATE_NOT; break;
- default: {
- const gchar *escaped = teco_regexp_escape_chr(*pattern->data);
- teco_string_append(&re, escaped, strlen(escaped));
- }
+ case '^':
+ state = TECO_SEARCH_STATE_CTL;
+ break;
+ case TECO_CTL_KEY('Q'):
+ case TECO_CTL_KEY('R'):
+ state = TECO_SEARCH_STATE_ESCAPE;
+ break;
+ case TECO_CTL_KEY('X'):
+ teco_string_append_c(&re, '.');
+ break;
+ case TECO_CTL_KEY('N'):
+ state = TECO_SEARCH_STATE_NOT;
+ break;
+ default:
+ state = TECO_SEARCH_STATE_ESCAPE;
+ continue;
}
break;
+ case TECO_SEARCH_STATE_ESCAPE: {
+ state = TECO_SEARCH_STATE_START;
+ gsize len = codepage == SC_CP_UTF8
+ ? g_utf8_next_char(pattern->data) - pattern->data : 1;
+ /* the allocation could theoretically be avoided by escaping char-wise */
+ g_autofree gchar *escaped = g_regex_escape_string(pattern->data, len);
+ teco_string_append(&re, escaped, strlen(escaped));
+ pattern->data += len;
+ pattern->len -= len;
+ continue;
+ }
+
case TECO_SEARCH_STATE_NOT: {
state = TECO_SEARCH_STATE_START;
- g_autofree gchar *temp = teco_class2regexp(&state, pattern, TRUE, error);
+ g_autofree gchar *temp = teco_class2regexp(&state, pattern, codepage, TRUE, error);
if (!temp)
return NULL;
if (!*temp)
@@ -391,7 +431,7 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error
case TECO_SEARCH_STATE_MANY: {
/* consume exactly one pattern element */
- g_autofree gchar *temp = teco_pattern2regexp(pattern, TRUE, error);
+ g_autofree gchar *temp = teco_pattern2regexp(pattern, codepage, TRUE, error);
if (!temp)
return NULL;
if (!*temp)
@@ -417,7 +457,7 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error
state = TECO_SEARCH_STATE_START;
break;
default: {
- g_autofree gchar *temp = teco_pattern2regexp(pattern, TRUE, error);
+ g_autofree gchar *temp = teco_pattern2regexp(pattern, codepage, TRUE, error);
if (!temp)
return NULL;
if (!*temp)
@@ -454,16 +494,17 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error
}
static gboolean
-teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error)
+teco_do_search(GRegex *re, gsize from, gsize to, gint *count, GError **error)
{
g_autoptr(GMatchInfo) info = NULL;
- const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETCHARACTERPOINTER, 0, 0);
+ /* NOTE: can return NULL pointer for completely new and empty documents */
+ const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETRANGEPOINTER, from, to-from) ? : "";
GError *tmp_error = NULL;
/*
* NOTE: The return boolean does NOT signal whether an error was generated.
*/
- g_regex_match_full(re, buffer, (gssize)to, from, 0, &info, &tmp_error);
+ g_regex_match_full(re, buffer, to-from, 0, 0, &info, &tmp_error);
if (tmp_error) {
g_propagate_error(error, tmp_error);
return FALSE;
@@ -543,7 +584,7 @@ teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error)
if (matched_from >= 0 && matched_to >= 0)
/* match success */
- teco_interface_ssm(SCI_SETSEL, matched_from, matched_to);
+ teco_interface_ssm(SCI_SETSEL, from+matched_from, from+matched_to);
return TRUE;
}
@@ -551,8 +592,22 @@ teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error)
static gboolean
teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gsize new_chars, GError **error)
{
- static const GRegexCompileFlags flags = G_REGEX_CASELESS | G_REGEX_MULTILINE |
- G_REGEX_DOTALL | G_REGEX_RAW;
+ /* FIXME: Should G_REGEX_OPTIMIZE be added under certain circumstances? */
+ GRegexCompileFlags flags = G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_DOTALL;
+
+ /* this is set in teco_state_search_initial() */
+ if (ctx->expectstring.machine.codepage != SC_CP_UTF8) {
+ /* single byte encoding */
+ flags |= G_REGEX_RAW;
+ } else if (!teco_string_validate_utf8(str)) {
+ /*
+ * While SciTECO code is always guaranteed to be in valid UTF-8,
+ * the result of string building may not (eg. if ^EQq inserts garbage).
+ */
+ g_set_error_literal(error, TECO_ERROR, TECO_ERROR_CODEPOINT,
+ "Invalid UTF-8 byte sequence in search pattern");
+ return FALSE;
+ }
if (teco_current_doc_must_undo())
undo__teco_interface_ssm(SCI_SETSEL,
@@ -567,8 +622,9 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs
g_autoptr(GRegex) re = NULL;
teco_string_t pattern = *str;
+ g_autofree gchar *re_pattern;
/* NOTE: teco_pattern2regexp() advances the pattern's data pointer, hence the copy of *str */
- g_autofree gchar *re_pattern = teco_pattern2regexp(&pattern, FALSE, error);
+ re_pattern = teco_pattern2regexp(&pattern, ctx->expectstring.machine.codepage, FALSE, error);
if (!re_pattern)
return FALSE;
teco_machine_qregspec_reset(teco_search_qreg_machine);
@@ -668,13 +724,15 @@ teco_state_search_done(teco_machine_main_t *ctx, const teco_string_t *str, GErro
undo__teco_interface_ssm(SCI_SETANCHOR, anchor, 0);
if (!search_reg->vtable->undo_set_string(search_reg, error) ||
- !search_reg->vtable->set_string(search_reg, str->data, str->len, error))
+ !search_reg->vtable->set_string(search_reg, str->data, str->len,
+ teco_default_codepage(), error))
return NULL;
teco_interface_ssm(SCI_SETANCHOR, anchor, 0);
} else {
g_auto(teco_string_t) search_str = {NULL, 0};
- if (!search_reg->vtable->get_string(search_reg, &search_str.data, &search_str.len, error) ||
+ if (!search_reg->vtable->get_string(search_reg, &search_str.data, &search_str.len,
+ NULL, error) ||
!teco_state_search_process(ctx, &search_str, search_str.len, error))
return NULL;
}
@@ -890,12 +948,12 @@ teco_state_search_kill_done(teco_machine_main_t *ctx, const teco_string_t *str,
if (teco_is_failure(search_state))
return &teco_state_start;
- gint dot = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
+ sptr_t dot = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0);
if (teco_search_parameters.dot < dot) {
/* kill forwards */
- gint anchor = teco_interface_ssm(SCI_GETANCHOR, 0, 0);
+ sptr_t anchor = teco_interface_ssm(SCI_GETANCHOR, 0, 0);
if (teco_current_doc_must_undo())
undo__teco_interface_ssm(SCI_GOTOPOS, dot, 0);
@@ -903,18 +961,23 @@ teco_state_search_kill_done(teco_machine_main_t *ctx, const teco_string_t *str,
teco_interface_ssm(SCI_DELETERANGE, teco_search_parameters.dot,
anchor - teco_search_parameters.dot);
+
+ /* NOTE: An undo action is not always created. */
+ if (teco_current_doc_must_undo() &&
+ teco_search_parameters.dot != anchor)
+ undo__teco_interface_ssm(SCI_UNDO, 0, 0);
} else {
/* kill backwards */
teco_interface_ssm(SCI_DELETERANGE, dot, teco_search_parameters.dot - dot);
+
+ /* NOTE: An undo action is not always created. */
+ if (teco_current_doc_must_undo() &&
+ teco_search_parameters.dot != dot)
+ undo__teco_interface_ssm(SCI_UNDO, 0, 0);
}
teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0);
teco_ring_dirtify();
- /* NOTE: An undo action is not always created. */
- if (teco_current_doc_must_undo() &&
- teco_search_parameters.dot != dot)
- undo__teco_interface_ssm(SCI_UNDO, 0, 0);
-
return &teco_state_start;
}
@@ -981,11 +1044,20 @@ teco_state_search_delete_done(teco_machine_main_t *ctx, const teco_string_t *str
*/
TECO_DEFINE_STATE_SEARCH(teco_state_search_delete);
+static gboolean
+teco_state_replace_insert_initial(teco_machine_main_t *ctx, GError **error)
+{
+ if (ctx->mode == TECO_MODE_NORMAL)
+ teco_machine_stringbuilding_set_codepage(&ctx->expectstring.machine,
+ teco_interface_get_codepage());
+ return TRUE;
+}
+
/*
* FIXME: Could be static
*/
TECO_DEFINE_STATE_INSERT(teco_state_replace_insert,
- .initial_cb = NULL
+ .initial_cb = (teco_state_initial_cb_t)teco_state_replace_insert_initial
);
static teco_state_t *
@@ -1058,11 +1130,13 @@ teco_state_replace_default_insert_done_overwrite(teco_machine_main_t *ctx, const
if (str->len > 0) {
if (!replace_reg->vtable->undo_set_string(replace_reg, error) ||
- !replace_reg->vtable->set_string(replace_reg, str->data, str->len, error))
+ !replace_reg->vtable->set_string(replace_reg, str->data, str->len,
+ teco_default_codepage(), error))
return NULL;
} else {
g_auto(teco_string_t) replace_str = {NULL, 0};
- if (!replace_reg->vtable->get_string(replace_reg, &replace_str.data, &replace_str.len, error) ||
+ if (!replace_reg->vtable->get_string(replace_reg, &replace_str.data, &replace_str.len,
+ NULL, error) ||
(replace_str.len > 0 && !teco_state_insert_process(ctx, &replace_str, replace_str.len, error)))
return NULL;
}
@@ -1089,7 +1163,8 @@ teco_state_replace_default_ignore_done(teco_machine_main_t *ctx, const teco_stri
g_assert(replace_reg != NULL);
if (!replace_reg->vtable->undo_set_string(replace_reg, error) ||
- !replace_reg->vtable->set_string(replace_reg, str->data, str->len, error))
+ !replace_reg->vtable->set_string(replace_reg, str->data, str->len,
+ teco_default_codepage(), error))
return NULL;
return &teco_state_start;