about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
-rw-r--r-- src/search.c | 57
-rw-r--r-- tests/testsuite.at | 6
2 files changed, 58 insertions, 5 deletions
diff --git a/src/search.c b/src/search.c
index 601cc55..0df483f 100644
--- a/src/search.c
+++ b/src/search.c
@@ -166,6 +166,53 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error)
return TRUE;
}
+/**
+ * Escape an arbitrary byte string, so it can be embedded into an
+ * Advanced Regular Expression and is matched literally.
+ *
+ * This is similar to g_regex_escape_string() but in contrast
+ * the result can be used within character classes as well.
+ *
+ * @param str String to escape. It may contain null bytes.
+ * @param len Length of string in bytes.
+ * @return Newly allocated null-terminated regular expression (free with g_free()).
+ */
+static gchar *
+teco_regex_escape(const gchar *str, gsize len)
+{
+	/*
+	 * Worst case is 4 bytes per input byte (null bytes). Considering the
+	 * expected string sizes, calculating a minimal size is not worth it.
+	 */
+	gchar *escaped = g_malloc(len*4+1);
+	gchar *p = escaped;
+
+	/*
+	 * All escaped characters are ASCII, so UTF-8 continuation bytes are
+	 * never affected. `)` must be escaped as well since it closes groups.
+	 */
+	while (len > 0) {
+		if (!*str) {
+			/* there is no shorter __unambiguous__ way */
+			*p++ = '\\';
+			*p++ = '0';
+			*p++ = '0';
+			*p++ = '0';
+		} else if (strchr("^.*+?()|[-]{\\$", *str)) {
+			*p++ = '\\';
+			*p++ = *str;
+		} else {
+			*p++ = *str;
+		}
+
+		str++;
+		len--;
+	}
+	*p = '\0';
+
+	return escaped;
+}
+
typedef enum {
TECO_SEARCH_STATE_START,
TECO_SEARCH_STATE_CTL,
@@ -230,7 +277,7 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
return g_strdup("");
gsize len = codepage == SC_CP_UTF8
? g_utf8_next_char(pattern->data) - pattern->data : 1;
- gchar *escaped = g_regex_escape_string(pattern->data, len);
+ gchar *escaped = teco_regex_escape(pattern->data, len);
pattern->data += len;
pattern->len -= len;
return escaped;
@@ -325,7 +372,7 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
pattern->data += len;
pattern->len -= len;
*state = TECO_SEARCH_STATE_START;
- return g_regex_escape_string(str.data ? : "", str.len);
+ return teco_regex_escape(str.data ? : "", str.len);
}
break;
}
@@ -444,7 +491,7 @@ teco_pattern2regexp(teco_string_t *pattern, teco_machine_qregspec_t *qreg_machin
gsize len = codepage == SC_CP_UTF8
? g_utf8_next_char(pattern->data) - pattern->data : 1;
/* the allocation could theoretically be avoided by escaping char-wise */
- g_autofree gchar *escaped = g_regex_escape_string(pattern->data, len);
+ g_autofree gchar *escaped = teco_regex_escape(pattern->data, len);
teco_string_append(&re, escaped, strlen(escaped));
pattern->data += len;
pattern->len -= len;
@@ -736,8 +783,8 @@ teco_do_search_backwards(regex_t *re, gsize from, gsize to, gint *count, GError
gsize to_block = to-from;
while (to_block > 0) {
- gsize from_block = teco_search_block_size > 0
- ? MAX(0, to_block - teco_search_block_size) : 0;
+ gsize from_block = teco_search_block_size > 0 && to_block >= teco_search_block_size
+ ? to_block - teco_search_block_size : 0;
/* how many bytes have been consumed in the current block */
gsize offset = 0;
diff --git a/tests/testsuite.at b/tests/testsuite.at
index a97e0f8..0f7e32b 100644
--- a/tests/testsuite.at
+++ b/tests/testsuite.at
@@ -515,6 +515,11 @@ AT_SETUP([Search for one of characters in uninitialized Q-Register])
TE_CHECK([[:@S/^EGa/"S(0/0)']], 0, ignore, ignore)
AT_CLEANUP
+AT_SETUP([Search for class with special characters])
+# Relevant when patterns are internally converted to regular expressions.
+TE_CHECK([[![! @I/^@-]B/J ![! @EUc/^@1]A-C/ ::@S/^EM^EGc/"F(0/0)' ^S+3"N(0/0)']], 0, ignore, ignore)
+AT_CLEANUP
+
AT_SETUP([Search accesses wrong Q-Register table])
TE_CHECK([[@^U.#xx/123/ @^Um{:@S/^EG.#xx/$} :Mm Mm]], 1, ignore, ignore)
AT_CLEANUP
@@ -532,6 +537,7 @@ TE_CHECK([[100000<@I"^J">J @S"^EM^X"]], 0, ignore, ignore)
AT_CLEANUP
AT_SETUP([Block-wise backwards search])
+TE_CHECK([[@I/ABCD/ -:@S/A/"F(0/0)']], 0, ignore, ignore)
# Failed when using GRegex (PCRE), which had broken support for partial matches.
# This is not an issue with terex.
TE_CHECK([[2,8EJ @I/ABCD/ -:@S/BC/"F(0/0)' .-3"N(0/0)' ^S+2"N(0/0)']], 0, ignore, ignore)