about | summary | refs | log | tree | commit | diff | homepage
path: root/src/search.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/search.c')
-rw-r--r-- src/search.c 86
1 files changed, 74 insertions, 12 deletions
diff --git a/src/search.c b/src/search.c
index 601cc55..ce4a338 100644
--- a/src/search.c
+++ b/src/search.c
@@ -166,6 +166,53 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error)
return TRUE;
}
+/**
+ * Escape an arbitrary string, so it can be embedded into an
+ * Advanced Regular Expression.
+ *
+ * This is similar to g_regex_escape_string() but in contrast
+ * the result can be used within character classes as well.
+ *
+ * Each input byte is translated as follows:
+ *  - a null byte becomes the four characters backslash, 0, 0, 0;
+ *  - any of the bytes ^ . * + ? ( | [ - ] { $ and the backslash
+ *    itself are emitted with a preceding backslash;
+ *  - every other byte is copied unchanged.
+ *
+ * NOTE(review): the closing characters ")" and "}" are not part of
+ * the escaped set and are therefore copied unchanged - confirm that
+ * this is intended for the regular expression dialect in use.
+ *
+ * @param str String to escape. It is read byte-wise for exactly
+ *            len bytes, so it may contain null bytes and does not
+ *            have to be null-terminated.
+ * @param len Length of string in bytes.
+ * @return Regular expression as a newly allocated null-terminated string.
+ *         It is allocated with g_malloc(), so the caller owns it and
+ *         should release it with g_free().
+ */
+static gchar *
+teco_regex_escape(const gchar *str, gsize len)
+{
+ /*
+ * Considering the expected size of strings to escape,
+ * it is probably not worth calculating a minimal size.
+ * Instead the worst case is allocated: every input byte may
+ * expand to at most 4 output bytes (the null byte escape below),
+ * plus one byte for the terminating null.
+ */
+ gchar *escaped = g_malloc(len*4+1);
+ gchar *p = escaped;
+
+ /*
+ * This works in UTF-8 as well since none of the escaped characters
+ * can be a continuation byte: all of them are plain ASCII, so
+ * multi-byte sequences fall through to the verbatim copy branch.
+ */
+ while (len > 0) {
+ if (!*str) {
+ /*
+ * there is no shorter __unambiguous__ way
+ * (presumably a shorter numeric escape could be misread
+ * together with digits following it - TODO confirm)
+ */
+ *p++ = '\\';
+ *p++ = '0';
+ *p++ = '0';
+ *p++ = '0';
+ } else if (strchr("^.*+?(|[-]{\\$", *str)) {
+ *p++ = '\\';
+ *p++ = *str;
+ } else {
+ *p++ = *str;
+ }
+
+ str++;
+ len--;
+ }
+ *p = '\0';
+
+ return escaped;
+}
+
typedef enum {
TECO_SEARCH_STATE_START,
TECO_SEARCH_STATE_CTL,
@@ -230,7 +277,7 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
return g_strdup("");
gsize len = codepage == SC_CP_UTF8
? g_utf8_next_char(pattern->data) - pattern->data : 1;
- gchar *escaped = g_regex_escape_string(pattern->data, len);
+ gchar *escaped = teco_regex_escape(pattern->data, len);
pattern->data += len;
pattern->len -= len;
return escaped;
@@ -325,7 +372,7 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
pattern->data += len;
pattern->len -= len;
*state = TECO_SEARCH_STATE_START;
- return g_regex_escape_string(str.data ? : "", str.len);
+ return teco_regex_escape(str.data ? : "", str.len);
}
break;
}
@@ -378,14 +425,29 @@ teco_pattern2regexp(teco_string_t *pattern, teco_machine_qregspec_t *qreg_machin
g_auto(teco_string_t) re = {NULL, 0};
do {
- /*
- * Previous character was caret.
- * Make sure it is handled like a control character.
- * This is necessary even though we have string building activated,
- * to support constructs like ^Q^Q (typed with carets) in order to
- * quote pattern matching characters.
- */
if (state == TECO_SEARCH_STATE_CTL) {
+ if (*pattern->data == '~') {
+ /* rest of pattern is a regular expression */
+ teco_string_append(&re, pattern->data+1, pattern->len-1);
+ /*
+ * FIXME: In terex, it actually could contain null bytes.
+ */
+ if (teco_string_contains(re, '\0')) {
+ g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED,
+ "Regular expression must not contain null-byte"
+ " - use \\0 instead");
+ return NULL;
+ }
+ return g_steal_pointer(&re.data) ? : g_strdup("");
+ }
+
+ /*
+ * Previous character was caret.
+ * Make sure it is handled like a control character.
+ * This is necessary even though we have string building activated,
+ * to support constructs like ^Q^Q (typed with carets) in order to
+ * quote pattern matching characters.
+ */
*pattern->data = TECO_CTL_KEY(g_ascii_toupper(*pattern->data));
state = TECO_SEARCH_STATE_START;
}
@@ -444,7 +506,7 @@ teco_pattern2regexp(teco_string_t *pattern, teco_machine_qregspec_t *qreg_machin
gsize len = codepage == SC_CP_UTF8
? g_utf8_next_char(pattern->data) - pattern->data : 1;
/* the allocation could theoretically be avoided by escaping char-wise */
- g_autofree gchar *escaped = g_regex_escape_string(pattern->data, len);
+ g_autofree gchar *escaped = teco_regex_escape(pattern->data, len);
teco_string_append(&re, escaped, strlen(escaped));
pattern->data += len;
pattern->len -= len;
@@ -736,8 +798,8 @@ teco_do_search_backwards(regex_t *re, gsize from, gsize to, gint *count, GError
gsize to_block = to-from;
while (to_block > 0) {
- gsize from_block = teco_search_block_size > 0
- ? MAX(0, to_block - teco_search_block_size) : 0;
+ gsize from_block = teco_search_block_size > 0 && to_block >= teco_search_block_size
+ ? to_block - teco_search_block_size : 0;
/* how many bytes have been consumed in the current block */
gsize offset = 0;