fixed ^EGq (character class) pattern construct for embedded null bytes and `-`

This was using g_regex_escape_string(), which always translates a null byte to `\0`. That is ambiguous if followed by other digits, so a null byte followed by a digit would result in a wrong regular expression. Actually, the same could happen outside of character classes, i.e. `@S/^@1/` was also broken. Also, it does not escape `-`, so the result cannot be used in character classes. This is now fixed by a new custom implementation, teco_regex_escape(). Once we move to a custom terex lexer, we won't need any of this, of course, unless we want to provide a regex-escaping string building construct. We are now completely free of GRegex.
author: Robin Haberkorn <rhaberkorn@fmsbw.de> 2026-06-28 13:44:41 +0200
committer: Robin Haberkorn <rhaberkorn@fmsbw.de> 2026-06-28 13:44:41 +0200
commit: 7bd7bdad687e5f790afda6f0f22444f3a169a6b1 (patch)
tree: 4cafda53c6a7aec2cd49b6b7dd3b2d488e462b0e
parent: 0dfb113b47d958093e6ae086c9695e5be83b24b8 (diff)
2 files changed, 55 insertions, 3 deletions
diff --git a/src/search.c b/src/search.c
index 601cc55..c156589 100644
--- a/src/search.c
+++ b/src/search.c
@@ -166,6 +166,53 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error)
 	return TRUE;
 }
 
+/**
+ * Escape an arbitrary string, so it can be embedded into an
+ * Advanced Regular Expression.
+ *
+ * This is similar to g_regex_escape_string(), but the result can also be
+ * used within character classes and null bytes are escaped unambiguously.
+ *
+ * @param str String to escape (may contain null bytes).
+ * @param len Length of string in bytes.
+ * @return Newly allocated null-terminated regexp. Free with g_free().
+ */
+static gchar *
+teco_regex_escape(const gchar *str, gsize len)
+{
+	/*
+	 * Worst case is 4 bytes per input byte (`\000`) plus the terminator.
+	 * Considering the expected sizes, a minimal size is not worth computing.
+	 */
+	gchar *escaped = g_malloc(len*4+1);
+	gchar *p = escaped;
+
+	/*
+	 * This works in UTF-8 as well since none of the escaped characters
+	 * can be a continuation byte.
+	 */
+	while (len > 0) {
+		if (!*str) {
+			/* `\0` would be ambiguous if followed by a digit */
+			*p++ = '\\';
+			*p++ = '0';
+			*p++ = '0';
+			*p++ = '0';
+		} else if (strchr("^.*+?()[-]{}|\\$", *str)) {
+			/* also covers `|`, `)` and `}` like g_regex_escape_string() */
+			*p++ = '\\';
+			*p++ = *str;
+		} else {
+			*p++ = *str;
+		}
+		str++;
+		len--;
+	}
+	*p = '\0';
+
+	return escaped;
+}
+
 typedef enum {
 	TECO_SEARCH_STATE_START,
 	TECO_SEARCH_STATE_CTL,
@@ -230,7 +277,7 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
 					return g_strdup("");
 				gsize len = codepage == SC_CP_UTF8
 						? g_utf8_next_char(pattern->data) - pattern->data : 1;
-				gchar *escaped = g_regex_escape_string(pattern->data, len);
+				gchar *escaped = teco_regex_escape(pattern->data, len);
 				pattern->data += len;
 				pattern->len -= len;
 				return escaped;
@@ -325,7 +372,7 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
 				pattern->data += len;
 				pattern->len -= len;
 				*state = TECO_SEARCH_STATE_START;
-				return g_regex_escape_string(str.data ? : "", str.len);
+				return teco_regex_escape(str.data ? : "", str.len);
 			}
 			break;
 		}
@@ -444,7 +491,7 @@ teco_pattern2regexp(teco_string_t *pattern, teco_machine_qregspec_t *qreg_machin
 			gsize len = codepage == SC_CP_UTF8
 					? g_utf8_next_char(pattern->data) - pattern->data : 1;
 			/* the allocation could theoretically be avoided by escaping char-wise */
-			g_autofree gchar *escaped = g_regex_escape_string(pattern->data, len);
+			g_autofree gchar *escaped = teco_regex_escape(pattern->data, len);
 			teco_string_append(&re, escaped, strlen(escaped));
 			pattern->data += len;
 			pattern->len -= len;
diff --git a/tests/testsuite.at b/tests/testsuite.at
index a97e0f8..98425d1 100644
--- a/tests/testsuite.at
+++ b/tests/testsuite.at
@@ -515,6 +515,11 @@ AT_SETUP([Search for one of characters in uninitialized Q-Register])
 TE_CHECK([[:@S/^EGa/"S(0/0)']], 0, ignore, ignore)
 AT_CLEANUP
 
+AT_SETUP([Search for class with special characters])
+# Relevant when patterns are internally converted to regular expressions.
+TE_CHECK([[![! @I/^@-]B/J ![! @EUc/^@1]A-C/ ::@S/^EM^EGc/"F(0/0)' ^S+3"N(0/0)']], 0, ignore, ignore)
+AT_CLEANUP
+
 AT_SETUP([Search accesses wrong Q-Register table])
 TE_CHECK([[@^U.#xx/123/ @^Um{:@S/^EG.#xx/$} :Mm Mm]], 1, ignore, ignore)
 AT_CLEANUP
author	Robin Haberkorn <rhaberkorn@fmsbw.de>	2026-06-28 13:44:41 +0200
committer	Robin Haberkorn <rhaberkorn@fmsbw.de>	2026-06-28 13:44:41 +0200
commit	7bd7bdad687e5f790afda6f0f22444f3a169a6b1 (patch)
tree	4cafda53c6a7aec2cd49b6b7dd3b2d488e462b0e
parent	0dfb113b47d958093e6ae086c9695e5be83b24b8 (diff)