From 2a050759ab621b87d0782cc8235907a1757b46cc Mon Sep 17 00:00:00 2001
From: Robin Haberkorn <robin.haberkorn@googlemail.com>
Date: Wed, 11 Sep 2024 14:30:24 +0200
Subject: fixed searches in single-byte encoded documents

* While SciTECO code is guaranteed to be valid UTF-8, the same cannot
  be said of the result of string building.
* The search pattern can end up with invalid UTF-8 byte sequences even
  when searching on UTF-8 buffers, e.g. if ^EQq inserts garbage.
  Previously, this was not checked at all.
* When searching on a raw buffer, it must be possible to
  search for arbitrary bytes (^EUq).
  Since teco_pattern2regexp() was always expecting clean UTF-8 input,
  this would sometimes skip over too many bytes and could even crash.
* Instead, teco_pattern2regexp() now takes the <S> target codepage
  into account.
---
 src/parser.c       | 16 +++++----------
 src/search.c       | 57 ++++++++++++++++++++++++++++++++++++++----------------
 src/string-utils.h | 13 +++++++++++++
 tests/testsuite.at |  9 +--------
 4 files changed, 59 insertions(+), 36 deletions(-)
diff --git a/src/parser.c b/src/parser.c
index 321803a..3c37f81 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -158,17 +158,11 @@ gboolean
 teco_execute_macro(const gchar *macro, gsize macro_len,
                    teco_qreg_table_t *qreg_table_locals, GError **error)
 {
-	/*
-	 * Validate UTF-8, but accept null bytes.
-	 * NOTE: there is g_utf8_validate_len() in Glib 2.60
-	 */
-	const gchar *p = macro;
-	while (!g_utf8_validate(p, macro_len - (p - macro), &p) && !*p)
-		p++;
-	if (p - macro < macro_len) {
-		g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT,
-		            "Invalid UTF-8 byte sequence at %" G_GSIZE_FORMAT,
-		            p - macro);
+	const teco_string_t str = {(gchar *)macro, macro_len};
+
+	if (!teco_string_validate_utf8(&str)) {
+		g_set_error_literal(error, TECO_ERROR, TECO_ERROR_CODEPOINT,
+		                    "Invalid UTF-8 byte sequence in macro");
 		return FALSE;
 	}
 
diff --git a/src/search.c b/src/search.c
index 43a2936..01c598e 100644
--- a/src/search.c
+++ b/src/search.c
@@ -136,6 +136,7 @@ typedef enum {
  *                The pointer is modified and always left after
  *                the last character used, so it may point to the
  *                terminating null byte after the call.
+ * @param codepage The codepage of pattern.
  * @param escape_default Whether to treat single characters
  *                       as classes or not.
  * @param error A GError.
@@ -150,7 +151,7 @@ typedef enum {
  */
 static gchar *
 teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
-                  gboolean escape_default, GError **error)
+                  guint codepage, gboolean escape_default, GError **error)
 {
 	while (pattern->len > 0) {
 		switch (*state) {
@@ -170,7 +171,8 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
 				 */
 				if (!escape_default)
 					return g_strdup("");
-				gsize len = g_utf8_next_char(pattern->data) - pattern->data;
+				gsize len = codepage == SC_CP_UTF8
+						? g_utf8_next_char(pattern->data) - pattern->data : 1;
 				gchar *escaped = g_regex_escape_string(pattern->data, len);
 				pattern->data += len;
 				pattern->len -= len;
@@ -235,16 +237,26 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
 
 		case TECO_SEARCH_STATE_ANYQ: {
 			teco_qreg_t *reg;
-
-			/* FIXME: Once the parser is UTF-8, we need pass a code point here */
+			gsize len;
+			gunichar chr;
+
+			if (codepage == SC_CP_UTF8) {
+				len = g_utf8_next_char(pattern->data) - pattern->data;
+				chr = g_utf8_get_char(pattern->data);
+			} else {
+				len = 1;
+				chr = *pattern->data;
+			}
 			switch (teco_machine_qregspec_input(teco_search_qreg_machine,
-			                                    *pattern->data, &reg, NULL, error)) {
+			                                    chr, &reg, NULL, error)) {
 			case TECO_MACHINE_QREGSPEC_ERROR:
 				return NULL;
 
 			case TECO_MACHINE_QREGSPEC_MORE:
 				/* incomplete, but consume byte */
-				break;
+				pattern->data += len;
+				pattern->len -= len;
+				continue;
 
 			case TECO_MACHINE_QREGSPEC_DONE:
 				teco_machine_qregspec_reset(teco_search_qreg_machine);
@@ -253,8 +265,8 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
 				if (!reg->vtable->get_string(reg, &str.data, &str.len, NULL, error))
 					return NULL;
 
-				pattern->data++;
-				pattern->len--;
+				pattern->data += len;
+				pattern->len -= len;
 				*state = TECO_SEARCH_STATE_START;
 				return g_regex_escape_string(str.data, str.len);
 			}
@@ -289,11 +301,11 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
  * string argument) are currently not reported as errors.
  *
  * @param pattern The pattern to scan through.
- *                It must always be in UTF-8.
  *                Modifies the pointer to point after the last
  *                successfully scanned character, so it can be
  *                called recursively. It may also point to the
  *                terminating null byte after the call.
+ * @param codepage The codepage of pattern.
  * @param single_expr Whether to scan a single pattern
  *                    expression or an arbitrary sequence.
  * @param error A GError.
@@ -301,7 +313,7 @@ teco_class2regexp(teco_search_state_t *state, teco_string_t *pattern,
  *         Must be freed with g_free().
  */
 static gchar *
-teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error)
+teco_pattern2regexp(teco_string_t *pattern, guint codepage, gboolean single_expr, GError **error)
 {
 	teco_search_state_t state = TECO_SEARCH_STATE_START;
 	g_auto(teco_string_t) re = {NULL, 0};
@@ -313,7 +325,7 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error
 		 * as classes, so we do not convert them to regexp
 		 * classes unnecessarily.
 		 */
-		g_autofree gchar *temp = teco_class2regexp(&state, pattern, FALSE, error);
+		g_autofree gchar *temp = teco_class2regexp(&state, pattern, codepage, FALSE, error);
 		if (!temp)
 			return NULL;
 
@@ -338,7 +350,8 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error
 			case TECO_CTL_KEY('X'): teco_string_append_c(&re, '.'); break;
 			case TECO_CTL_KEY('N'): state = TECO_SEARCH_STATE_NOT; break;
 			default: {
-				gsize len = g_utf8_next_char(pattern->data) - pattern->data;
+				gsize len = codepage == SC_CP_UTF8
+						? g_utf8_next_char(pattern->data) - pattern->data : 1;
 				/* the allocation could theoretically be avoided by escaping char-wise */
 				g_autofree gchar *escaped = g_regex_escape_string(pattern->data, len);
 				teco_string_append(&re, escaped, strlen(escaped));
@@ -351,7 +364,7 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error
 
 		case TECO_SEARCH_STATE_NOT: {
 			state = TECO_SEARCH_STATE_START;
-			g_autofree gchar *temp = teco_class2regexp(&state, pattern, TRUE, error);
+			g_autofree gchar *temp = teco_class2regexp(&state, pattern, codepage, TRUE, error);
 			if (!temp)
 				return NULL;
 			if (!*temp)
@@ -387,7 +400,7 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error
 
 		case TECO_SEARCH_STATE_MANY: {
 			/* consume exactly one pattern element */
-			g_autofree gchar *temp = teco_pattern2regexp(pattern, TRUE, error);
+			g_autofree gchar *temp = teco_pattern2regexp(pattern, codepage, TRUE, error);
 			if (!temp)
 				return NULL;
 			if (!*temp)
@@ -413,7 +426,7 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error
 				state = TECO_SEARCH_STATE_START;
 				break;
 			default: {
-				g_autofree gchar *temp = teco_pattern2regexp(pattern, TRUE, error);
+				g_autofree gchar *temp = teco_pattern2regexp(pattern, codepage, TRUE, error);
 				if (!temp)
 					return NULL;
 				if (!*temp)
@@ -551,9 +564,18 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs
 	GRegexCompileFlags flags = G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_DOTALL;
 
 	/* this is set in teco_state_search_initial() */
-	if (ctx->expectstring.machine.codepage != SC_CP_UTF8)
+	if (ctx->expectstring.machine.codepage != SC_CP_UTF8) {
 		/* single byte encoding */
 		flags |= G_REGEX_RAW;
+	} else if (!teco_string_validate_utf8(str)) {
+		/*
+		 * While SciTECO code is always guaranteed to be in valid UTF-8,
+		 * the result of string building may not (eg. if ^EQq inserts garbage).
+		 */
+		g_set_error_literal(error, TECO_ERROR, TECO_ERROR_CODEPOINT,
+		                    "Invalid UTF-8 byte sequence in search pattern");
+		return FALSE;
+	}
 
 	if (teco_current_doc_must_undo())
 		undo__teco_interface_ssm(SCI_SETSEL,
@@ -568,8 +590,9 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs
 
 	g_autoptr(GRegex) re = NULL;
 	teco_string_t pattern = *str;
+	g_autofree gchar *re_pattern;
 	/* NOTE: teco_pattern2regexp() modifies str pointer */
-	g_autofree gchar *re_pattern = teco_pattern2regexp(&pattern, FALSE, error);
+	re_pattern = teco_pattern2regexp(&pattern, ctx->expectstring.machine.codepage, FALSE, error);
 	if (!re_pattern)
 		return FALSE;
 	teco_machine_qregspec_reset(teco_search_qreg_machine);
diff --git a/src/string-utils.h b/src/string-utils.h
index 1b4957f..806140e 100644
--- a/src/string-utils.h
+++ b/src/string-utils.h
@@ -197,6 +197,19 @@ teco_string_rindex(const teco_string_t *str, gchar chr)
 
 const gchar *teco_string_last_occurrence(const teco_string_t *str, const gchar *chars);
 
+/**
+ * Check whether str consists exclusively of valid UTF-8 over its full length; null bytes are accepted.
+ * @note g_utf8_validate() fails at null bytes, so the loop steps over them; GLib 2.60 adds g_utf8_validate_len().
+ */
+static inline gboolean
+teco_string_validate_utf8(const teco_string_t *str)
+{
+	const gchar *p = str->data;
+	while (!g_utf8_validate(p, str->len - (p - str->data), &p) && !*p)
+		p++;
+	return p - str->data == str->len;
+}
+
 /** @memberof teco_string_t */
 static inline void
 teco_string_clear(teco_string_t *str)
diff --git a/tests/testsuite.at b/tests/testsuite.at
index 0733d2a..0c7612a 100644
--- a/tests/testsuite.at
+++ b/tests/testsuite.at
@@ -85,6 +85,7 @@ AT_CHECK([$SCITECO -e "@EQa//0EE 1U*0EE 0:@EUa/f^@^@/ :Qa-4\"N(0/0)' Ga Z-4\"N(0
 AT_CHECK([$SCITECO -e "0EE 129@I// -A-129\"N(0/0)' HXa @EQa// EE\"N(0/0)'"], 0, ignore, ignore)
 AT_CHECK([$SCITECO -8e "129@:^Ua// 0Qa-129\"N(0/0)'"], 0, ignore, ignore)
 AT_CHECK([$SCITECO -e "1EE 167Ua @I/^EUa/ .-1\"N(0/0)'"], 0, ignore, ignore)
+AT_CHECK([$SCITECO -8e "194Ua Qa@I//J :@S/^EUa/\"F(0/0)'"], 0, ignore, ignore)
 AT_CLEANUP
 
 AT_SETUP([Unicode])
@@ -203,11 +204,3 @@ AT_SKIP_IF([case $host in *-*-*bsd* | *-*-darwin*) true;; *) false;; esac])
 AT_CHECK([$SCITECO -e "@^Um{U.a Q.a-100000\"<%.aMm'} 0Mm"], 0, ignore, ignore)
 AT_XFAIL_IF(true)
 AT_CLEANUP
-
-AT_SETUP([Unicode glitches])
-# While TECO code must always be UTF-8, strings after string building
-# can be in single-byte encodings as well.
-# It must be possible to search for single bytes in single-byte encodings.
-AT_CHECK([$SCITECO -8e "164Ua Ga@I//J :@S/^EUa/\"F(0/0)'"], 0, ignore, ignore)
-AT_XFAIL_IF(true)
-AT_CLEANUP
-- 
cgit v1.2.3