the SciTECO parser is Unicode-based now (refs #5)

The following rules apply: * All SciTECO macros __must__ be in valid UTF-8, regardless of the register's configured encoding. This is checked before execution, so we can use glib's non-validating UTF-8 API afterwards. * Things will inevitably get slower as we have to validate all macros first and convert to gunichar for each and every character passed into the parser. As an optimization, it may make sense to have our own inlineable version of g_utf8_get_char() (TODO). Also, Unicode glyphs in syntactically significant positions may be case-folded - just like ASCII chars were. This is of course slower than case folding ASCII. The impact of this should be measured and perhaps we should restrict case folding to a-z via teco_ascii_toupper(). * The language itself does not use any non-ANSI characters, so you don't have to use UTF-8 characters. * Wherever the parser expects a single character, it will now accept an arbitrary Unicode/UTF-8 glyph as well. In other words, you can call macros like M§ instead of having to write M[§]. You can also get the codepoint of any Unicode character with ^^x. Pressing a Unicode character in the start state or in Ex and Fx will now give a sane error message. * When pressing a key which produces a multi-byte UTF-8 sequence, the character gets translated back and forth multiple times: 1. It's converted to a UTF-8 string, either buffered or by IME methods (Gtk). On Curses we could directly get a wide char using wget_wch(), but it's not currently used, so we don't depend on widechar curses. 2. Parsed into gunichar for passing into the edit command callbacks. This also validates the codepoint - everything later on can assume valid codepoints and valid UTF-8 strings. 3. Once the edit command handling decides to insert the key into the command line, it is serialized back into a UTF-8 string as the command line macro has to be in UTF-8 (like all other macros). 4. 
The parser reads back gunichars without validation for passing into the parser callbacks. * Flickering in the Curses UI and Pango warnings in Gtk, due to incompletely inserted and displayed UTF-8 sequences, are now fixed.
author: Robin Haberkorn <robin.haberkorn@googlemail.com> 2024-09-11 12:21:42 +0200
committer: Robin Haberkorn <robin.haberkorn@googlemail.com> 2024-09-11 16:14:27 +0200
commit: 68578072bfaf6054a96bb6bcedfccb6e56a508fe (patch)
tree: b7916f665e77c698d2d0fda7cb9f3ac4356f502b
parent: adc067ba745cebf2e2a2f9523bc14136ca1d2680 (diff)
download: sciteco-68578072bfaf6054a96bb6bcedfccb6e56a508fe.tar.gz
29 files changed, 325 insertions, 202 deletions
diff --git a/doc/sciteco.7.template b/doc/sciteco.7.template
index a5b7f4a..d0574d7 100644
--- a/doc/sciteco.7.template
+++ b/doc/sciteco.7.template
@@ -87,10 +87,6 @@ regular commands for command-line editing.
 .
 When the user presses a key or key-combination it is first translated
 to an UTF-8 string.
-All immediate editing commands and regular \*(ST commands however operate on
-a language based solely on
-.B ASCII
-codes, which is a subset of Unicode.
 The rules for translating keys are as follows:
 .RS
 .IP 1. 4
@@ -138,6 +134,18 @@ This feature is called function key macros and explained in the
 next subsection.
 .RE
 .
+.LP
+All immediate editing commands and regular \*(ST commands however operate on
+a language based solely on
+.B ASCII
+codes, which is a subset of Unicode.
+\# This is because we cannot assume the presence of any particular non-ANSI
+\# symbol on a user's keyboard.
+Since the \*(ST parser is Unicode-aware, this does not exclude
+using Unicode glyphs wherever a single character is expected,
+ie. \fB^^\fIx\fR and \fBU\fIq\fR works with arbitrary Unicode glyphs.
+All \*(ST macros must be in valid UTF-8.
+.
 .SS Function Key Macros
 .
 .SCITECO_TOPIC "function key"
@@ -1082,8 +1090,8 @@ Consequently when querying the code at a character position
 or inserting characters by code, the code may be an Unicode
 codepoint instead of byte-sized integer.
 .LP
-Currently, \*(ST supports UTF-8 and single-byte ANSI encodings,
-that can also be used for editing raw binary files.
+Currently, \*(ST supports buffers in UTF-8 and single-byte
+ANSI encodings, that can also be used for editing raw binary files.
 \# You can configure other single-byte code pages with EE,
 \# but there isn't yet any way to insert characters.
 UTF-8 is the default codepage for new buffers and Q-Registers
diff --git a/src/cmdline.c b/src/cmdline.c
index 47ef86f..be7a5b1 100644
--- a/src/cmdline.c
+++ b/src/cmdline.c
@@ -194,7 +194,7 @@ teco_cmdline_rubin(GError **error)
 }
 
 gboolean
-teco_cmdline_keypress_c(gchar key, GError **error)
+teco_cmdline_keypress_wc(gunichar key, GError **error)
 {
 	teco_machine_t *machine = &teco_cmdline.machine.parent;
 	g_autoptr(GError) tmp_error = NULL;
@@ -283,6 +283,30 @@ teco_cmdline_keypress_c(gchar key, GError **error)
 	return TRUE;
 }
 
+/*
+ * FIXME: If one character causes an error, we should rub out the
+ * entire string.
+ * Usually it will be called only with single keys (strings containing
+ * single codepoints), but especially teco_cmdline_fnmacro() can emulate
+ * many key presses at once.
+ */
+gboolean
+teco_cmdline_keypress(const gchar *str, gsize len, GError **error)
+{
+	for (guint i = 0; i < len; i += g_utf8_next_char(str+i) - (str+i)) {
+		gunichar chr = g_utf8_get_char_validated(str+i, len-i);
+		if ((gint32)chr < 0) {
+			g_set_error_literal(error, TECO_ERROR, TECO_ERROR_CODEPOINT,
+			                    "Invalid UTF-8 sequence");
+			return FALSE;
+		}
+		if (!teco_cmdline_keypress_wc(chr, error))
+			return FALSE;
+	}
+
+	return TRUE;
+}
+
 gboolean
 teco_cmdline_fnmacro(const gchar *name, GError **error)
 {
@@ -361,7 +385,7 @@ teco_cmdline_cleanup(void)
  */
 
 gboolean
-teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
 {
 	switch (key) {
 	case '\n': /* insert EOL sequence */
@@ -431,23 +455,30 @@ teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gch
 	}
 
 	teco_interface_popup_clear();
-	return teco_cmdline_insert(&key, sizeof(key), error);
+
+	gchar buf[6];
+	gsize len = g_unichar_to_utf8(key, buf);
+	return teco_cmdline_insert(buf, len, error);
 }
 
 gboolean
-teco_state_caseinsensitive_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_caseinsensitive_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
 {
+	/*
+	 * Auto case folding is for syntactic characters,
+	 * so this could be done by working only with a-z and A-Z.
+	 * However, it's also not speed critical.
+	 */
 	if (teco_ed & TECO_ED_AUTOCASEFOLD)
-		/* will not modify non-letter keys */
-		key = g_ascii_islower(key) ? g_ascii_toupper(key)
-		                           : g_ascii_tolower(key);
+		key = g_unichar_islower(key) ? g_unichar_toupper(key)
+		                             : g_unichar_tolower(key);
 
 	return teco_state_process_edit_cmd(ctx, parent_ctx, key, error);
 }
 
 gboolean
 teco_state_stringbuilding_start_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx,
-                                                 gchar key, GError **error)
+                                                 gunichar key, GError **error)
 {
 	teco_state_t *current = ctx->parent.current;
 
@@ -597,7 +628,7 @@ teco_state_stringbuilding_start_process_edit_cmd(teco_machine_stringbuilding_t *
 
 gboolean
 teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx,
-                                                gchar chr, GError **error)
+                                                gunichar chr, GError **error)
 {
 	g_assert(ctx->machine_qregspec != NULL);
 	/* We downcast since teco_machine_qregspec_t is private in qreg.c */
@@ -606,7 +637,7 @@ teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuilding_t *c
 }
 
 gboolean
-teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
 {
 	teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
 	teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -614,7 +645,7 @@ teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_
 }
 
 gboolean
-teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
 {
 	teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
 	teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -650,7 +681,7 @@ teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *par
 }
 
 gboolean
-teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
 {
 	teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
 	teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -720,8 +751,8 @@ teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t
 		gboolean unambiguous = teco_file_auto_complete(ctx->expectstring.string.data, G_FILE_TEST_EXISTS, &new_chars);
 		teco_machine_stringbuilding_escape(stringbuilding_ctx, new_chars.data, new_chars.len, &new_chars_escaped);
 		if (unambiguous && ctx->expectstring.nesting == 1)
-			teco_string_append_c(&new_chars_escaped,
-			                     ctx->expectstring.machine.escape_char == '{' ? '}' : ctx->expectstring.machine.escape_char);
+			teco_string_append_wc(&new_chars_escaped,
+			                      ctx->expectstring.machine.escape_char == '{' ? '}' : ctx->expectstring.machine.escape_char);
 
 		return teco_cmdline_insert(new_chars_escaped.data, new_chars_escaped.len, error);
 	}
@@ -731,7 +762,7 @@ teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t
 }
 
 gboolean
-teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
 {
 	teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
 	teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -773,7 +804,7 @@ teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *
 }
 
 gboolean
-teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
 {
 	g_assert(ctx->expectqreg != NULL);
 	/*
@@ -785,7 +816,7 @@ teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t
 }
 
 gboolean
-teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
 {
 	switch (key) {
 	case '\t': { /* autocomplete Q-Register name */
@@ -820,7 +851,7 @@ teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_
 }
 
 gboolean
-teco_state_qregspec_string_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_qregspec_string_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
 {
 	teco_machine_stringbuilding_t *stringbuilding_ctx = teco_machine_qregspec_get_stringbuilding(ctx);
 	teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -860,7 +891,7 @@ teco_state_qregspec_string_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_m
 }
 
 gboolean
-teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
 {
 	teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
 	teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -905,7 +936,7 @@ teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *pa
 }
 
 gboolean
-teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
 {
 	teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
 	teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -950,7 +981,7 @@ teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_mac
 }
 
 gboolean
-teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
 {
 	teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
 	teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -997,7 +1028,7 @@ teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *paren
 }
 
 gboolean
-teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
 {
 	teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
 	teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -1028,8 +1059,8 @@ teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *paren
 		gboolean unambiguous = teco_help_auto_complete(ctx->expectstring.string.data, &new_chars);
 		teco_machine_stringbuilding_escape(stringbuilding_ctx, new_chars.data, new_chars.len, &new_chars_escaped);
 		if (unambiguous && ctx->expectstring.nesting == 1)
-			teco_string_append_c(&new_chars_escaped,
-			                     ctx->expectstring.machine.escape_char == '{' ? '}' : ctx->expectstring.machine.escape_char);
+			teco_string_append_wc(&new_chars_escaped,
+			                      ctx->expectstring.machine.escape_char == '{' ? '}' : ctx->expectstring.machine.escape_char);
 
 		return new_chars_escaped.len ? teco_cmdline_insert(new_chars_escaped.data, new_chars_escaped.len, error) : TRUE;
 	}
diff --git a/src/cmdline.h b/src/cmdline.h
index 7f40b5f..78d101c 100644
--- a/src/cmdline.h
+++ b/src/cmdline.h
@@ -64,16 +64,8 @@ gboolean teco_cmdline_insert(const gchar *data, gsize len, GError **error);
 
 gboolean teco_cmdline_rubin(GError **error);
 
-gboolean teco_cmdline_keypress_c(gchar key, GError **error);
-
-static inline gboolean
-teco_cmdline_keypress(const gchar *str, gsize len, GError **error)
-{
-	for (guint i = 0; i < len; i++)
-		if (!teco_cmdline_keypress_c(str[i], error))
-			return FALSE;
-	return TRUE;
-}
+gboolean teco_cmdline_keypress_wc(gunichar key, GError **error);
+gboolean teco_cmdline_keypress(const gchar *str, gsize len, GError **error);
 
 gboolean teco_cmdline_fnmacro(const gchar *name, GError **error);
 
diff --git a/src/core-commands.c b/src/core-commands.c
index 3686624..ef763d5 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -45,7 +45,7 @@
 #include "goto-commands.h"
 #include "core-commands.h"
 
-static teco_state_t *teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error);
+static teco_state_t *teco_state_control_input(teco_machine_main_t *ctx, gunichar chr, GError **error);
 
 /*
  * NOTE: This needs some extra code in teco_state_start_input().
@@ -1049,7 +1049,7 @@ teco_state_start_get(teco_machine_main_t *ctx, GError **error)
 }
 
 static teco_state_t *
-teco_state_start_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_start_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	static teco_machine_main_transition_t transitions[] = {
 		/*
@@ -1388,7 +1388,7 @@ teco_state_fcommand_cond_else(teco_machine_main_t *ctx, GError **error)
 }
 
 static teco_state_t *
-teco_state_fcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_fcommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	static teco_machine_main_transition_t transitions[] = {
 		/*
@@ -1512,7 +1512,7 @@ teco_state_changedir_done(teco_machine_main_t *ctx, const teco_string_t *str, GE
 TECO_DEFINE_STATE_EXPECTDIR(teco_state_changedir);
 
 static teco_state_t *
-teco_state_condcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_condcommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	teco_int_t value = 0;
 	gboolean result = TRUE;
@@ -1800,7 +1800,7 @@ teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error)
 }
 
 static teco_state_t *
-teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_control_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	static teco_machine_main_transition_t transitions[] = {
 		/*
@@ -1841,10 +1841,10 @@ teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
 TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_control);
 
 static teco_state_t *
-teco_state_ascii_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_ascii_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	if (ctx->mode == TECO_MODE_NORMAL)
-		teco_expressions_push((guchar)chr);
+		teco_expressions_push(chr);
 
 	return &teco_state_start;
 }
@@ -1877,7 +1877,7 @@ TECO_DEFINE_STATE(teco_state_ascii);
  * only be seen when executing the following command.
  */
 static teco_state_t *
-teco_state_escape_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_escape_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	/*$ ^[^[ ^[$ $$ terminate return
 	 * [a1,a2,...]$$ -- Terminate command line or return from macro
@@ -2700,7 +2700,7 @@ teco_state_ecommand_exit(teco_machine_main_t *ctx, GError **error)
 }
 
 static teco_state_t *
-teco_state_ecommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_ecommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	static teco_machine_main_transition_t transitions[] = {
 		/*
@@ -2874,10 +2874,9 @@ teco_state_insert_indent_initial(teco_machine_main_t *ctx, GError **error)
 		len -= teco_interface_ssm(SCI_GETCOLUMN,
 		                          teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0), 0) % len;
 
-		gchar spaces[len];
-
-		memset(spaces, ' ', sizeof(spaces));
-		teco_interface_ssm(SCI_ADDTEXT, sizeof(spaces), (sptr_t)spaces);
+		gchar space = ' ';
+		while (len-- > 0)
+			teco_interface_ssm(SCI_ADDTEXT, 1, (sptr_t)&space);
 	}
 	teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0);
 	teco_ring_dirtify();
diff --git a/src/core-commands.h b/src/core-commands.h
index 370c7ba..e30770d 100644
--- a/src/core-commands.h
+++ b/src/core-commands.h
@@ -43,7 +43,7 @@ gboolean teco_state_insert_process(teco_machine_main_t *ctx, const teco_string_t
                                    gsize new_chars, GError **error);
 
 /* in cmdline.c */
-gboolean teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error);
+gboolean teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error);
 
 /**
  * @class TECO_DEFINE_STATE_INSERT
diff --git a/src/error.h b/src/error.h
index f60be1a..7543d02 100644
--- a/src/error.h
+++ b/src/error.h
@@ -61,10 +61,10 @@ typedef enum {
 } teco_error_t;
 
 static inline void
-teco_error_syntax_set(GError **error, gchar chr)
+teco_error_syntax_set(GError **error, gunichar chr)
 {
 	g_set_error(error, TECO_ERROR, TECO_ERROR_SYNTAX,
-	            "Syntax error \"%c\" (%d)", chr, chr);
+	            "Syntax error \"%C\" (U+%04" G_GINT32_MODIFIER "X)", chr, chr);
 }
 
 static inline void
diff --git a/src/expressions.c b/src/expressions.c
index ef785e0..1ba8706 100644
--- a/src/expressions.c
+++ b/src/expressions.c
@@ -114,10 +114,11 @@ teco_expressions_pop_num_calc(teco_int_t *ret, teco_int_t imply, GError **error)
 }
 
 void
-teco_expressions_add_digit(gchar digit)
+teco_expressions_add_digit(gunichar digit)
 {
 	teco_int_t n = teco_expressions_args() > 0 ? teco_expressions_pop_num(0) : 0;
 
+	/* use g_unichar_digit_value()? */
 	teco_expressions_push(n*teco_radix + (n < 0 ? -1 : 1)*(digit - '0'));
 }
 
diff --git a/src/expressions.h b/src/expressions.h
index 24c5eff..68d8ddb 100644
--- a/src/expressions.h
+++ b/src/expressions.h
@@ -123,7 +123,7 @@ teco_int_t teco_expressions_peek_num(guint index);
 teco_int_t teco_expressions_pop_num(guint index);
 gboolean teco_expressions_pop_num_calc(teco_int_t *ret, teco_int_t imply, GError **error);
 
-void teco_expressions_add_digit(gchar digit);
+void teco_expressions_add_digit(gunichar digit);
 
 void teco_expressions_push_op(teco_operator_t op);
 gboolean teco_expressions_push_calc(teco_operator_t op, GError **error);
diff --git a/src/goto-commands.c b/src/goto-commands.c
index 2326f64..bf80c0b 100644
--- a/src/goto-commands.c
+++ b/src/goto-commands.c
@@ -53,7 +53,7 @@ teco_state_label_initial(teco_machine_main_t *ctx, GError **error)
  * I'm unsure whether !-signs should be allowed within comments.
  */
 static teco_state_t *
-teco_state_label_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_label_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	if (chr == '!') {
 		/*
@@ -85,7 +85,7 @@ teco_state_label_input(teco_machine_main_t *ctx, gchar chr, GError **error)
 
 	if (ctx->parent.must_undo)
 		undo__teco_string_truncate(&ctx->goto_label, ctx->goto_label.len);
-	teco_string_append_c(&ctx->goto_label, chr);
+	teco_string_append_wc(&ctx->goto_label, chr);
 	return &teco_state_label;
 }
 
@@ -138,7 +138,7 @@ teco_state_goto_done(teco_machine_main_t *ctx, const teco_string_t *str, GError
 }
 
 /* in cmdline.c */
-gboolean teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error);
+gboolean teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error);
 
 /*$ O
  * Olabel$ -- Go to label
diff --git a/src/help.c b/src/help.c
index 8364496..9ee7239 100644
--- a/src/help.c
+++ b/src/help.c
@@ -314,7 +314,7 @@ teco_state_help_done(teco_machine_main_t *ctx, const teco_string_t *str, GError
 }
 
 /* in cmdline.c */
-gboolean teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error);
+gboolean teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error);
 
 /*$ "?" help
  * ?[topic]$ -- Get help for topic
diff --git a/src/interface-curses/interface.c b/src/interface-curses/interface.c
index 443a903..96254a9 100644
--- a/src/interface-curses/interface.c
+++ b/src/interface-curses/interface.c
@@ -1582,6 +1582,9 @@ teco_interface_blocking_getch(void)
 void
 teco_interface_event_loop_iter(void)
 {
+	static gchar keybuf[4];
+	static gint keybuf_i = 0;
+
 	gint key = g_queue_is_empty(teco_interface.input_queue)
 			? teco_interface_blocking_getch()
 			: GPOINTER_TO_INT(g_queue_pop_head(teco_interface.input_queue));
@@ -1610,14 +1613,14 @@ teco_interface_event_loop_iter(void)
 		 * backspace.
 		 * In SciTECO backspace is normalized to ^H.
 		 */
-		if (!teco_cmdline_keypress_c(TECO_CTL_KEY('H'),
-		                             &teco_interface.event_loop_error))
+		if (!teco_cmdline_keypress_wc(TECO_CTL_KEY('H'),
+		                              &teco_interface.event_loop_error))
 			return;
 		break;
 	case KEY_ENTER:
 	case '\r':
 	case '\n':
-		if (!teco_cmdline_keypress_c('\n', &teco_interface.event_loop_error))
+		if (!teco_cmdline_keypress_wc('\n', &teco_interface.event_loop_error))
 			return;
 		break;
 
@@ -1658,8 +1661,19 @@ teco_interface_event_loop_iter(void)
 	 * Control keys and keys with printable representation
 	 */
 	default:
-		if (key <= 0xFF &&
-		    !teco_cmdline_keypress_c(key, &teco_interface.event_loop_error))
+		if (key > 0xFF)
+			return;
+
+		/*
+		 * NOTE: There's also wget_wch(), but it requires
+		 * a widechar version of Curses.
+		 */
+		keybuf[keybuf_i++] = key;
+		gunichar cp = g_utf8_get_char_validated(keybuf, keybuf_i);
+		if (keybuf_i >= sizeof(keybuf) || cp != (gunichar)-2)
+			keybuf_i = 0;
+		if ((gint32)cp < 0 ||
+		    !teco_cmdline_keypress_wc(cp, &teco_interface.event_loop_error))
 			return;
 	}
 
diff --git a/src/interface-gtk/interface.c b/src/interface-gtk/interface.c
index 2ad8335..9c1ce6a 100644
--- a/src/interface-gtk/interface.c
+++ b/src/interface-gtk/interface.c
@@ -927,19 +927,19 @@ teco_interface_handle_key_press(GdkEventKey *event, GError **error)
 
 	switch (event->keyval) {
 	case GDK_KEY_Escape:
-		if (!teco_cmdline_keypress_c('\e', error))
+		if (!teco_cmdline_keypress_wc('\e', error))
 			return FALSE;
 		break;
 	case GDK_KEY_BackSpace:
-		if (!teco_cmdline_keypress_c(TECO_CTL_KEY('H'), error))
+		if (!teco_cmdline_keypress_wc(TECO_CTL_KEY('H'), error))
 			return FALSE;
 		break;
 	case GDK_KEY_Tab:
-		if (!teco_cmdline_keypress_c('\t', error))
+		if (!teco_cmdline_keypress_wc('\t', error))
 			return FALSE;
 		break;
 	case GDK_KEY_Return:
-		if (!teco_cmdline_keypress_c('\n', error))
+		if (!teco_cmdline_keypress_wc('\n', error))
 			return FALSE;
 		break;
 
@@ -994,7 +994,7 @@ teco_interface_handle_key_press(GdkEventKey *event, GError **error)
 		if ((event->state & (GDK_CONTROL_MASK | GDK_MOD1_MASK)) == GDK_CONTROL_MASK) {
 			gchar c = teco_interface_get_ansi_key(event);
 			if (c) {
-				if (!teco_cmdline_keypress_c(TECO_CTL_KEY(g_ascii_toupper(c)), error))
+				if (!teco_cmdline_keypress_wc(TECO_CTL_KEY(g_ascii_toupper(c)), error))
 					return FALSE;
 				break;
 			}
diff --git a/src/parser.c b/src/parser.c
index ed21740..321803a 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -59,7 +59,7 @@ teco_loop_stack_cleanup(void)
 }
 
 gboolean
-teco_machine_input(teco_machine_t *ctx, gchar chr, GError **error)
+teco_machine_input(teco_machine_t *ctx, gunichar chr, GError **error)
 {
 	teco_state_t *next = ctx->current->input_cb(ctx, chr, error);
 	if (!next)
@@ -86,10 +86,20 @@ teco_state_end_of_macro(teco_machine_t *ctx, GError **error)
 }
 
 /**
+ * Execute macro from current PC to stop position.
+ *
  * Handles all expected exceptions and preparing them for stack frame insertion.
+ *
+ * @param ctx State machine.
+ * @param macro The macro to execute.
+ *   It does not have to be complete.
+ *   It must consist only of validated UTF-8 sequences, though.
+ * @param stop_pos Where to stop execution in bytes.
+ * @param error Location to store error.
+ * @return FALSE if an error occurred.
  */
 gboolean
-teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gint stop_pos, GError **error)
+teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gsize stop_pos, GError **error)
 {
 	while (ctx->macro_pc < stop_pos) {
 #ifdef DEBUG
@@ -110,9 +120,13 @@ teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gint stop_p
 		if (!teco_memory_check(0, error))
 			goto error_attach;
 
-		if (!teco_machine_input(&ctx->parent, macro[ctx->macro_pc], error))
+		/* UTF-8 sequences are already validated */
+		gunichar chr = g_utf8_get_char(macro+ctx->macro_pc);
+
+		if (!teco_machine_input(&ctx->parent, chr, error))
 			goto error_attach;
-		ctx->macro_pc++;
+
+		ctx->macro_pc = g_utf8_next_char(macro+ctx->macro_pc) - macro;
 	}
 
 	/*
@@ -145,6 +159,20 @@ teco_execute_macro(const gchar *macro, gsize macro_len,
                    teco_qreg_table_t *qreg_table_locals, GError **error)
 {
 	/*
+	 * Validate UTF-8, but accept null bytes.
+	 * NOTE: there is g_utf8_validate_len() in Glib 2.60
+	 */
+	const gchar *p = macro;
+	while (!g_utf8_validate(p, macro_len - (p - macro), &p) && !*p)
+		p++;
+	if (p - macro < macro_len) {
+		g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT,
+		            "Invalid UTF-8 byte sequence at %" G_GSIZE_FORMAT,
+		            p - macro);
+		return FALSE;
+	}
+
+	/*
 	 * This is not auto-cleaned up, so it can be initialized
 	 * on demand.
 	 */
@@ -309,26 +337,26 @@ teco_machine_main_eval_colon(teco_machine_main_t *ctx)
 teco_state_t *
 teco_machine_main_transition_input(teco_machine_main_t *ctx,
                                    teco_machine_main_transition_t *transitions,
-                                   guint len, gchar chr, GError **error)
+                                   guint len, gunichar chr, GError **error)
 {
-	if (chr < 0 || chr >= len || !transitions[(guint)chr].next) {
+	if (chr >= len || !transitions[chr].next) {
 		teco_error_syntax_set(error, chr);
 		return NULL;
 	}
 
-	if (ctx->mode == TECO_MODE_NORMAL && transitions[(guint)chr].transition_cb) {
+	if (ctx->mode == TECO_MODE_NORMAL && transitions[chr].transition_cb) {
 		/*
 		 * NOTE: We could also just let transition_cb return a boolean...
 		 */
 		GError *tmp_error = NULL;
-		transitions[(guint)chr].transition_cb(ctx, &tmp_error);
+		transitions[chr].transition_cb(ctx, &tmp_error);
 		if (tmp_error) {
 			g_propagate_error(error, tmp_error);
 			return NULL;
 		}
 	}
 
-	return transitions[(guint)chr].next;
+	return transitions[chr].next;
 }
 
 void
@@ -342,11 +370,11 @@ teco_machine_main_clear(teco_machine_main_t *ctx)
  * FIXME: All teco_state_stringbuilding_* states could be static?
  */
 static teco_state_t *teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx,
-                                                         gchar chr, GError **error);
+                                                         gunichar chr, GError **error);
 TECO_DECLARE_STATE(teco_state_stringbuilding_ctl);
 
 static teco_state_t *teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx,
-                                                             gchar chr, GError **error);
+                                                             gunichar chr, GError **error);
 TECO_DECLARE_STATE(teco_state_stringbuilding_escaped);
 
 TECO_DECLARE_STATE(teco_state_stringbuilding_lower);
@@ -360,7 +388,7 @@ TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_quote);
 TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_n);
 
 static teco_state_t *
-teco_state_stringbuilding_start_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_start_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
 {
 	if (chr == '^')
 		return &teco_state_stringbuilding_ctl;
@@ -372,7 +400,7 @@ teco_state_stringbuilding_start_input(teco_machine_stringbuilding_t *ctx, gchar
 
 /* in cmdline.c */
 gboolean teco_state_stringbuilding_start_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx,
-                                                          gchar key, GError **error);
+                                                          gunichar key, GError **error);
 
 TECO_DEFINE_STATE(teco_state_stringbuilding_start,
 		.is_start = TRUE,
@@ -381,7 +409,7 @@ TECO_DEFINE_STATE(teco_state_stringbuilding_start,
 );
 
 static teco_state_t *
-teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
 {
 	chr = teco_ascii_toupper(chr);
 
@@ -396,40 +424,50 @@ teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gchar ch
 		chr = TECO_CTL_KEY(chr);
 	}
 
+	/*
+	 * Source code is always in UTF-8, so it does not
+	 * make sense to handle ctx->codepage != SC_CP_UTF8
+	 * separately.
+	 */
 	if (ctx->result)
-		teco_string_append_c(ctx->result, chr);
+		teco_string_append_wc(ctx->result, chr);
 	return &teco_state_stringbuilding_start;
 }
 
 TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_stringbuilding_ctl);
 
 static teco_state_t *
-teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
 {
 	if (!ctx->result)
 		/* parse-only mode */
 		return &teco_state_stringbuilding_start;
 
-	/* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */
+	/*
+	 * The subtle difference between UTF-8 and single-byte targets
+	 * is that we don't try to casefold non-ANSI characters in single-byte mode.
+	 */
 	switch (ctx->mode) {
 	case TECO_STRINGBUILDING_MODE_UPPER:
-		chr = g_ascii_toupper(chr);
+		chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80
+					? g_unichar_toupper(chr) : chr;
 		break;
 	case TECO_STRINGBUILDING_MODE_LOWER:
-		chr = g_ascii_tolower(chr);
+		chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80
+					? g_unichar_tolower(chr) : chr;
 		break;
 	default:
 		break;
 	}
 
-	teco_string_append_c(ctx->result, chr);
+	teco_string_append_wc(ctx->result, chr);
 	return &teco_state_stringbuilding_start;
 }
 
 TECO_DEFINE_STATE(teco_state_stringbuilding_escaped);
 
 static teco_state_t *
-teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
 {
 	if (!ctx->result)
 		/* parse-only mode */
@@ -443,8 +481,9 @@ teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar
 			teco_undo_guint(ctx->mode);
 		ctx->mode = TECO_STRINGBUILDING_MODE_LOWER;
 	} else {
-		/* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */
-		teco_string_append_c(ctx->result, g_ascii_tolower(chr));
+		chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80
+					? g_unichar_tolower(chr) : chr;
+		teco_string_append_wc(ctx->result, chr);
 	}
 
 	return &teco_state_stringbuilding_start;
@@ -453,7 +492,7 @@ teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar
 TECO_DEFINE_STATE(teco_state_stringbuilding_lower);
 
 static teco_state_t *
-teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
 {
 	if (!ctx->result)
 		/* parse-only mode */
@@ -467,8 +506,9 @@ teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar
 			teco_undo_guint(ctx->mode);
 		ctx->mode = TECO_STRINGBUILDING_MODE_UPPER;
 	} else {
-		/* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */
-		teco_string_append_c(ctx->result, g_ascii_toupper(chr));
+		chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80
+					? g_unichar_toupper(chr) : chr;
+		teco_string_append_wc(ctx->result, chr);
 	}
 
 	return &teco_state_stringbuilding_start;
@@ -477,7 +517,7 @@ teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar
 TECO_DEFINE_STATE(teco_state_stringbuilding_upper);
 
 static teco_state_t *
-teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
 {
 	teco_state_t *next;
 
@@ -489,8 +529,9 @@ teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gchar c
 	case 'N':  next = &teco_state_stringbuilding_ctle_n; break;
 	default:
 		if (ctx->result) {
-			gchar buf[] = {TECO_CTL_KEY('E'), chr};
-			teco_string_append(ctx->result, buf, sizeof(buf));
+			gchar buf[1+6] = {TECO_CTL_KEY('E')};
+			gsize len = g_unichar_to_utf8(chr, buf+1);
+			teco_string_append(ctx->result, buf, 1+len);
 		}
 		return &teco_state_stringbuilding_start;
 	}
@@ -508,7 +549,7 @@ TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_stringbuilding_ctle);
 
 /* in cmdline.c */
 gboolean teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx,
-                                                         gchar chr, GError **error);
+                                                         gunichar chr, GError **error);
 
 /**
  * @interface TECO_DEFINE_STATE_STRINGBUILDING_QREG
@@ -523,7 +564,7 @@ gboolean teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuil
 	)
 
 static teco_state_t *
-teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
 {
 	teco_qreg_t *qreg;
 
@@ -558,7 +599,7 @@ teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gch
 TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_num);
 
 static teco_state_t *
-teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
 {
 	teco_qreg_t *qreg;
 
@@ -583,10 +624,7 @@ teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar
 	if (ctx->codepage == SC_CP_UTF8) {
 		if (value < 0 || !g_unichar_validate(value))
 			goto error_codepoint;
-		/* 4 bytes should be enough, but we better follow the documentation */
-		gchar buf[6];
-		gsize len = g_unichar_to_utf8(value, buf);
-		teco_string_append(ctx->result, buf, len);
+		teco_string_append_wc(ctx->result, value);
 	} else {
 		if (value < 0 || value > 0xFF)
 			goto error_codepoint;
@@ -606,7 +644,7 @@ error_codepoint: {
 TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_u);
 
 static teco_state_t *
-teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
 {
 	teco_qreg_t *qreg;
 
@@ -637,7 +675,7 @@ teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gchar
 TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_q);
 
 static teco_state_t *
-teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
 {
 	teco_qreg_t *qreg;
 	teco_qreg_table_t *table;
@@ -680,7 +718,7 @@ teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, g
 TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_quote);
 
 static teco_state_t *
-teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
 {
 	teco_qreg_t *qreg;
 	teco_qreg_table_t *table;
@@ -717,7 +755,7 @@ teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gchar
 TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_n);
 
 void
-teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escape_char,
+teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gunichar escape_char,
                                  teco_qreg_table_t *locals, gboolean must_undo)
 {
 	memset(ctx, 0, sizeof(*ctx));
@@ -738,6 +776,10 @@ teco_machine_stringbuilding_reset(teco_machine_stringbuilding_t *ctx)
 	ctx->mode = TECO_STRINGBUILDING_MODE_NORMAL;
 }
 
+/*
+ * If we case folded only ANSI characters as in teco_ascii_toupper(),
+ * this could be simplified.
+ */
 void
 teco_machine_stringbuilding_escape(teco_machine_stringbuilding_t *ctx, const gchar *str, gsize len,
                                    teco_string_t *target)
@@ -745,12 +787,18 @@ teco_machine_stringbuilding_escape(teco_machine_stringbuilding_t *ctx, const gch
 	target->data = g_malloc(len*2+1);
 	target->len = 0;
 
-	for (guint i = 0; i < len; i++) {
-		if (teco_ascii_toupper(str[i]) == ctx->escape_char ||
-		    (ctx->escape_char == '[' && str[i] == ']') ||
-		    (ctx->escape_char == '{' && str[i] == '}'))
+	for (gsize i = 0; i < len; ) {
+		gunichar chr = g_utf8_get_char(str+i);
+
+		if (g_unichar_toupper(chr) == ctx->escape_char ||
+		    (ctx->escape_char == '[' && chr == ']') ||
+		    (ctx->escape_char == '{' && chr == '}'))
 			target->data[target->len++] = TECO_CTL_KEY('Q');
-		target->data[target->len++] = str[i];
+		/* clamp: a truncated/invalid trailing sequence must not overrun str or target */
+		gsize lenc = MIN((gsize)(g_utf8_next_char(str+i) - (str+i)), len-i);
+		memcpy(target->data+target->len, str+i, lenc);
+		target->len += lenc;
+		i += lenc;
 	}
 
 	target->data[target->len] = '\0';
@@ -772,7 +820,7 @@ teco_state_expectstring_initial(teco_machine_main_t *ctx, GError **error)
 }
 
 teco_state_t *
-teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_expectstring_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	teco_state_t *current = ctx->parent.current;
 
@@ -789,13 +837,18 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro
 		/*
 		 * FIXME: Exclude setting at least whitespace characters as the
 		 * new string escape character to avoid accidental errors?
+		 *
+		 * FIXME: Should we perhaps restrict case folding escape characters
+		 * to the ANSI range (teco_ascii_toupper())?
+		 * This would be faster than case folding each and every character
+		 * of a string argument to check against the escape char.
 		 */
 		switch (ctx->expectstring.machine.escape_char) {
 		case '\e':
 		case '{':
 			if (ctx->parent.must_undo)
-				teco_undo_gchar(ctx->expectstring.machine.escape_char);
-			ctx->expectstring.machine.escape_char = teco_ascii_toupper(chr);
+				teco_undo_gunichar(ctx->expectstring.machine.escape_char);
+			ctx->expectstring.machine.escape_char = g_unichar_toupper(chr);
 			return current;
 		}
 	}
@@ -819,7 +872,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro
 				ctx->expectstring.nesting--;
 				break;
 			}
-		} else if (teco_ascii_toupper(chr) == ctx->expectstring.machine.escape_char) {
+		} else if (g_unichar_toupper(chr) == ctx->expectstring.machine.escape_char) {
 			if (ctx->parent.must_undo)
 				teco_undo_gint(ctx->expectstring.nesting);
 			ctx->expectstring.nesting--;
@@ -849,7 +902,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro
 
 		if (current->expectstring.last) {
 			if (ctx->parent.must_undo)
-				teco_undo_gchar(ctx->expectstring.machine.escape_char);
+				teco_undo_gunichar(ctx->expectstring.machine.escape_char);
 			ctx->expectstring.machine.escape_char = '\e';
 		}
 		ctx->expectstring.nesting = 1;
@@ -880,7 +933,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro
 		if (!teco_machine_stringbuilding_input(&ctx->expectstring.machine, chr, str, error))
 			return NULL;
 	} else if (ctx->mode == TECO_MODE_NORMAL) {
-		teco_string_append_c(&ctx->expectstring.string, chr);
+		teco_string_append_wc(&ctx->expectstring.string, chr);
 	}
 
 	/*
@@ -924,7 +977,7 @@ teco_state_expectfile_process(teco_machine_main_t *ctx, const teco_string_t *str
 	g_assert(str->data != NULL);
 
 	/*
-	 * Null-chars must not ocur in filename/path strings and at some point
+	 * Null-chars must not occur in filename/path strings and at some point
 	 * teco_string_t has to be converted to a null-terminated C string
 	 * as all the glib filename functions rely on null-terminated strings.
 	 * Doing it here ensures that teco_file_expand_path() can be safely called
diff --git a/src/parser.h b/src/parser.h
index 09ec483..ae2cb9b 100644
--- a/src/parser.h
+++ b/src/parser.h
@@ -101,11 +101,11 @@ typedef const struct {
 } teco_state_expectqreg_t;
 
 typedef gboolean (*teco_state_initial_cb_t)(teco_machine_t *ctx, GError **error);
-typedef teco_state_t *(*teco_state_input_cb_t)(teco_machine_t *ctx, gchar chr, GError **error);
+typedef teco_state_t *(*teco_state_input_cb_t)(teco_machine_t *ctx, gunichar chr, GError **error);
 typedef gboolean (*teco_state_refresh_cb_t)(teco_machine_t *ctx, GError **error);
 typedef gboolean (*teco_state_end_of_macro_cb_t)(teco_machine_t *ctx, GError **error);
 typedef gboolean (*teco_state_process_edit_cmd_cb_t)(teco_machine_t *ctx, teco_machine_t *parent_ctx,
-                                                     gchar key, GError **error);
+                                                     gunichar key, GError **error);
 
 typedef enum {
 	TECO_FNMACRO_MASK_START		= (1 << 0),
@@ -225,7 +225,7 @@ struct teco_state_t {
 gboolean teco_state_end_of_macro(teco_machine_t *ctx, GError **error);
 
 /* in cmdline.c */
-gboolean teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error);
+gboolean teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error);
 
 /**
  * @interface TECO_DEFINE_STATE
@@ -254,7 +254,7 @@ gboolean teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent
 	extern teco_state_t NAME
 
 /* in cmdline.c */
-gboolean teco_state_caseinsensitive_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error);
+gboolean teco_state_caseinsensitive_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error);
 
 /**
  * @interface TECO_DEFINE_STATE_CASEINSENSITIVE
@@ -308,7 +308,7 @@ teco_machine_reset(teco_machine_t *ctx, teco_state_t *initial)
 		teco_undo_ptr(ctx->current) = initial;
 }
 
-gboolean teco_machine_input(teco_machine_t *ctx, gchar chr, GError **error);
+gboolean teco_machine_input(teco_machine_t *ctx, gunichar chr, GError **error);
 
 typedef enum {
 	TECO_STRINGBUILDING_MODE_NORMAL = 0,
@@ -336,7 +336,7 @@ typedef struct teco_machine_stringbuilding_t {
 	 * If this is `[` or `{`, it is assumed that `]` and `}` must
 	 * be escaped as well by teco_machine_stringbuilding_escape().
 	 */
-	gchar escape_char;
+	gunichar escape_char;
 
 	/**
 	 * Q-Register table for local registers.
@@ -366,7 +366,7 @@ typedef struct teco_machine_stringbuilding_t {
 	guint codepage;
 } teco_machine_stringbuilding_t;
 
-void teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escape_char,
+void teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gunichar escape_char,
                                       teco_qreg_table_t *locals, gboolean must_undo);
 
 void teco_machine_stringbuilding_reset(teco_machine_stringbuilding_t *ctx);
@@ -381,7 +381,7 @@ void teco_machine_stringbuilding_reset(teco_machine_stringbuilding_t *ctx);
  * @return FALSE in case of error.
  */
 static inline gboolean
-teco_machine_stringbuilding_input(teco_machine_stringbuilding_t *ctx, gchar chr,
+teco_machine_stringbuilding_input(teco_machine_stringbuilding_t *ctx, gunichar chr,
                                   teco_string_t *result, GError **error)
 {
 	ctx->result = result;
@@ -497,7 +497,7 @@ void teco_machine_main_init(teco_machine_main_t *ctx,
 gboolean teco_machine_main_eval_colon(teco_machine_main_t *ctx);
 
 gboolean teco_machine_main_step(teco_machine_main_t *ctx,
-                                const gchar *macro, gint stop_pos, GError **error);
+                                const gchar *macro, gsize stop_pos, GError **error);
 
 gboolean teco_execute_macro(const gchar *macro, gsize macro_len,
                             teco_qreg_table_t *qreg_table_locals, GError **error);
@@ -516,18 +516,18 @@ typedef const struct {
  */
 teco_state_t *teco_machine_main_transition_input(teco_machine_main_t *ctx,
                                                  teco_machine_main_transition_t *transitions,
-                                                 guint len, gchar chr, GError **error);
+                                                 guint len, gunichar chr, GError **error);
 
 void teco_machine_main_clear(teco_machine_main_t *ctx);
 
 G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(teco_machine_main_t, teco_machine_main_clear);
 
 gboolean teco_state_expectstring_initial(teco_machine_main_t *ctx, GError **error);
-teco_state_t *teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **error);
+teco_state_t *teco_state_expectstring_input(teco_machine_main_t *ctx, gunichar chr, GError **error);
 gboolean teco_state_expectstring_refresh(teco_machine_main_t *ctx, GError **error);
 
 /* in cmdline.c */
-gboolean teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error);
+gboolean teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error);
 
 /**
  * @interface TECO_DEFINE_STATE_EXPECTSTRING
@@ -543,7 +543,7 @@ gboolean teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco
  */
 #define TECO_DEFINE_STATE_EXPECTSTRING(NAME, ...) \
 	static teco_state_t * \
-	NAME##_input(teco_machine_main_t *ctx, gchar chr, GError **error) \
+	NAME##_input(teco_machine_main_t *ctx, gunichar chr, GError **error) \
 	{ \
 		return teco_state_expectstring_input(ctx, chr, error); \
 	} \
@@ -564,7 +564,7 @@ gboolean teco_state_expectfile_process(teco_machine_main_t *ctx, const teco_stri
                                        gsize new_chars, GError **error);
 
 /* in cmdline.c */
-gboolean teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error);
+gboolean teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error);
 
 /**
  * @interface TECO_DEFINE_STATE_EXPECTFILE
@@ -580,7 +580,7 @@ gboolean teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_m
 	)
 
 /* in cmdline.c */
-gboolean teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error);
+gboolean teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error);
 
 /**
  * @interface TECO_DEFINE_STATE_EXPECTDIR
diff --git a/src/qreg-commands.c b/src/qreg-commands.c
index f248ced..8d28e7d 100644
--- a/src/qreg-commands.c
+++ b/src/qreg-commands.c
@@ -50,7 +50,7 @@ teco_state_expectqreg_initial(teco_machine_main_t *ctx, GError **error)
 }
 
 teco_state_t *
-teco_state_expectqreg_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_expectqreg_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	teco_state_t *current = ctx->parent.current;
 
@@ -680,6 +680,10 @@ teco_state_macro_got_register(teco_machine_main_t *ctx, teco_qreg_t *qreg,
  * Note that the string of <q> will be copied upon macro execution,
  * so subsequent changes to Q-Register <q> from inside the macro do
  * not modify the executed code.
+ *
+ * While \fBM\fP does not check the register's configured encoding
+ * (as reported by \fBEE\fP), its contents must be valid UTF-8,
+ * which is verified before execution.
  */
 TECO_DEFINE_STATE_EXPECTQREG(teco_state_macro);
 
@@ -714,6 +718,9 @@ teco_state_macrofile_done(teco_machine_main_t *ctx, const teco_string_t *str, GE
  * It is otherwise similar to the \(lqM\(rq command.
  *
  * If <file> could not be read, the command yields an error.
+ *
+ * Like all \*(ST code, the contents of <file> must be in valid UTF-8
+ * even if operating in the \(lqdefault ANSI\(rq mode as configured by \fBED\fP.
  */
 TECO_DEFINE_STATE_EXPECTFILE(teco_state_macrofile);
 
diff --git a/src/qreg-commands.h b/src/qreg-commands.h
index b190e9f..27a6a5c 100644
--- a/src/qreg-commands.h
+++ b/src/qreg-commands.h
@@ -33,10 +33,10 @@ teco_state_expectqreg_reset(teco_machine_main_t *ctx)
 
 gboolean teco_state_expectqreg_initial(teco_machine_main_t *ctx, GError **error);
 
-teco_state_t *teco_state_expectqreg_input(teco_machine_main_t *ctx, gchar chr, GError **error);
+teco_state_t *teco_state_expectqreg_input(teco_machine_main_t *ctx, gunichar chr, GError **error);
 
 /* in cmdline.c */
-gboolean teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error);
+gboolean teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error);
 
 /**
  * @interface TECO_DEFINE_STATE_EXPECTQREG
@@ -47,7 +47,7 @@ gboolean teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_m
  */
 #define TECO_DEFINE_STATE_EXPECTQREG(NAME, ...) \
 	static teco_state_t * \
-	NAME##_input(teco_machine_main_t *ctx, gchar chr, GError **error) \
+	NAME##_input(teco_machine_main_t *ctx, gunichar chr, GError **error) \
 	{ \
 		return teco_state_expectqreg_input(ctx, chr, error); \
 	} \
diff --git a/src/qreg.c b/src/qreg.c
index fb559af..cac2d12 100644
--- a/src/qreg.c
+++ b/src/qreg.c
@@ -84,10 +84,9 @@ teco_qreg_execute(teco_qreg_t *qreg, teco_qreg_table_t *qreg_table_locals, GErro
 	g_auto(teco_string_t) macro = {NULL, 0};
 
 	/*
-	 * FIXME: Once we have an Unicode-aware parser,
-	 * we should probably check the encoding of the register.
-	 * On the other hand, we will have to validate the
-	 * UTF-8 codepoints before execution anyway.
+	 * SciTECO macros must be in UTF-8, but we don't check the register's
+	 * configured encoding, so as not to complicate TECO_ED_DEFAULT_ANSI mode.
+	 * The UTF-8 byte sequences are validated before execution anyway.
 	 */
 	if (!qreg->vtable->get_string(qreg, &macro.data, &macro.len, NULL, error) ||
 	    !teco_execute_macro(macro.data, macro.len, qreg_table_locals, error)) {
@@ -1220,7 +1219,7 @@ TECO_DECLARE_STATE(teco_state_qregspec_secondchar);
 TECO_DECLARE_STATE(teco_state_qregspec_string);
 
 static teco_state_t *teco_state_qregspec_start_global_input(teco_machine_qregspec_t *ctx,
-                                                            gchar chr, GError **error);
+                                                            gunichar chr, GError **error);
 
 static teco_state_t *
 teco_state_qregspec_done(teco_machine_qregspec_t *ctx, GError **error)
@@ -1255,7 +1254,7 @@ teco_state_qregspec_done(teco_machine_qregspec_t *ctx, GError **error)
 }
 
 static teco_state_t *
-teco_state_qregspec_start_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error)
+teco_state_qregspec_start_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error)
 {
 	/*
 	 * FIXME: We're using teco_state_qregspec_start as a success condition,
@@ -1272,7 +1271,7 @@ teco_state_qregspec_start_input(teco_machine_qregspec_t *ctx, gchar chr, GError
 }
 
 /* in cmdline.c */
-gboolean teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error);
+gboolean teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error);
 
 TECO_DEFINE_STATE(teco_state_qregspec_start,
 	.is_start = TRUE,
@@ -1280,7 +1279,7 @@ TECO_DEFINE_STATE(teco_state_qregspec_start,
 );
 
 static teco_state_t *
-teco_state_qregspec_start_global_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error)
+teco_state_qregspec_start_global_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error)
 {
 	/*
 	 * FIXME: Disallow space characters?
@@ -1299,8 +1298,7 @@ teco_state_qregspec_start_global_input(teco_machine_qregspec_t *ctx, gchar chr,
 	if (!ctx->parse_only) {
 		if (ctx->parent.must_undo)
 			undo__teco_string_truncate(&ctx->name, ctx->name.len);
-		/* FIXME: g_unicode_toupper() once we have an Unicode-conforming parser */
-		teco_string_append_c(&ctx->name, g_ascii_toupper(chr));
+		teco_string_append_wc(&ctx->name, g_unichar_toupper(chr));
 	}
 	return teco_state_qregspec_done(ctx, error);
 }
@@ -1316,7 +1314,7 @@ TECO_DEFINE_STATE(teco_state_qregspec_start_global,
 );
 
 static teco_state_t *
-teco_state_qregspec_firstchar_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error)
+teco_state_qregspec_firstchar_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error)
 {
 	/*
 	 * FIXME: Disallow space characters?
@@ -1324,8 +1322,7 @@ teco_state_qregspec_firstchar_input(teco_machine_qregspec_t *ctx, gchar chr, GEr
 	if (!ctx->parse_only) {
 		if (ctx->parent.must_undo)
 			undo__teco_string_truncate(&ctx->name, ctx->name.len);
-		/* FIXME: g_unicode_toupper() once we have an Unicode-conforming parser */
-		teco_string_append_c(&ctx->name, g_ascii_toupper(chr));
+		teco_string_append_wc(&ctx->name, g_unichar_toupper(chr));
 	}
 	return &teco_state_qregspec_secondchar;
 }
@@ -1335,7 +1332,7 @@ TECO_DEFINE_STATE(teco_state_qregspec_firstchar,
 );
 
 static teco_state_t *
-teco_state_qregspec_secondchar_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error)
+teco_state_qregspec_secondchar_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error)
 {
 	/*
 	 * FIXME: Disallow space characters?
@@ -1343,8 +1340,7 @@ teco_state_qregspec_secondchar_input(teco_machine_qregspec_t *ctx, gchar chr, GE
 	if (!ctx->parse_only) {
 		if (ctx->parent.must_undo)
 			undo__teco_string_truncate(&ctx->name, ctx->name.len);
-		/* FIXME: g_unicode_toupper() once we have an Unicode-conforming parser */
-		teco_string_append_c(&ctx->name, g_ascii_toupper(chr));
+		teco_string_append_wc(&ctx->name, g_unichar_toupper(chr));
 	}
 	return teco_state_qregspec_done(ctx, error);
 }
@@ -1354,7 +1350,7 @@ TECO_DEFINE_STATE(teco_state_qregspec_secondchar,
 );
 
 static teco_state_t *
-teco_state_qregspec_string_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error)
+teco_state_qregspec_string_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error)
 {
 	/*
 	 * Makes sure that braces within string building constructs do not have to be
@@ -1395,7 +1391,7 @@ teco_state_qregspec_string_input(teco_machine_qregspec_t *ctx, gchar chr, GError
 
 /* in cmdline.c */
 gboolean teco_state_qregspec_string_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx,
-                                                     gchar key, GError **error);
+                                                     gunichar key, GError **error);
 
 TECO_DEFINE_STATE(teco_state_qregspec_string,
 	.process_edit_cmd_cb = (teco_state_process_edit_cmd_cb_t)teco_state_qregspec_string_process_edit_cmd
@@ -1456,7 +1452,7 @@ teco_machine_qregspec_get_stringbuilding(teco_machine_qregspec_t *ctx)
  * @memberof teco_machine_qregspec_t
  */
 teco_machine_qregspec_status_t
-teco_machine_qregspec_input(teco_machine_qregspec_t *ctx, gchar chr,
+teco_machine_qregspec_input(teco_machine_qregspec_t *ctx, gunichar chr,
                             teco_qreg_t **result, teco_qreg_table_t **result_table, GError **error)
 {
 	ctx->parse_only = result == NULL;
@@ -1484,7 +1480,7 @@ teco_machine_qregspec_get_results(teco_machine_qregspec_t *ctx,
 gboolean
 teco_machine_qregspec_auto_complete(teco_machine_qregspec_t *ctx, teco_string_t *insert)
 {
-	gsize restrict_len = 0;
+	guint restrict_len = 0;
 
 	/*
 	 * NOTE: We could have separate process_edit_cmd_cb() for
@@ -1499,6 +1495,10 @@ teco_machine_qregspec_auto_complete(teco_machine_qregspec_t *ctx, teco_string_t
 		/* two-letter Q-Reg */
 		restrict_len = 2;
 
+	/*
+	 * FIXME: This is not quite right as it will propose even
+	 * lower case single or two-letter Q-Register names.
+	 */
 	return teco_rb3str_auto_complete(&ctx->result_table->tree, !restrict_len,
 	                                 ctx->name.data, ctx->name.len, restrict_len, insert) &&
 	       ctx->nesting == 1;
diff --git a/src/qreg.h b/src/qreg.h
index 8c8764e..df4bdb4 100644
--- a/src/qreg.h
+++ b/src/qreg.h
@@ -227,7 +227,7 @@ void teco_machine_qregspec_reset(teco_machine_qregspec_t *ctx);
  */
 struct teco_machine_stringbuilding_t *teco_machine_qregspec_get_stringbuilding(teco_machine_qregspec_t *ctx);
 
-teco_machine_qregspec_status_t teco_machine_qregspec_input(teco_machine_qregspec_t *ctx, gchar chr,
+teco_machine_qregspec_status_t teco_machine_qregspec_input(teco_machine_qregspec_t *ctx, gunichar chr,
                                                            teco_qreg_t **result,
                                                            teco_qreg_table_t **result_table, GError **error);
 
diff --git a/src/rb3str.c b/src/rb3str.c
index 72cf444..d51ac5d 100644
--- a/src/rb3str.c
+++ b/src/rb3str.c
@@ -95,7 +95,7 @@ teco_rb3str_nfind(teco_rb3str_tree_t *tree, gboolean case_sensitive, const gchar
  * @param case_sensitive Whether to match case-sensitive.
  * @param str String to complete (not necessarily null-terminated).
  * @param str_len Length of characters in `str`.
- * @param restrict_len Limit completions to this size.
+ * @param restrict_len Limit completions to this size (in characters).
  * @param insert String to set with characters that can be autocompleted.
  * @return TRUE if the completion was unambiguous, else FALSE.
  *
@@ -103,7 +103,7 @@ teco_rb3str_nfind(teco_rb3str_tree_t *tree, gboolean case_sensitive, const gchar
  */
 gboolean
 teco_rb3str_auto_complete(teco_rb3str_tree_t *tree, gboolean case_sensitive,
-                          const gchar *str, gsize str_len, gsize restrict_len, teco_string_t *insert)
+                          const gchar *str, gsize str_len, guint restrict_len, teco_string_t *insert)
 {
 	memset(insert, 0, sizeof(*insert));
 
@@ -115,7 +115,7 @@ teco_rb3str_auto_complete(teco_rb3str_tree_t *tree, gboolean case_sensitive,
 	for (teco_rb3str_head_t *cur = teco_rb3str_nfind(tree, case_sensitive, str, str_len);
 	     cur && cur->key.len >= str_len && diff(&cur->key, str, str_len) == str_len;
 	     cur = teco_rb3str_get_next(cur)) {
-		if (restrict_len && cur->key.len != restrict_len)
+		if (restrict_len && g_utf8_strlen(cur->key.data, cur->key.len) != restrict_len)
 			continue;
 
 		if (G_UNLIKELY(!first)) {
@@ -136,7 +136,7 @@ teco_rb3str_auto_complete(teco_rb3str_tree_t *tree, gboolean case_sensitive,
 		for (teco_rb3str_head_t *cur = first;
 		     cur && cur->key.len >= str_len && diff(&cur->key, str, str_len) == str_len;
 		     cur = teco_rb3str_get_next(cur)) {
-			if (restrict_len && cur->key.len != restrict_len)
+			if (restrict_len && g_utf8_strlen(cur->key.data, cur->key.len) != restrict_len)
 				continue;
 
 			teco_interface_popup_add(TECO_POPUP_PLAIN,
diff --git a/src/rb3str.h b/src/rb3str.h
index 74b3a37..adf5f89 100644
--- a/src/rb3str.h
+++ b/src/rb3str.h
@@ -65,5 +65,5 @@ teco_rb3str_head_t *teco_rb3str_nfind(teco_rb3str_tree_t *tree, gboolean case_se
                                       const gchar *str, gsize len);
 
 gboolean teco_rb3str_auto_complete(teco_rb3str_tree_t *tree, gboolean case_sensitive,
-                                   const gchar *str, gsize str_len, gsize restrict_len,
+                                   const gchar *str, gsize str_len, guint restrict_len,
                                    teco_string_t *insert);
diff --git a/src/sciteco.h b/src/sciteco.h
index 09dea3b..02eed97 100644
--- a/src/sciteco.h
+++ b/src/sciteco.h
@@ -71,7 +71,7 @@ teco_is_failure(teco_bool_t x)
 #endif
 
 /** TRUE if C is a control character */
-#define TECO_IS_CTL(C)		((guchar)(C) < ' ')
+#define TECO_IS_CTL(C)		((gunichar)(C) < ' ')
 /** ASCII character to echo control character C */
 #define TECO_CTL_ECHO(C)	((C) | 0x40)
 /**
diff --git a/src/search.c b/src/search.c
index e146def..43a2936 100644
--- a/src/search.c
+++ b/src/search.c
@@ -308,14 +308,6 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error
 
 	do {
 		/*
-		 * FIXME: Currently we are fed single bytes, so there
-		 * could be an incomplete UTF-8 sequence at the end of the pattern.
-		 * This should not be necessary once we have an Unicode-aware parser.
-		 */
-		if (pattern->len > 0 && (gint32)g_utf8_get_char_validated(pattern->data, -1) < 0)
-			break;
-
-		/*
 		 * First check whether it is a class.
 		 * This will not treat individual characters
 		 * as classes, so we do not convert them to regexp
diff --git a/src/spawn.c b/src/spawn.c
index 044b8de..445acc5 100644
--- a/src/spawn.c
+++ b/src/spawn.c
@@ -417,7 +417,7 @@ cleanup:
 }
 
 /* in cmdline.c */
-gboolean teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error);
+gboolean teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error);
 
 /*$ EC pipe filter
  * ECcommand$ -- Execute operating system command and filter buffer contents
@@ -642,7 +642,7 @@ teco_spawn_stdin_watch_cb(GIOChannel *chan, GIOCondition condition, gpointer dat
 	gssize bytes_written = teco_eol_writer_convert(&teco_spawn_ctx.stdin_writer, buffer,
 	                                               convert_len, &teco_spawn_ctx.error);
 	if (bytes_written < 0) {
-		/* GError ocurred */
+		/* GError occurred */
 		g_main_loop_quit(teco_spawn_ctx.mainloop);
 		return G_SOURCE_REMOVE;
 	}
diff --git a/src/string-utils.c b/src/string-utils.c
index ac5835b..d9b12e0 100644
--- a/src/string-utils.c
+++ b/src/string-utils.c
@@ -78,7 +78,17 @@ teco_string_get_coord(const gchar *str, guint pos, guint *line, guint *column)
 	}
 }
 
-/** @memberof teco_string_t */
+/**
+ * Get the length of the prefix common to two strings.
+ * Works with UTF-8 and single-byte encodings.
+ *
+ * @param a Left string.
+ * @param b Right string.
+ * @param b_len Length of right string.
+ * @return Length of the common prefix in bytes.
+ *
+ * @memberof teco_string_t
+ */
 gsize
 teco_string_diff(const teco_string_t *a, const gchar *b, gsize b_len)
 {
@@ -92,14 +102,16 @@ teco_string_diff(const teco_string_t *a, const gchar *b, gsize b_len)
 }
 
 /**
- * Get the length of the prefix common to two strings
+ * Get the length of the prefix common to two UTF-8 strings
  * without considering case.
  *
- * @fixme This is currently only used for symbols and one/two letter
- * Q-Register names, which cannot be UTF-8.
- * If we rewrote this to perform Unicode case folding, we would
- * also have to check for character validity.
- * Once our parser is Unicode-aware, this is not necessary.
+ * The UTF-8 strings must be validated, which should be the case
+ * for help labels and short Q-Register names.
+ *
+ * @param a Left UTF-8 string.
+ * @param b Right UTF-8 string.
+ * @param b_len Length of right UTF-8 string.
+ * @return Length of the common prefix in bytes.
  *
  * @memberof teco_string_t
  */
@@ -108,9 +120,20 @@ teco_string_casediff(const teco_string_t *a, const gchar *b, gsize b_len)
 {
 	gsize len = 0;
 
-	while (len < a->len && len < b_len &&
-	       g_ascii_tolower(a->data[len]) == g_ascii_tolower(b[len]))
-		len++;
+	/*
+	 * Both strings are walked with their own byte offset since case
+	 * variants of a character may differ in UTF-8 length
+	 * (e.g. U+212A KELVIN SIGN vs. "k").
+	 * `len` is the offset into `b`, so it can be compared with `b_len`.
+	 */
+	for (gsize a_len = 0; a_len < a->len && len < b_len; ) {
+		gunichar a_chr = g_utf8_get_char(a->data+a_len);
+		gunichar b_chr = g_utf8_get_char(b+len);
+		if (g_unichar_tolower(a_chr) != g_unichar_tolower(b_chr))
+			break;
+		a_len = g_utf8_next_char(a->data+a_len) - a->data;
+		len = g_utf8_next_char(b+len) - b;
+	}
 
 	return len;
 }
diff --git a/src/string-utils.h b/src/string-utils.h
index bb9ed37..1b4957f 100644
--- a/src/string-utils.h
+++ b/src/string-utils.h
@@ -26,11 +26,11 @@
 /**
  * Upper-case SciTECO command character.
  *
- * There are implementations in glib (g_ascii_toupper) and libc,
+ * There are implementations in glib (g_ascii_toupper() and g_unichar_toupper()) and libc,
  * but this implementation is sufficient for all letters used by SciTECO commands.
  */
-static inline gchar
-teco_ascii_toupper(gchar chr)
+static inline gunichar
+teco_ascii_toupper(gunichar chr)
 {
 	return chr >= 'a' && chr <= 'z' ? chr & ~0x20 : chr;
 }
@@ -52,6 +52,7 @@ teco_strv_remove(gchar **strv, guint i)
  * and the allocation length is not stored.
  * Just like GString, teco_string_t are always null-terminated but at the
  * same time 8-bit clean (can contain null-characters).
+ * It may or may not contain UTF-8 byte sequences.
  *
  * The API is designed such that teco_string_t operations operate on plain
  * (null-terminated) C strings, a single character or character array as well as
@@ -74,7 +75,7 @@ typedef struct {
 	 * The pointer is guaranteed to be non-NULL after initialization.
 	 */
 	gchar *data;
-	/** Length of `data` without the trailing null-byte. */
+	/** Length of `data` in bytes, without the trailing null-byte. */
 	gsize len;
 } teco_string_t;
 
@@ -128,6 +129,28 @@ teco_string_append_c(teco_string_t *str, gchar chr)
 	teco_string_append(str, &chr, sizeof(chr));
 }
 
+/**
+ * Append a single Unicode codepoint, serialized as UTF-8.
+ *
+ * @param target String to append to.
+ * @param chr Codepoint to append.
+ *
+ * @memberof teco_string_t
+ */
+static inline void
+teco_string_append_wc(teco_string_t *target, gunichar chr)
+{
+	/*
+	 * g_unichar_to_utf8() is documented to require up to 6 bytes,
+	 * plus one byte for the trailing null.
+	 */
+	gchar *buf = g_realloc(target->data, target->len + 6 + 1);
+
+	target->data = buf;
+	target->len += g_unichar_to_utf8(chr, buf + target->len);
+	buf[target->len] = '\0';
+}
+
 /**
  * @fixme Should this also realloc str->data?
  *
diff --git a/src/symbols.c b/src/symbols.c
index ba407cc..feead76 100644
--- a/src/symbols.c
+++ b/src/symbols.c
@@ -251,7 +251,7 @@ teco_state_scintilla_symbols_done(teco_machine_main_t *ctx, const teco_string_t
 }
 
 /* in cmdline.c */
-gboolean teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error);
+gboolean teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error);
 
 /*$ ES scintilla message
  * -- Send Scintilla message
diff --git a/src/undo.c b/src/undo.c
index dfae63b..dc54c7a 100644
--- a/src/undo.c
+++ b/src/undo.c
@@ -30,7 +30,7 @@
 
 //#define DEBUG
 
-TECO_DEFINE_UNDO_SCALAR(gchar);
+TECO_DEFINE_UNDO_SCALAR(gunichar);
 TECO_DEFINE_UNDO_SCALAR(gint);
 TECO_DEFINE_UNDO_SCALAR(guint);
 TECO_DEFINE_UNDO_SCALAR(gsize);
diff --git a/src/undo.h b/src/undo.h
index ea1414f..9715c7a 100644
--- a/src/undo.h
+++ b/src/undo.h
@@ -164,8 +164,8 @@ gpointer teco_undo_push_size(teco_undo_action_t action_cb, gsize size)
  * significantly improves batch-mode performance.
  */
 
-TECO_DECLARE_UNDO_SCALAR(gchar);
-#define teco_undo_gchar(VAR) (*teco_undo_object_gchar_push(&(VAR)))
+TECO_DECLARE_UNDO_SCALAR(gunichar);
+#define teco_undo_gunichar(VAR) (*teco_undo_object_gunichar_push(&(VAR)))
 
 TECO_DECLARE_UNDO_SCALAR(gint);
 #define teco_undo_gint(VAR) (*teco_undo_object_gint_push(&(VAR)))
diff --git a/tests/testsuite.at b/tests/testsuite.at
index 4749b13..0733d2a 100644
--- a/tests/testsuite.at
+++ b/tests/testsuite.at
@@ -84,8 +84,6 @@ AT_CHECK([$SCITECO -e "0@I//J 0A\"N(0/0)' :@S/^@/\"F(0/0)'"], 0, ignore, ignore)
 AT_CHECK([$SCITECO -e "@EQa//0EE 1U*0EE 0:@EUa/f^@^@/ :Qa-4\"N(0/0)' Ga Z-4\"N(0/0)'"], 0, ignore, ignore)
 AT_CHECK([$SCITECO -e "0EE 129@I// -A-129\"N(0/0)' HXa @EQa// EE\"N(0/0)'"], 0, ignore, ignore)
 AT_CHECK([$SCITECO -8e "129@:^Ua// 0Qa-129\"N(0/0)'"], 0, ignore, ignore)
-# FIXME: This will fail once we have an UTF-8-only parser.
-AT_CHECK([$SCITECO -8e "@:^Ua/^^/ 129:@^Ua// Ma-129\"N(0/0)'"], 0, ignore, ignore)
 AT_CHECK([$SCITECO -e "1EE 167Ua @I/^EUa/ .-1\"N(0/0)'"], 0, ignore, ignore)
 AT_CLEANUP
 
@@ -95,6 +93,8 @@ AT_CHECK([$SCITECO -e "8594@^Ua/Здравствуй, мир!/ :Qa-17\"N(0/0)' 0
 AT_CHECK([$SCITECO -e "@I/Здравствуй, мир!/ JW .-10\"N(0/0)' ^E-20\"N(0/0)' 204:EE .-10\"N(0/0)'"], 0, ignore, ignore)
 AT_CHECK([$SCITECO -e "@I/TEST/ @EW/юникод.txt/"], 0, ignore, ignore)
 AT_CHECK([test -f юникод.txt], 0, ignore, ignore)
+AT_CHECK([$SCITECO -e "^^ß-223\"N(0/0)' 23Uъ Q[Ъ]-23\"N(0/0)'"], 0, ignore, ignore)
+AT_CHECK([$SCITECO -e "@O/метка/ !метка!"], 0, ignore, ignore)
 AT_CLEANUP
 
 AT_SETUP([Automatic EOL normalization])
@@ -207,8 +207,7 @@ AT_CLEANUP
 AT_SETUP([Unicode glitches])
 # While TECO code must always be UTF-8, strings after string building
 # can be in single-byte encodings as well.
-# This might already work after introducing the Unicode-aware parser.
-# If not, it should be fixed.
+# It must be possible to search for single bytes in single-byte encodings.
 AT_CHECK([$SCITECO -8e "164Ua Ga@I//J :@S/^EUa/\"F(0/0)'"], 0, ignore, ignore)
 AT_XFAIL_IF(true)
 AT_CLEANUP
author	Robin Haberkorn <robin.haberkorn@googlemail.com>	2024-09-11 12:21:42 +0200
committer	Robin Haberkorn <robin.haberkorn@googlemail.com>	2024-09-11 16:14:27 +0200
commit	68578072bfaf6054a96bb6bcedfccb6e56a508fe (patch)
tree	b7916f665e77c698d2d0fda7cb9f3ac4356f502b
parent	adc067ba745cebf2e2a2f9523bc14136ca1d2680 (diff)
download	sciteco-68578072bfaf6054a96bb6bcedfccb6e56a508fe.tar.gz