1 files changed, 461 insertions, 66 deletions
diff --git a/src/core-commands.c b/src/core-commands.c
index 4d5b378..0cde7e0 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2012-2023 Robin Haberkorn
+ * Copyright (C) 2012-2024 Robin Haberkorn
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -45,7 +45,7 @@
 #include "goto-commands.h"
 #include "core-commands.h"
 
-static teco_state_t *teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error);
+static teco_state_t *teco_state_control_input(teco_machine_main_t *ctx, gunichar chr, GError **error);
 
 /*
  * NOTE: This needs some extra code in teco_state_start_input().
@@ -129,7 +129,8 @@ teco_state_start_dot(teco_machine_main_t *ctx, GError **error)
 {
 	if (!teco_expressions_eval(FALSE, error))
 		return;
-	teco_expressions_push(teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0));
+	sptr_t pos = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
+	teco_expressions_push(teco_interface_bytes2glyphs(pos));
 }
 
 /*$ Z size
@@ -145,7 +146,8 @@ teco_state_start_zed(teco_machine_main_t *ctx, GError **error)
 {
 	if (!teco_expressions_eval(FALSE, error))
 		return;
-	teco_expressions_push(teco_interface_ssm(SCI_GETLENGTH, 0, 0));
+	sptr_t pos = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
+	teco_expressions_push(teco_interface_bytes2glyphs(pos));
 }
 
 /*$ H
@@ -162,10 +164,11 @@ teco_state_start_range(teco_machine_main_t *ctx, GError **error)
 	if (!teco_expressions_eval(FALSE, error))
 		return;
 	teco_expressions_push(0);
-	teco_expressions_push(teco_interface_ssm(SCI_GETLENGTH, 0, 0));
+	sptr_t pos = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
+	teco_expressions_push(teco_interface_bytes2glyphs(pos));
 }
 
-/*$ "\\"
+/*$ \[rs]
  * n\\ -- Insert or read ASCII numbers
  * \\ -> n
  *
@@ -241,6 +244,7 @@ teco_state_start_loop_open(teco_machine_main_t *ctx, GError **error)
 	if (!teco_expressions_eval(FALSE, error) ||
 	    !teco_expressions_pop_num_calc(&lctx.counter, -1, error))
 		return;
+	lctx.brace_level = teco_brace_level;
 	lctx.pass_through = teco_machine_main_eval_colon(ctx);
 
 	if (lctx.counter) {
@@ -280,6 +284,14 @@ teco_state_start_loop_close(teco_machine_main_t *ctx, GError **error)
 
 	teco_loop_context_t *lctx = &g_array_index(teco_loop_stack, teco_loop_context_t,
 	                                           teco_loop_stack->len-1);
+
+	/* only non-pass-through loops increase the brace level */
+	if (teco_brace_level != lctx->brace_level + !lctx->pass_through) {
+		g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED,
+		                    "Brace left open at loop end command");
+		return;
+	}
+
 	gboolean colon_modified = teco_machine_main_eval_colon(ctx);
 
 	/*
@@ -348,7 +360,7 @@ teco_state_start_break(teco_machine_main_t *ctx, GError **error)
 {
 	if (teco_loop_stack->len <= ctx->loop_stack_fp) {
 		g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED,
-		                    "<;> only allowed in iterations");
+		                    "<;> only allowed in loops");
 		return;
 	}
 
@@ -373,7 +385,7 @@ teco_state_start_break(teco_machine_main_t *ctx, GError **error)
 	if (!teco_expressions_discard_args(error))
 		return;
 	if (!lctx.pass_through &&
-	    !teco_expressions_brace_close(error))
+	    !teco_expressions_brace_return(lctx.brace_level, 0, error))
 		return;
 
 	undo__insert_val__teco_loop_stack(teco_loop_stack->len, lctx);
@@ -511,11 +523,12 @@ teco_state_start_jump(teco_machine_main_t *ctx, GError **error)
 	if (!teco_expressions_pop_num_calc(&v, 0, error))
 		return;
 
-	if (teco_validate_pos(v)) {
+	gssize pos = teco_interface_glyphs2bytes(v);
+	if (pos >= 0) {
 		if (teco_current_doc_must_undo())
 			undo__teco_interface_ssm(SCI_GOTOPOS,
 			                         teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0), 0);
-		teco_interface_ssm(SCI_GOTOPOS, v, 0);
+		teco_interface_ssm(SCI_GOTOPOS, pos, 0);
 
 		if (teco_machine_main_eval_colon(ctx))
 			teco_expressions_push(TECO_SUCCESS);
@@ -531,11 +544,11 @@ static teco_bool_t
 teco_move_chars(teco_int_t n)
 {
 	sptr_t pos = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
-
-	if (!teco_validate_pos(pos + n))
+	gssize next_pos = teco_interface_glyphs2bytes_relative(pos, n);
+	if (next_pos < 0)
 		return TECO_FAILURE;
 
-	teco_interface_ssm(SCI_GOTOPOS, pos + n, 0);
+	teco_interface_ssm(SCI_GOTOPOS, next_pos, 0);
 	if (teco_current_doc_must_undo())
 		undo__teco_interface_ssm(SCI_GOTOPOS, pos, 0);
 
@@ -879,7 +892,7 @@ static gboolean
 teco_state_start_kill(teco_machine_main_t *ctx, const gchar *cmd, gboolean by_lines, GError **error)
 {
 	teco_bool_t rc;
-	teco_int_t from, len;
+	gssize from, len; /* in bytes */
 
 	if (!teco_expressions_eval(FALSE, error))
 		return FALSE;
@@ -894,20 +907,24 @@ teco_state_start_kill(teco_machine_main_t *ctx, const gchar *cmd, gboolean by_li
 			len = teco_interface_ssm(SCI_POSITIONFROMLINE, line, 0) - from;
 			rc = teco_bool(teco_validate_line(line));
 		} else {
-			if (!teco_expressions_pop_num_calc(&len, teco_num_sign, error))
+			teco_int_t len_glyphs;
+			if (!teco_expressions_pop_num_calc(&len_glyphs, teco_num_sign, error))
 				return FALSE;
-			rc = teco_bool(teco_validate_pos(from + len));
+			gssize to = teco_interface_glyphs2bytes_relative(from, len_glyphs);
+			rc = teco_bool(to >= 0);
+			len = to-from;
 		}
 		if (len < 0) {
 			len *= -1;
 			from -= len;
 		}
 	} else {
-		teco_int_t to = teco_expressions_pop_num(0);
-		from = teco_expressions_pop_num(0);
+		teco_int_t to_glyphs = teco_expressions_pop_num(0);
+		gssize to = teco_interface_glyphs2bytes(to_glyphs);
+		teco_int_t from_glyphs = teco_expressions_pop_num(0);
+		from = teco_interface_glyphs2bytes(from_glyphs);
 		len = to - from;
-		rc = teco_bool(len >= 0 && teco_validate_pos(from) &&
-		                           teco_validate_pos(to));
+		rc = teco_bool(len >= 0 && from >= 0 && to >= 0);
 	}
 
 	if (teco_machine_main_eval_colon(ctx)) {
@@ -1002,6 +1019,9 @@ teco_state_start_delete_chars(teco_machine_main_t *ctx, GError **error)
  * This can be an ASCII <code> or Unicode codepoint
  * depending on Scintilla's encoding of the current
  * buffer.
+ * Invalid Unicode byte sequences are reported as
+ * -1 or -2.
+ *
  *   - If <n> is 0, return the <code> of the character
  *     pointed to by dot.
  *   - If <n> is 1, return the <code> of the character
@@ -1012,28 +1032,33 @@ teco_state_start_delete_chars(teco_machine_main_t *ctx, GError **error)
  *
  * If the position of the queried character is off-page,
  * the command will yield an error.
+ *
+ * If the document is encoded as UTF-8 and there is
+ * an incomplete sequence at the requested position,
+ * -1 is returned.
+ * All other invalid Unicode sequences are returned as -2.
  */
-/** @todo does Scintilla really return code points??? */
 static void
 teco_state_start_get(teco_machine_main_t *ctx, GError **error)
 {
 	teco_int_t v;
 	if (!teco_expressions_pop_num_calc(&v, teco_num_sign, error))
 		return;
-	v += teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
-	/*
-	 * NOTE: We cannot use teco_validate_pos() here since
-	 * the end of the buffer is not a valid position for <A>.
-	 */
-	if (v < 0 || v >= teco_interface_ssm(SCI_GETLENGTH, 0, 0)) {
+
+	sptr_t pos = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
+	gssize get_pos = teco_interface_glyphs2bytes_relative(pos, v);
+	sptr_t len = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
+
+	if (get_pos < 0 || get_pos == len) {
 		teco_error_range_set(error, "A");
 		return;
 	}
-	teco_expressions_push(teco_interface_ssm(SCI_GETCHARAT, v, 0));
+
+	teco_expressions_push(teco_interface_get_character(get_pos, len));
 }
 
 static teco_state_t *
-teco_state_start_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_start_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	static teco_machine_main_transition_t transitions[] = {
 		/*
@@ -1148,7 +1173,7 @@ teco_state_start_input(teco_machine_main_t *ctx, gchar chr, GError **error)
 		 *
 		 * FIXME: Maybe, there should be a special teco_state_t
 		 * for beginnings of command-lines?
-		 * It could also be used for a corresponding FNMACRO mask.
+		 * It could also be used for a corresponding KEYMACRO mask.
 		 */
 		if (teco_cmdline.effective_len == 1 && teco_cmdline.str.data[0] == '*')
 			return &teco_state_save_cmdline;
@@ -1244,7 +1269,7 @@ teco_state_start_input(teco_machine_main_t *ctx, gchar chr, GError **error)
 TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_start,
 	.end_of_macro_cb = NULL, /* Allowed at the end of a macro! */
 	.is_start = TRUE,
-	.fnmacro_mask = TECO_FNMACRO_MASK_START
+	.keymacro_mask = TECO_KEYMACRO_MASK_START | TECO_KEYMACRO_MASK_CASEINSENSITIVE
 );
 
 /*$ F<
@@ -1372,7 +1397,7 @@ teco_state_fcommand_cond_else(teco_machine_main_t *ctx, GError **error)
 }
 
 static teco_state_t *
-teco_state_fcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_fcommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	static teco_machine_main_transition_t transitions[] = {
 		/*
@@ -1435,7 +1460,7 @@ teco_state_changedir_done(teco_machine_main_t *ctx, const teco_string_t *str, GE
 		teco_qreg_t *qreg = teco_qreg_table_find(&teco_qreg_table_globals, "$HOME", 5);
 		g_assert(qreg != NULL);
 		teco_string_t home;
-		if (!qreg->vtable->get_string(qreg, &home.data, &home.len, error))
+		if (!qreg->vtable->get_string(qreg, &home.data, &home.len, NULL, error))
 			return NULL;
 
 		/*
@@ -1496,7 +1521,7 @@ teco_state_changedir_done(teco_machine_main_t *ctx, const teco_string_t *str, GE
 TECO_DEFINE_STATE_EXPECTDIR(teco_state_changedir);
 
 static teco_state_t *
-teco_state_condcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_condcommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	teco_int_t value = 0;
 	gboolean result = TRUE;
@@ -1536,20 +1561,20 @@ teco_state_condcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error
 		break;
 	case 'A':
 		if (ctx->mode == TECO_MODE_NORMAL)
-			result = g_ascii_isalpha((gchar)value);
+			result = g_unichar_isalpha(value);
 		break;
 	case 'C':
 		if (ctx->mode == TECO_MODE_NORMAL)
-			result = g_ascii_isalnum((gchar)value) ||
+			result = g_unichar_isalnum(value) ||
 			         value == '.' || value == '$' || value == '_';
 		break;
 	case 'D':
 		if (ctx->mode == TECO_MODE_NORMAL)
-			result = g_ascii_isdigit((gchar)value);
+			result = g_unichar_isdigit(value);
 		break;
 	case 'I':
 		if (ctx->mode == TECO_MODE_NORMAL)
-			result = G_IS_DIR_SEPARATOR((gchar)value);
+			result = G_IS_DIR_SEPARATOR(value);
 		break;
 	case 'S':
 	case 'T':
@@ -1582,15 +1607,15 @@ teco_state_condcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error
 		break;
 	case 'R':
 		if (ctx->mode == TECO_MODE_NORMAL)
-			result = g_ascii_isalnum((gchar)value);
+			result = g_unichar_isalnum(value);
 		break;
 	case 'V':
 		if (ctx->mode == TECO_MODE_NORMAL)
-			result = g_ascii_islower((gchar)value);
+			result = g_unichar_islower(value);
 		break;
 	case 'W':
 		if (ctx->mode == TECO_MODE_NORMAL)
-			result = g_ascii_isupper((gchar)value);
+			result = g_unichar_isupper(value);
 		break;
 	default:
 		g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED,
@@ -1720,8 +1745,71 @@ teco_state_control_radix(teco_machine_main_t *ctx, GError **error)
 	}
 }
 
+/*$ ^E glyphs2bytes bytes2glyphs
+ * glyphs^E -> bytes -- Translate between glyph and byte indexes
+ * bytes:^E -> glyphs
+ * ^E -> bytes
+ * :^E -> length
+ *
+ * Translates from glyph/character to byte indexes when called
+ * without a colon.
+ * Otherwise when colon-modified, translates from byte indexes
+ * back to glyph indexes.
+ * These values can differ in documents with multi-byte
+ * encodings (of which only UTF-8 is supported).
+ * It is especially useful to translate between these indexes
+ * when manually invoking Scintilla messages (\fBES\fP command), as
+ * they almost always take byte positions.
+ *
+ * When called without arguments, \fB^E\fP returns the current
+ * position (dot) in bytes.
+ * This is equivalent to, but faster than \(lq.^E\(rq.
+ * \fB:^E\fP without arguments returns the length of the current
+ * document in bytes, which is equivalent to, but faster than \(lqZ^E\(rq.
+ *
+ * When passing in indexes outside of the document's valid area,
+ * -1 is returned, so the return value can also be interpreted
+ * as a TECO boolean, signalling truth/success for invalid indexes.
+ * This provides an elegant and effective way to validate
+ * buffer addresses.
+ */
+static void
+teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error)
+{
+	teco_int_t res; /* the single value pushed as the command's result */
+
+	if (!teco_expressions_eval(FALSE, error))
+		return;
+
+	gboolean colon_modified = teco_machine_main_eval_colon(ctx); /* TRUE: bytes -> glyphs */
+
+	if (!teco_expressions_args()) {
+		/*
+		 * No argument: query Scintilla directly for dot (^E) or the length (:^E).
+		 * This is shorter than .^E or Z^E and avoids unnecessary glyph to
+		 * byte index translations. On the other hand, :^E is inconsistent with
+		 * <bytes>:^E, as it returns a byte length instead of a glyph index.
+		 */
+		res = teco_interface_ssm(colon_modified ? SCI_GETLENGTH : SCI_GETCURRENTPOS, 0, 0);
+	} else {
+		teco_int_t pos;
+		if (!teco_expressions_pop_num_calc(&pos, 0, error))
+			return;
+		if (colon_modified) {
+			/* teco_interface_bytes2glyphs() does not check addresses: validate here, yield -1 if off-page */
+			res = 0 <= pos && pos <= teco_interface_ssm(SCI_GETLENGTH, 0, 0)
+				? teco_interface_bytes2glyphs(pos) : -1;
+		} else {
+			/* negative results (signalling invalid glyph indexes) are pushed unchanged */
+			res = teco_interface_glyphs2bytes(pos);
+		}
+	}
+
+	teco_expressions_push(res);
+}
+
 static teco_state_t *
-teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_control_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	static teco_machine_main_transition_t transitions[] = {
 		/*
@@ -1746,7 +1834,8 @@ teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
 		['C']  = {&teco_state_start, teco_state_control_exit},
 		['O']  = {&teco_state_start, teco_state_control_octal},
 		['D']  = {&teco_state_start, teco_state_control_decimal},
-		['R']  = {&teco_state_start, teco_state_control_radix}
+		['R']  = {&teco_state_start, teco_state_control_radix},
+		['E']  = {&teco_state_start, teco_state_control_glyphs2bytes}
 	};
 
 	/*
@@ -1761,7 +1850,7 @@ teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
 TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_control);
 
 static teco_state_t *
-teco_state_ascii_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_ascii_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	if (ctx->mode == TECO_MODE_NORMAL)
 		teco_expressions_push(chr);
@@ -1797,7 +1886,7 @@ TECO_DEFINE_STATE(teco_state_ascii);
  * only be seen when executing the following command.
  */
 static teco_state_t *
-teco_state_escape_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_escape_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	/*$ ^[^[ ^[$ $$ terminate return
 	 * [a1,a2,...]$$ -- Terminate command line or return from macro
@@ -1891,7 +1980,7 @@ TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_escape,
 	 * when it comes to function key macro masking.
 	 */
 	.is_start = TRUE,
-	.fnmacro_mask = TECO_FNMACRO_MASK_START
+	.keymacro_mask = TECO_KEYMACRO_MASK_START | TECO_KEYMACRO_MASK_CASEINSENSITIVE
 );
 
 /*$ EF close
@@ -1958,6 +2047,11 @@ teco_state_ecommand_close(teco_machine_main_t *ctx, GError **error)
  * Without any argument ED returns the current flags.
  *
  * Currently, the following flags are used by \*(ST:
+ *   - 4: If enabled, prefer raw single-byte ANSI encoding
+ *     for all new buffers and registers.
+ *     This does not change the encoding of any existing
+ *     buffers and any initialized default register when set via
+ *     \fBED\fP, so you might want to launch \*(ST with \fB--8bit\fP.
  *   - 8: Enable/disable automatic folding of case-insensitive
  *     command characters during interactive key translation.
  *     The case of letter keys is inverted, so one or two
@@ -1973,14 +2067,17 @@ teco_state_ecommand_close(teco_machine_main_t *ctx, GError **error)
  *     of files.
  *   - 32: Enable/Disable buffer editing hooks
  *     (via execution of macro in global Q-Register \(lqED\(rq)
- *   - 64: Enable/Disable function key macros
  *   - 128: Enable/Disable enforcement of UNIX98
  *     \(lq/bin/sh\(rq emulation for operating system command
  *     executions
- *   - 256: Enable/Disable \fBxterm\fP(1) clipboard support.
- *     Should only be enabled if XTerm allows the
- *     \fIGetSelection\fP and \fISetSelection\fP window
- *     operations.
+ *   - 256: Enable/Disable OSC-52 clipboard support.
+ *     Must only be enabled if the terminal emulator is configured
+ *     properly.
+ *   - 512: Enable/Disable Unicode icons in the Curses UI.
+ *     This requires a capable font, like the ones provided
+ *     by the \(lqNerd Fonts\(rq project.
+ *     Changes to this flag in interactive mode may not become
+ *     effective immediately.
  *
  * The features controlled thus are discribed in other sections
  * of this manual.
@@ -2098,6 +2195,12 @@ teco_state_ecommand_flags(teco_machine_main_t *ctx, GError **error)
  * on exit the author is aware of is \fBxterm\fP(1) and
  * the Linux console driver.
  * You have been warned. Good luck.
+ * .IP 4
+ * The column after the last horizontal movement.
+ * This is only used by \fBfnkeys.tes\fP and is similar to the Scintilla-internal
+ * setting \fBSCI_CHOOSECARETX\fP.
+ * Unlike most other settings, this is on purpose not restored on rubout,
+ * so it "survives" command line replacements.
  */
 static void
 teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error)
@@ -2106,9 +2209,12 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error)
 		EJ_USER_INTERFACE = 0,
 		EJ_BUFFERS,
 		EJ_MEMORY_LIMIT,
-		EJ_INIT_COLOR
+		EJ_INIT_COLOR,
+		EJ_CARETX
 	};
 
+	static teco_int_t caret_x = 0;
+
 	teco_int_t property;
 	if (!teco_expressions_eval(FALSE, error) ||
 	    !teco_expressions_pop_num_calc(&property, teco_num_sign, error))
@@ -2144,6 +2250,10 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error)
 			teco_interface_init_color((guint)value, (guint32)color);
 			break;
 
+		case EJ_CARETX:
+			caret_x = value;
+			break;
+
 		default:
 			g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED,
 			            "Cannot set property %" TECO_INT_FORMAT " "
@@ -2180,6 +2290,10 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error)
 		teco_expressions_push(teco_memory_limit);
 		break;
 
+	case EJ_CARETX:
+		teco_expressions_push(caret_x);
+		break;
+
 	default:
 		g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED,
 		            "Invalid property %" TECO_INT_FORMAT " "
@@ -2292,6 +2406,252 @@ teco_state_ecommand_eol(teco_machine_main_t *ctx, GError **error)
 	}
 }
 
+static const gchar *
+teco_codepage2str(guint codepage)
+{
+	/*
+	 * Maps a Scintilla codepage/charset ID to an encoding name
+	 * (as passed to g_convert() by <EE>) or NULL if there is none.
+	 * The multi-byte charsets are excluded, since we don't support them
+	 * in SciTECO, even though Scintilla has them.
+	 * Contrary to the Scintilla documentation, Gtk supports most of them.
+	 * Those that are supported are tested, so the codepage mapping should
+	 * be definitive (although there could be similar related codepages).
+	 */
+	switch (codepage) {
+	case SC_CP_UTF8:		return "UTF-8";
+	case SC_CHARSET_ANSI:
+	case SC_CHARSET_DEFAULT:	return "ISO-8859-1"; /* LATIN1 */
+	case SC_CHARSET_BALTIC:		return "ISO-8859-13"; /* LATIN7 */
+	//case SC_CHARSET_CHINESEBIG5:	return "BIG5";
+	case SC_CHARSET_EASTEUROPE:	return "ISO-8859-2"; /* LATIN2 */
+	//case SC_CHARSET_GB2312:	return "GB2312";
+	case SC_CHARSET_GREEK:		return "ISO-8859-7"; // CP1253???
+	//case SC_CHARSET_HANGUL:	return "UHC";
+	/* unsure whether this is supported on Gtk */
+	case SC_CHARSET_MAC:		return "MAC";
+	/* not supported by Gtk */
+	case SC_CHARSET_OEM:		return "CP437";
+	/*
+	 * Apparently, this can be CP1251 on the native Windows
+	 * port of Scintilla.
+	 */
+	case SC_CHARSET_RUSSIAN:	return "KOI8-R";
+	case SC_CHARSET_OEM866:		return "CP866";
+	case SC_CHARSET_CYRILLIC:	return "CP1251";
+	//case SC_CHARSET_SHIFTJIS:	return "SHIFT-JIS";
+	//case SC_CHARSET_SYMBOL:
+	case SC_CHARSET_TURKISH:	return "ISO-8859-9"; /* LATIN5 */
+	//case SC_CHARSET_JOHAB:	return "JOHAB";
+	case SC_CHARSET_HEBREW:		return "ISO-8859-8"; // CP1255?
+	/*
+	 * FIXME: Some Arabic codepage is supported by Gtk,
+	 * but I am not sure which.
+	 */
+	case SC_CHARSET_ARABIC:		return "ISO-8859-6"; // CP720, CP1256???
+	/* apparently not supported by Gtk */
+	case SC_CHARSET_VIETNAMESE:	return "CP1258";
+	case SC_CHARSET_THAI:		return "ISO-8859-11";
+	case SC_CHARSET_8859_15:	return "ISO-8859-15"; /* LATIN9 */
+	}
+
+	return NULL; /* no mapping: unknown ID or one of the excluded charsets */
+}
+
+/*$ EE encoding codepage charset
+ * codepageEE -- Edit current document's encoding (codepage/charset)
+ * EE -> codepage
+ * codepage:EE
+ * :EE -> codepage
+ *
+ * When called with an argument, it sets the current codepage,
+ * otherwise returns it.
+ * The following codepages are supported:
+ * - 0: ANSI (raw bytes)
+ * - 1: ISO-8859-1 (latin1)
+ * - 77: Macintosh Latin encoding
+ * - 161: ISO-8859-7
+ * - 162: ISO-8859-9 (latin5)
+ * - 163: CP1258
+ * - 177: ISO-8859-8
+ * - 178: ISO-8859-6
+ * - 186: ISO-8859-13 (latin7)
+ * - 204: KOI8-R
+ * - 222: ISO-8859-11
+ * - 238: ISO-8859-2 (latin2)
+ * - 255: CP437
+ * - 866: CP866
+ * - 1000: ISO-8859-15 (latin9)
+ * - 1251: CP1251
+ * - 65001: UTF-8
+ *
+ * Displaying characters in the single-byte (non-UTF-8) codepages might
+ * be supported only with the Gtk UI.
+ * At least 77, 178, 163 and 255 are not displayed correctly on Gtk.
+ * 65001 (UTF-8) is the default for new buffers.
+ * 0 (ANSI) should be used when working with raw bytes,
+ * but is currently displayed like ISO-8859-1 (latin1).
+ *
+ * \fBEE\fP does not change the buffer contents itself by default, only
+ * how it is displayed and how \*(ST interacts with it.
+ * This allows fixing up the codepage if it is not in the default UTF-8
+ * or if codepage guessing failed.
+ *
+ * When colon-modified, the \fB:EE\fP command will additionally convert
+ * the current buffer contents into the new code page, preserving the
+ * current position (dot).
+ * This will fail if the conversion would be lossy.
+ * Conversions from and to UTF-8 \fIshould\fP always be successful.
+ */
+static void
+teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error)
+{
+	if (!teco_expressions_eval(FALSE, error))
+		return;
+
+	gboolean colon_modified = teco_machine_main_eval_colon(ctx); /* TRUE: also convert the buffer contents */
+
+	guint old_cp = teco_interface_get_codepage();
+
+	if (!teco_expressions_args()) {
+		/* no argument: only report the current codepage */
+		teco_expressions_push(old_cp);
+		return;
+	}
+
+	/*
+	 * Argument given: switch to the codepage popped from the stack.
+	 */
+	teco_int_t new_cp;
+	if (!teco_expressions_pop_num_calc(&new_cp, 0, error))
+		return;
+
+	if (old_cp == SC_CP_UTF8 && new_cp == SC_CP_UTF8)
+		return; /* already UTF-8: nothing to change or convert */
+
+	if (teco_current_doc_must_undo() && teco_undo_enabled) { /* schedule restoring the old encoding settings */
+		if (old_cp == SC_CP_UTF8) { /* new_cp != SC_CP_UTF8 */
+			undo__teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
+			                         SC_LINECHARACTERINDEX_UTF32, 0);
+			undo__teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
+		} else {
+			undo__teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
+			for (gint style = 0; style <= STYLE_LASTPREDEFINED; style++)
+				undo__teco_interface_ssm(SCI_STYLESETCHARACTERSET, style, old_cp);
+			/*
+			 * The index is internally reference-counted and could underflow,
+			 * so release it on undo only if it is allocated further below.
+			 */
+			if (new_cp == SC_CP_UTF8)
+				undo__teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+				                         SC_LINECHARACTERINDEX_UTF32, 0);
+		}
+	}
+
+	teco_int_t dot_glyphs; /* only assigned and read if colon_modified */
+	if (colon_modified) {
+		sptr_t dot_bytes = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
+		dot_glyphs = teco_interface_bytes2glyphs(dot_bytes);
+
+		/*
+		 * Convert the entire buffer to the new codepage.
+		 *
+		 * FIXME: Could be optimized slightly by converting first
+		 * before the gap, inserting the converted text and then
+		 * converting after the gap.
+		 */
+		const gchar *to_codepage = teco_codepage2str(new_cp);
+		const gchar *from_codepage = teco_codepage2str(old_cp);
+		if (!to_codepage || !from_codepage) {
+			g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED,
+			                    "Unknown or unsupported codepage/charset");
+			return;
+		}
+
+		const gchar *buf = (const gchar *)teco_interface_ssm(SCI_GETCHARACTERPOINTER, 0, 0);
+		gsize len = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
+		g_autofree gchar *converted; /* NOTE(review): uninitialized; safe only while no return precedes g_convert() */
+		gsize converted_len;
+
+		/*
+		 * This fails if there is no direct translation.
+		 * If we used g_convert_with_fallback(), it would be tricky to choose
+		 * fallback characters that will always work.
+		 */
+		converted = g_convert(buf, len, to_codepage, from_codepage,
+		                      NULL, &converted_len, error);
+		if (!converted)
+			return;
+
+		teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0); /* replace the whole text as one Scintilla undo action */
+		teco_interface_ssm(SCI_CLEARALL, 0, 0);
+		teco_interface_ssm(SCI_APPENDTEXT, converted_len, (sptr_t)converted);
+		teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0);
+		teco_ring_dirtify();
+
+		if (teco_current_doc_must_undo()) {
+			undo__teco_interface_ssm(SCI_GOTOPOS, dot_bytes, 0);
+			undo__teco_interface_ssm(SCI_UNDO, 0, 0);
+		}
+	}
+
+	if (new_cp == SC_CP_UTF8) {
+		teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
+		/*
+		 * UTF-8 documents strictly require the line character index.
+		 * See teco_view_glyphs2bytes() and teco_view_bytes2glyphs().
+		 */
+		g_assert(!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0)
+						& SC_LINECHARACTERINDEX_UTF32));
+		teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
+		                   SC_LINECHARACTERINDEX_UTF32, 0);
+	} else {
+		/*
+		 * The index is NOT released automatically when setting the codepage.
+		 * But it is internally reference-counted and could underflow,
+		 * so release it only when coming from UTF-8.
+		 */
+		if (old_cp == SC_CP_UTF8) {
+			teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+			                   SC_LINECHARACTERINDEX_UTF32, 0);
+			g_assert(!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0)
+							& SC_LINECHARACTERINDEX_UTF32));
+		}
+
+		/*
+		 * Configure a single-byte codepage/charset.
+		 * This requires setting it on all of the possible styles.
+		 * Unfortunately there can theoretically even be 255 (STYLE_MAX) styles.
+		 * This is important only for display purposes - other than that
+		 * all single-byte encodings are handled the same.
+		 *
+		 * FIXME: Should we avoid this if new_cp == 0?
+		 * It will be used for raw byte handling mostly.
+		 */
+		if (teco_current_doc_must_undo()) {
+			/*
+			 * There is a chance the user will see this buffer even if we
+			 * are currently in batch mode, so set it on all predefined styles.
+			 */
+			for (gint style = 0; style <= STYLE_LASTPREDEFINED; style++)
+				teco_interface_ssm(SCI_STYLESETCHARACTERSET, style, new_cp);
+		} else {
+			/* we must still set it on the default style, so that <EE> retrieval works */
+			teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, new_cp);
+		}
+		/* Scintilla codepage 0 is used for ALL single-byte encodings */
+		teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
+	}
+
+	if (colon_modified)
+		/*
+		 * Only now, it will be safe to recalculate dot in the new encoding.
+		 * If the new codepage is UTF-8, the line character index will be
+		 * ready only now.
+		 */
+		teco_interface_ssm(SCI_GOTOPOS, teco_interface_glyphs2bytes(dot_glyphs), 0);
+}
+
 /*$ EX exit
  * [bool]EX -- Exit program
  * -EX
@@ -2352,7 +2712,7 @@ teco_state_ecommand_exit(teco_machine_main_t *ctx, GError **error)
 }
 
 static teco_state_t *
-teco_state_ecommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_ecommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
 {
 	static teco_machine_main_transition_t transitions[] = {
 		/*
@@ -2377,6 +2737,7 @@ teco_state_ecommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
 		['D']  = {&teco_state_start, teco_state_ecommand_flags},
 		['J']  = {&teco_state_start, teco_state_ecommand_properties},
 		['L']  = {&teco_state_start, teco_state_ecommand_eol},
+		['E']  = {&teco_state_start, teco_state_ecommand_encoding},
 		['X']  = {&teco_state_start, teco_state_ecommand_exit}
 	};
 
@@ -2395,26 +2756,61 @@ teco_state_insert_initial(teco_machine_main_t *ctx, GError **error)
 	if (ctx->mode > TECO_MODE_NORMAL)
 		return TRUE;
 
+	/*
+	 * Current document's encoding determines the behaviour of
+	 * string building constructs.
+	 */
+	teco_machine_stringbuilding_set_codepage(&ctx->expectstring.machine,
+	                                         teco_interface_get_codepage());
+
 	if (!teco_expressions_eval(FALSE, error))
 		return FALSE;
 	guint args = teco_expressions_args();
 	if (!args)
 		return TRUE;
 
-	teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0);
-	for (int i = args; i > 0; i--) {
-		gchar chr = (gchar)teco_expressions_peek_num(i-1);
-		teco_interface_ssm(SCI_ADDTEXT, 1, (sptr_t)&chr);
+	if (teco_interface_ssm(SCI_GETCODEPAGE, 0, 0) == SC_CP_UTF8) {
+		/* detect possible errors before introducing side effects */
+		for (gint i = args; i > 0; i--) {
+			teco_int_t chr = teco_expressions_peek_num(i-1);
+			if (chr < 0 || !g_unichar_validate(chr)) {
+				teco_error_codepoint_set(error, "I");
+				return FALSE;
+			}
+		}
+		teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0);
+		for (gint i = args; i > 0; i--) {
+			/* 4 bytes should be enough, but we better follow the documentation */
+			gchar buf[6];
+			gsize len = g_unichar_to_utf8(teco_expressions_peek_num(i-1), buf);
+			teco_interface_ssm(SCI_ADDTEXT, len, (sptr_t)buf);
+		}
+	} else {
+		/* everything else is a single-byte encoding */
+		for (gint i = args; i > 0; i--) {
+			teco_int_t chr = teco_expressions_peek_num(i-1);
+			if (chr < 0 || chr > 0xFF) {
+				teco_error_codepoint_set(error, "I");
+				return FALSE;
+			}
+		}
+		teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0);
+		for (gint i = args; i > 0; i--) {
+			gchar chr = (gchar)teco_expressions_peek_num(i-1);
+			teco_interface_ssm(SCI_ADDTEXT, 1, (sptr_t)&chr);
+		}
 	}
-	for (int i = args; i > 0; i--)
-		if (!teco_expressions_pop_num_calc(NULL, 0, error))
-			return FALSE;
 	teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0);
 	teco_ring_dirtify();
 
 	if (teco_current_doc_must_undo())
 		undo__teco_interface_ssm(SCI_UNDO, 0, 0);
 
+	/* This is done only now because it can _theoretically_ fail. */
+	for (gint i = args; i > 0; i--)
+		if (!teco_expressions_pop_num_calc(NULL, 0, error))
+			return FALSE;
+
 	return TRUE;
 }
 
@@ -2451,8 +2847,8 @@ teco_state_insert_process(teco_machine_main_t *ctx, const teco_string_t *str,
  * Secondly, the command inserts <text>.
  * In interactive mode, <text> is inserted interactively.
  *
- * String building characters are \fBenabled\fP for the
- * I command.
+ * Unlike in classic TECO dialects, string building characters are
+ * \fBenabled\fP for the \fBI\fP command.
  * When editing \*(ST macros, using the \fBEI\fP command
  * may be better, since it has string building characters
  * disabled.
@@ -2491,10 +2887,9 @@ teco_state_insert_indent_initial(teco_machine_main_t *ctx, GError **error)
 		len -= teco_interface_ssm(SCI_GETCOLUMN,
 		                          teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0), 0) % len;
 
-		gchar spaces[len];
-
-		memset(spaces, ' ', sizeof(spaces));
-		teco_interface_ssm(SCI_ADDTEXT, sizeof(spaces), (sptr_t)spaces);
+		gchar space = ' ';
+		while (len-- > 0)
+			teco_interface_ssm(SCI_ADDTEXT, 1, (sptr_t)&space);
 	}
 	teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0);
 	teco_ring_dirtify();