2 files changed, 231 insertions, 35 deletions
diff --git a/src/core-commands.c b/src/core-commands.c
index 98097bb..edd7e35 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -1771,6 +1771,9 @@ teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error)
 
 	if (!teco_expressions_eval(FALSE, error))
 		return;
+
+	gboolean colon_modified = teco_machine_main_eval_colon(ctx);
+
 	if (!teco_expressions_args()) {
 		/*
 		 * This is shorter than .^E or Z^E and avoids unnecessary glyph to
@@ -1778,13 +1781,12 @@ teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error)
 		 * On the other hand :^E is inconsistent, as it will return a byte
 		 * index, instead of glyph index.
 		 */
-		res = teco_interface_ssm(teco_machine_main_eval_colon(ctx)
-		                         ? SCI_GETLENGTH : SCI_GETCURRENTPOS, 0, 0);
+		res = teco_interface_ssm(colon_modified ? SCI_GETLENGTH : SCI_GETCURRENTPOS, 0, 0);
 	} else {
 		teco_int_t pos;
 		if (!teco_expressions_pop_num_calc(&pos, 0, error))
 			return;
-		if (teco_machine_main_eval_colon(ctx)) {
+		if (colon_modified) {
 			/* teco_bytes2glyphs() does not check addresses */
 			res = 0 <= pos && pos <= teco_interface_ssm(SCI_GETLENGTH, 0, 0)
 				? teco_bytes2glyphs(pos) : -1;
@@ -2387,14 +2389,102 @@ teco_state_ecommand_eol(teco_machine_main_t *ctx, GError **error)
 	}
 }
 
+/*
+ * Translate a Scintilla codepage (SC_CP_UTF8) or single-byte
+ * charset (SC_CHARSET_*) into the corresponding charset name.
+ * Returns a static string or NULL for anything not listed below.
+ *
+ * Multi-byte charsets (CHINESEBIG5/BIG5, GB2312, HANGUL/UHC,
+ * SHIFTJIS/SHIFT-JIS, JOHAB) and SC_CHARSET_SYMBOL are left out
+ * on purpose: SciTECO does not support them, even though
+ * Scintilla has them.
+ * Contrary to the Scintilla documentation, Gtk supports most of
+ * the remaining charsets. Those that are supported were tested,
+ * so the mapping should be definitive (although there could be
+ * similar related codepages).
+ */
+static const gchar *
+teco_codepage2str(guint codepage)
+{
+	static const struct {
+		guint id;
+		const gchar *name;
+	} charsets[] = {
+		{SC_CP_UTF8,		"UTF-8"},
+		{SC_CHARSET_ANSI,	"ISO-8859-1"},
+		{SC_CHARSET_DEFAULT,	"ISO-8859-1"},	/* LATIN1 */
+		{SC_CHARSET_BALTIC,	"ISO-8859-13"},	/* LATIN7 */
+		{SC_CHARSET_EASTEUROPE,	"ISO-8859-2"},	/* LATIN2 */
+		{SC_CHARSET_GREEK,	"ISO-8859-7"},	/* CP1253??? */
+		/* unsure whether this is supported on Gtk */
+		{SC_CHARSET_MAC,	"MAC"},
+		/* not supported by Gtk */
+		{SC_CHARSET_OEM,	"CP437"},
+		/* apparently, this can be CP1251 on the native Windows port */
+		{SC_CHARSET_RUSSIAN,	"KOI8-R"},
+		{SC_CHARSET_OEM866,	"CP866"},
+		{SC_CHARSET_CYRILLIC,	"CP1251"},
+		{SC_CHARSET_TURKISH,	"ISO-8859-9"},	/* LATIN5 */
+		{SC_CHARSET_HEBREW,	"ISO-8859-8"},	/* CP1255? */
+		/* FIXME: Gtk supports some arabic codepage, but unclear which */
+		{SC_CHARSET_ARABIC,	"ISO-8859-6"},	/* CP720, CP1256??? */
+		/* apparently not supported by Gtk */
+		{SC_CHARSET_VIETNAMESE,	"CP1258"},
+		{SC_CHARSET_THAI,	"ISO-8859-11"},
+		{SC_CHARSET_8859_15,	"ISO-8859-15"}	/* LATIN9 */
+	};
+
+	for (guint i = 0; i < G_N_ELEMENTS(charsets); i++)
+		if (charsets[i].id == codepage)
+			return charsets[i].name;
+
+	return NULL;
+}
+
 /*$ EE encoding codepage charset
  * codepageEE -- Edit current document's encoding (codepage/charset)
  * EE -> codepage
+ * codepage:EE
+ * :EE -> codepage
  *
  * When called with an argument, it sets the current codepage,
  * otherwise returns it.
+ * The following codepages are supported:
+ * - 0: ANSI (raw bytes)
+ * - 1: ISO-8859-1 (latin1)
+ * - 77: Macintosh Latin encoding
+ * - 161: ISO-8859-7
+ * - 162: ISO-8859-9 (latin5)
+ * - 163: CP1258
+ * - 177: ISO-8859-8
+ * - 178: ISO-8859-6
+ * - 186: ISO-8859-13 (latin7)
+ * - 204: KOI8-R
+ * - 222: ISO-8859-11
+ * - 238: ISO-8859-2 (latin2)
+ * - 255: CP437
+ * - 866: CP866
+ * - 1000: ISO-8859-15 (latin9)
+ * - 1251: CP1251
+ * - 65001: UTF-8
+ *
+ * Displaying characters in the single-byte (non-UTF-8) codepages might
+ * be supported only with the Gtk UI.
+ * At least 77, 178, 163 and 255 are not displayed correctly on Gtk.
  * 65001 (UTF-8) is the default for new buffers.
- * 0 (ANSI) should be used when working with raw bytes.
+ * 0 (ANSI) should be used when working with raw bytes,
+ * but is currently displayed like ISO-8859-1 (latin1).
+ *
+ * \fBEE\fP does not change the buffer contents itself by default, only
+ * how it is displayed and how \*(ST interacts with it.
+ * This allows fixing up the codepage if it is not in the default UTF-8
+ * or if codepage guessing failed.
+ *
+ * When colon-modified, the \fB:EE\fP command additionally converts
+ * the current buffer contents into the new codepage, preserving the
+ * current position (dot).
+ * This will fail if the conversion would be lossy.
+ * Conversions from and to UTF-8 \fIshould\fP always be successful.
  */
 static void
 teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error)
@@ -2402,53 +2492,162 @@ teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error)
 	if (!teco_expressions_eval(FALSE, error))
 		return;
 
-	sptr_t old_cp = teco_interface_ssm(SCI_GETCODEPAGE, 0, 0);
+	gboolean colon_modified = teco_machine_main_eval_colon(ctx);
+
+	sptr_t old_cp = teco_interface_ssm(SCI_GETCODEPAGE, 0, 0)
+				? : teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0);
 
 	if (!teco_expressions_args()) {
-		teco_expressions_push(old_cp ? : teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0));
+		/* get current code page */
+		teco_expressions_push(old_cp);
 		return;
 	}
 
 	/*
 	 * Set code page
 	 */
-	if (teco_current_doc_must_undo()) {
-		if (old_cp == SC_CP_UTF8) {
+	teco_int_t new_cp;
+	if (!teco_expressions_pop_num_calc(&new_cp, 0, error))
+		return;
+
+	if (old_cp == SC_CP_UTF8 && new_cp == SC_CP_UTF8)
+		return;
+
+	/*
+	 * For :EE, do everything that can fail (codepage lookup and the
+	 * conversion itself) before registering any undo token.
+	 * Otherwise a failed :EE would leave undo tokens behind that
+	 * allocate or release the reference-counted line character
+	 * index without a matching operation ever having been done.
+	 */
+	sptr_t dot_bytes = 0;
+	teco_int_t dot_glyphs = 0;
+	g_autofree gchar *converted = NULL;
+	gsize converted_len = 0;
+
+	if (colon_modified) {
+		dot_bytes = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
+		dot_glyphs = teco_bytes2glyphs(dot_bytes);
+
+		const gchar *to_codepage = teco_codepage2str(new_cp);
+		const gchar *from_codepage = teco_codepage2str(old_cp);
+		if (!to_codepage || !from_codepage) {
+			g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED,
+			                    "Unknown or unsupported codepage/charset");
+			return;
+		}
+
+		/*
+		 * Convert buffer to new codepage.
+		 * This fails if there is no direct translation.
+		 * If we'd use g_convert_with_fallback(), it would be tricky to choose
+		 * fallback characters that will always work.
+		 *
+		 * FIXME: Could be optimized slightly by converting first
+		 * before the gap, inserting the converted text and then
+		 * converting after the gap.
+		 */
+		const gchar *buf = (const gchar *)teco_interface_ssm(SCI_GETCHARACTERPOINTER, 0, 0);
+		gsize len = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
+		converted = g_convert(buf, len, to_codepage, from_codepage,
+		                      NULL, &converted_len, error);
+		if (!converted)
+			return;
+	}
+
+	if (teco_current_doc_must_undo() && teco_undo_enabled) {
+		if (old_cp == SC_CP_UTF8) { /* new_cp != SC_CP_UTF8 */
 			undo__teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
 			                         SC_LINECHARACTERINDEX_UTF32, 0);
 			undo__teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
 		} else {
-			undo__teco_interface_ssm(SCI_SETCODEPAGE, old_cp, 0);
-			undo__teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT,
-			                         teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0));
-			undo__teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
-			                         SC_LINECHARACTERINDEX_UTF32, 0);
+			undo__teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
+			for (gint style = 0; style <= STYLE_LASTPREDEFINED; style++)
+				undo__teco_interface_ssm(SCI_STYLESETCHARACTERSET, style, old_cp);
+			/*
+			 * The index is internally reference-counted and could underflow,
+			 * so don't do it more than necessary.
+			 */
+			if (new_cp == SC_CP_UTF8)
+				undo__teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+				                         SC_LINECHARACTERINDEX_UTF32, 0);
 		}
 	}
 
-	teco_int_t v;
-	if (!teco_expressions_pop_num_calc(&v, 0, error))
-		return;
-	if (v == SC_CP_UTF8) {
+	if (colon_modified) {
+		/* replace the buffer contents with the converted text */
+		teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0);
+		teco_interface_ssm(SCI_CLEARALL, 0, 0);
+		teco_interface_ssm(SCI_APPENDTEXT, converted_len, (sptr_t)converted);
+		teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0);
+		teco_ring_dirtify();
+
+		if (teco_current_doc_must_undo()) {
+			undo__teco_interface_ssm(SCI_GOTOPOS, dot_bytes, 0);
+			undo__teco_interface_ssm(SCI_UNDO, 0, 0);
+		}
+	}
+
+	if (new_cp == SC_CP_UTF8) {
 		teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
 		/*
 		 * UTF-8 documents strictly require the line character index.
 		 * See teco_glyphs2bytes() and teco_bytes2glyphs().
 		 */
+		g_assert(!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0)
+						& SC_LINECHARACTERINDEX_UTF32));
 		teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
 		                   SC_LINECHARACTERINDEX_UTF32, 0);
-		return;
+	} else {
+		/*
+		 * The index is NOT released automatically when setting the codepage.
+		 * But it is internally reference-counted and could underflow,
+		 * so don't do it more than necessary.
+		 */
+		if (old_cp == SC_CP_UTF8) {
+			teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+			                   SC_LINECHARACTERINDEX_UTF32, 0);
+			g_assert(!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0)
+							& SC_LINECHARACTERINDEX_UTF32));
+		}
+
+		/*
+		 * Configure a single-byte codepage/charset.
+		 * This requires setting it on all of the possible styles.
+		 * Unfortunately there can theoretically even be 255 (STYLE_MAX) styles.
+		 * This is important only for display purposes - other than that
+		 * all single-byte encodings are handled the same.
+		 *
+		 * FIXME: Should we avoid this if new_cp == 0?
+		 * It will be used for raw byte handling mostly.
+		 * Perhaps we should even set char representations appropriately
+		 * for all non-ANSI codepoints in the 0 codepage.
+		 * But this would also be costly...
+		 */
+		if (teco_current_doc_must_undo()) {
+			/*
+			 * There is a chance the user will see this buffer even if we
+			 * are currently in batch mode.
+			 */
+			for (gint style = 0; style <= STYLE_LASTPREDEFINED; style++)
+				teco_interface_ssm(SCI_STYLESETCHARACTERSET, style, new_cp);
+		} else {
+			/* we must still set it, so that <EE> retrieval works */
+			teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, new_cp);
+		}
+		/* 0 is used for ALL single-byte encodings */
+		teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
 	}
 
-	teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
-	                   SC_LINECHARACTERINDEX_UTF32, 0);
-	teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, v);
-	/* 0 is used for ALL single-byte encodings */
-	teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
-	/*
-	 * FIXME: Should we attempt any code page conversion via
-	 * g_iconv()?
-	 */
+	if (colon_modified)
+		/*
+		 * Only now, it will be safe to recalculate dot in the new encoding.
+		 * If the new codepage is UTF-8, the line character index will be
+		 * ready only now.
+		 * FIXME: Apparently the line character index is still not ready
+		 * after switching to UTF-8!
+		 */
+		teco_interface_ssm(SCI_GOTOPOS, teco_glyphs2bytes(dot_glyphs), 0);
 }
 
 /*$ EX exit
diff --git a/src/doc.c b/src/doc.c
index 4e41e8a..12413af 100644
--- a/src/doc.c
+++ b/src/doc.c
@@ -48,22 +48,28 @@ teco_doc_edit(teco_doc_t *ctx)
 	teco_view_ssm(teco_qreg_view, SCI_SETSEL, ctx->anchor, (sptr_t)ctx->dot);
 
 	/*
-	 * NOTE: Thanks to a custom Scintilla patch, se representations
+	 * NOTE: Thanks to a custom Scintilla patch, representations
 	 * do not get reset after SCI_SETDOCPOINTER, so they have to be
 	 * initialized only once.
 	 */
 	//teco_view_set_representations(teco_qreg_view);
 
 	/*
-	 * Documents are UTF-8 by default and all UTF-8 documents
-	 * are expected to have a character index.
+	 * All UTF-8 documents are expected to have a character index.
+	 * This allocates nothing if the document is not UTF-8.
+	 * But it is reference counted, so it must not be allocated
+	 * more than once.
 	 *
-	 * FIXME: This apparently gets reset with every SCI_SETDOCPOINTER.
-	 * Does that mean the index needs to be recalculated repeatedly as well?
-	 * What if the document/register is made non-UTF-8 afterwards?
+	 * FIXME: This apparently gets reset with every SCI_SETDOCPOINTER
+	 * (although I don't know why and where).
+	 * Recalculating it could be inefficient.
+	 * The index is reference-counted. Perhaps we could just allocate
+	 * one more time, so it doesn't get freed when changing documents.
 	 */
-	teco_view_ssm(teco_qreg_view, SCI_ALLOCATELINECHARACTERINDEX,
-	              SC_LINECHARACTERINDEX_UTF32, 0);
+	if (!(teco_view_ssm(teco_qreg_view,
+	                    SCI_GETLINECHARACTERINDEX, 0, 0) & SC_LINECHARACTERINDEX_UTF32))
+		teco_view_ssm(teco_qreg_view, SCI_ALLOCATELINECHARACTERINDEX,
+		              SC_LINECHARACTERINDEX_UTF32, 0);
 }
 
 /** @memberof teco_doc_t */