:EE can be used to perform codepage conversions now (refs #5)

* I decoded the Scintilla charset values into codepages, at least those used on Gtk.
* Make sure that the line character index is not allocated or released too often, as it is actually internally reference counted, which could otherwise result in it being missing when we really need it.
* The line character index still appears to be released whenever the document pointer changes, which will happen after using a different Q-Register. This could be a performance bottleneck (FIXME).
author: Robin Haberkorn <robin.haberkorn@googlemail.com> 2024-09-02 15:33:00 +0200
committer: Robin Haberkorn <robin.haberkorn@googlemail.com> 2024-09-09 18:22:21 +0200
commit: 33124e3d469d028f367b5fcd1f1a7197754f8f09 (patch)
tree: bd864a0d11213556c8495c84d2362b70d56d5b18
parent: e466218d6c608ec4456384dc94aefafdb5b60586 (diff)
download: sciteco-33124e3d469d028f367b5fcd1f1a7197754f8f09.tar.gz
2 files changed, 231 insertions, 35 deletions
diff --git a/src/core-commands.c b/src/core-commands.c
index 98097bb..edd7e35 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -1771,6 +1771,9 @@ teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error)
 
 	if (!teco_expressions_eval(FALSE, error))
 		return;
+
+	gboolean colon_modified = teco_machine_main_eval_colon(ctx);
+
 	if (!teco_expressions_args()) {
 		/*
 		 * This is shorter than .^E or Z^E and avoids unnecessary glyph to
@@ -1778,13 +1781,12 @@ teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error)
 		 * On the other hand :^E is inconsistent, as it will return a byte
 		 * index, instead of glyph index.
 		 */
-		res = teco_interface_ssm(teco_machine_main_eval_colon(ctx)
-		                         ? SCI_GETLENGTH : SCI_GETCURRENTPOS, 0, 0);
+		res = teco_interface_ssm(colon_modified ? SCI_GETLENGTH : SCI_GETCURRENTPOS, 0, 0);
 	} else {
 		teco_int_t pos;
 		if (!teco_expressions_pop_num_calc(&pos, 0, error))
 			return;
-		if (teco_machine_main_eval_colon(ctx)) {
+		if (colon_modified) {
 			/* teco_bytes2glyphs() does not check addresses */
 			res = 0 <= pos && pos <= teco_interface_ssm(SCI_GETLENGTH, 0, 0)
 				? teco_bytes2glyphs(pos) : -1;
@@ -2387,14 +2389,102 @@ teco_state_ecommand_eol(teco_machine_main_t *ctx, GError **error)
 	}
 }
 
+static const gchar *
+teco_codepage2str(guint codepage)
+{
+	/*
+	 * The multi-byte charsets are excluded, since we don't
+	 * support them in SciTECO, even though Scintilla has them.
+	 * Contrary to the Scintilla documentation, Gtk supports
+	 * most of them.
+	 * Those that are supported are tested, so the codepage
+	 * mapping should be definitive (although there could be
+	 * similar related codepages).
+	 */
+	switch (codepage) {
+	case SC_CP_UTF8:		return "UTF-8";
+	case SC_CHARSET_ANSI:
+	case SC_CHARSET_DEFAULT:	return "ISO-8859-1"; /* LATIN1 */
+	case SC_CHARSET_BALTIC:		return "ISO-8859-13"; /* LATIN7 */
+	//case SC_CHARSET_CHINESEBIG5:	return "BIG5";
+	case SC_CHARSET_EASTEUROPE:	return "ISO-8859-2"; /* LATIN2 */
+	//case SC_CHARSET_GB2312:	return "GB2312";
+	case SC_CHARSET_GREEK:		return "ISO-8859-7"; // CP1253???
+	//case SC_CHARSET_HANGUL:	return "UHC";
+	/* unsure whether this is supported on Gtk */
+	case SC_CHARSET_MAC:		return "MAC";
+	/* not supported by Gtk */
+	case SC_CHARSET_OEM:		return "CP437";
+	/*
+	 * Apparently, this can be CP1251 on the native Windows
+	 * port of Scintilla.
+	 */
+	case SC_CHARSET_RUSSIAN:	return "KOI8-R";
+	case SC_CHARSET_OEM866:		return "CP866";
+	case SC_CHARSET_CYRILLIC:	return "CP1251";
+	//case SC_CHARSET_SHIFTJIS:	return "SHIFT-JIS";
+	//case SC_CHARSET_SYMBOL:
+	case SC_CHARSET_TURKISH:	return "ISO-8859-9"; /* LATIN5 */
+	//case SC_CHARSET_JOHAB:	return "JOHAB";
+	case SC_CHARSET_HEBREW:		return "ISO-8859-8"; // CP1255?
+	/*
+	 * FIXME: Some arabic codepage is supported by Gtk,
+	 * but I am not sure which.
+	 */
+	case SC_CHARSET_ARABIC:		return "ISO-8859-6"; // CP720, CP1256???
+	/* apparently not supported by Gtk */
+	case SC_CHARSET_VIETNAMESE:	return "CP1258";
+	case SC_CHARSET_THAI:		return "ISO-8859-11";
+	case SC_CHARSET_8859_15:	return "ISO-8859-15"; /* LATIN9 */
+	}
+
+	return NULL;
+}
+
 /*$ EE encoding codepage charset
  * codepageEE -- Edit current document's encoding (codepage/charset)
  * EE -> codepage
+ * codepage:EE
+ * :EE -> codepage
  *
  * When called with an argument, it sets the current codepage,
  * otherwise returns it.
+ * The following codepages are supported:
+ * - 0: ANSI (raw bytes)
+ * - 1: ISO-8859-1 (latin1)
+ * - 77: Macintosh Latin encoding
+ * - 161: ISO-8859-7
+ * - 162: ISO-8859-9 (latin5)
+ * - 163: CP1258
+ * - 177: ISO-8859-8
+ * - 178: ISO-8859-6
+ * - 186: ISO-8859-13 (latin7)
+ * - 204: KOI8-R
+ * - 222: ISO-8859-11
+ * - 238: ISO-8859-2 (latin2)
+ * - 255: CP437
+ * - 866: CP866
+ * - 1000: ISO-8859-15 (latin9)
+ * - 1251: CP1251
+ * - 65001: UTF-8
+ *
+ * Displaying characters in the single-byte (non-UTF-8) codepages might
+ * be supported only with the Gtk UI.
+ * At least 77, 178, 163 and 255 are not displayed correctly on Gtk.
  * 65001 (UTF-8) is the default for new buffers.
- * 0 (ANSI) should be used when working with raw bytes.
+ * 0 (ANSI) should be used when working with raw bytes,
+ * but is currently displayed like ISO-8859-1 (latin1).
+ *
+ * \fBEE\fP does not change the buffer contents itself by default, only
+ * how it is displayed and how \*(ST interacts with it.
+ * This allows fixing up the codepage if it is not in the default UTF-8
+ * or if codepage guessing failed.
+ *
+ * When colon-modified the \fB:EE\fP command will also additionally convert
+ * the current buffer contents into the new code page, preserving the
+ * current position (dot).
+ * This will fail if the conversion would be lossy.
+ * Conversions from and to UTF-8 \fIshould\fP always be successful.
  */
 static void
 teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error)
@@ -2402,53 +2492,153 @@ teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error)
 	if (!teco_expressions_eval(FALSE, error))
 		return;
 
-	sptr_t old_cp = teco_interface_ssm(SCI_GETCODEPAGE, 0, 0);
+	gboolean colon_modified = teco_machine_main_eval_colon(ctx);
+
+	sptr_t old_cp = teco_interface_ssm(SCI_GETCODEPAGE, 0, 0)
+				? : teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0);
 
 	if (!teco_expressions_args()) {
-		teco_expressions_push(old_cp ? : teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0));
+		/* get current code page */
+		teco_expressions_push(old_cp);
 		return;
 	}
 
 	/*
 	 * Set code page
 	 */
-	if (teco_current_doc_must_undo()) {
-		if (old_cp == SC_CP_UTF8) {
+	teco_int_t new_cp;
+	if (!teco_expressions_pop_num_calc(&new_cp, 0, error))
+		return;
+
+	if (old_cp == SC_CP_UTF8 && new_cp == SC_CP_UTF8)
+		return;
+
+	if (teco_current_doc_must_undo() && teco_undo_enabled) {
+		if (old_cp == SC_CP_UTF8) { /* new_cp != SC_CP_UTF8 */
 			undo__teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
 			                         SC_LINECHARACTERINDEX_UTF32, 0);
 			undo__teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
 		} else {
-			undo__teco_interface_ssm(SCI_SETCODEPAGE, old_cp, 0);
-			undo__teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT,
-			                         teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0));
-			undo__teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
-			                         SC_LINECHARACTERINDEX_UTF32, 0);
+			undo__teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
+			for (gint style = 0; style <= STYLE_LASTPREDEFINED; style++)
+				undo__teco_interface_ssm(SCI_STYLESETCHARACTERSET, style, old_cp);
+			/*
+			 * The index is internally reference-counted and could underflow,
+			 * so don't do it more than necessary.
+			 */
+			if (new_cp == SC_CP_UTF8)
+				undo__teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+				                         SC_LINECHARACTERINDEX_UTF32, 0);
 		}
 	}
 
-	teco_int_t v;
-	if (!teco_expressions_pop_num_calc(&v, 0, error))
-		return;
-	if (v == SC_CP_UTF8) {
+	teco_int_t dot_glyphs;
+	if (colon_modified) {
+		sptr_t dot_bytes = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
+		dot_glyphs = teco_bytes2glyphs(dot_bytes);
+
+		/*
+		 * Convert buffer to new codepage.
+		 *
+		 * FIXME: Could be optimized slightly by converting first
+		 * before the gap, inserting the converted text and then
+		 * converting after the gap.
+		 */
+		const gchar *to_codepage = teco_codepage2str(new_cp);
+		const gchar *from_codepage = teco_codepage2str(old_cp);
+		if (!to_codepage || !from_codepage) {
+			g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED,
+			                    "Unknown or unsupported codepage/charset");
+			return;
+		}
+
+		const gchar *buf = (const gchar *)teco_interface_ssm(SCI_GETCHARACTERPOINTER, 0, 0);
+		gsize len = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
+		g_autofree gchar *converted;
+		gsize converted_len;
+
+		/*
+		 * This fails if there is no direct translation.
+		 * If we'd use g_convert_with_fallback(), it would be tricky to choose
+		 * fallback characters that will always work.
+		 */
+		converted = g_convert(buf, len, to_codepage, from_codepage,
+		                      NULL, &converted_len, error);
+		if (!converted)
+			return;
+
+		teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0);
+		teco_interface_ssm(SCI_CLEARALL, 0, 0);
+		teco_interface_ssm(SCI_APPENDTEXT, converted_len, (sptr_t)converted);
+		teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0);
+		teco_ring_dirtify();
+
+		if (teco_current_doc_must_undo()) {
+			undo__teco_interface_ssm(SCI_GOTOPOS, dot_bytes, 0);
+			undo__teco_interface_ssm(SCI_UNDO, 0, 0);
+		}
+	}
+
+	if (new_cp == SC_CP_UTF8) {
 		teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
 		/*
 		 * UTF-8 documents strictly require the line character index.
 		 * See teco_glyphs2bytes() and teco_bytes2glyphs().
 		 */
+		g_assert(!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0)
+						& SC_LINECHARACTERINDEX_UTF32));
 		teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
 		                   SC_LINECHARACTERINDEX_UTF32, 0);
-		return;
+	} else {
+		/*
+		 * The index is NOT released automatically when setting the codepage.
+		 * But it is internally reference-counted and could underflow,
+		 * so don't do it more than necessary.
+		 */
+		if (old_cp == SC_CP_UTF8) {
+			teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+			                   SC_LINECHARACTERINDEX_UTF32, 0);
+			g_assert(!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0)
+							& SC_LINECHARACTERINDEX_UTF32));
+		}
+
+		/*
+		 * Configure a single-byte codepage/charset.
+		 * This requires setting it on all of the possible styles.
+		 * Unfortunately there can theoretically even be 255 (STYLE_MAX) styles.
+		 * This is important only for display purposes - other than that
+		 * all single-byte encodings are handled the same.
+		 *
+		 * FIXME: Should we avoid this if new_cp == 0?
+		 * It will be used for raw byte handling mostly.
+		 * Perhaps we should even set char representations appropriately
+		 * for all non-ANSI codepoints in the 0 codepage.
+		 * But this would also be costly...
+		 */
+		if (teco_current_doc_must_undo()) {
+			/*
+			 * There is a chance the user will see this buffer even if we
+			 * are currently in batch mode.
+			 */
+			for (gint style = 0; style <= STYLE_LASTPREDEFINED; style++)
+				teco_interface_ssm(SCI_STYLESETCHARACTERSET, style, new_cp);
+		} else {
+			/* we must still set it, so that <EE> retrieval works */
+			teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, new_cp);
+		}
+		/* 0 is used for ALL single-byte encodings */
+		teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
 	}
 
-	teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
-	                   SC_LINECHARACTERINDEX_UTF32, 0);
-	teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, v);
-	/* 0 is used for ALL single-byte encodings */
-	teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
-	/*
-	 * FIXME: Should we attempt any code page conversion via
-	 * g_iconv()?
-	 */
+	if (colon_modified)
+		/*
+		 * Only now, it will be safe to recalculate dot in the new encoding.
+		 * If the new codepage is UTF-8, the line character index will be
+		 * ready only now.
+		 * FIXME: Apparently the line character index is still not ready
+		 * after switching to UTF-8!
+		 */
+		teco_interface_ssm(SCI_GOTOPOS, teco_glyphs2bytes(dot_glyphs), 0);
 }
 
 /*$ EX exit
diff --git a/src/doc.c b/src/doc.c
index 4e41e8a..12413af 100644
--- a/src/doc.c
+++ b/src/doc.c
@@ -48,22 +48,28 @@ teco_doc_edit(teco_doc_t *ctx)
 	teco_view_ssm(teco_qreg_view, SCI_SETSEL, ctx->anchor, (sptr_t)ctx->dot);
 
 	/*
-	 * NOTE: Thanks to a custom Scintilla patch, se representations
+	 * NOTE: Thanks to a custom Scintilla patch, representations
 	 * do not get reset after SCI_SETDOCPOINTER, so they have to be
 	 * initialized only once.
 	 */
 	//teco_view_set_representations(teco_qreg_view);
 
 	/*
-	 * Documents are UTF-8 by default and all UTF-8 documents
-	 * are expected to have a character index.
+	 * All UTF-8 documents are expected to have a character index.
+	 * This allocates nothing if the document is not UTF-8.
+	 * But it is reference counted, so it must not be allocated
+	 * more than once.
 	 *
-	 * FIXME: This apparently gets reset with every SCI_SETDOCPOINTER.
-	 * Does that mean the index needs to be recalculated repeatedly as well?
-	 * What if the document/register is made non-UTF-8 afterwards?
+	 * FIXME: This apparently gets reset with every SCI_SETDOCPOINTER
+	 * (although I don't know why and where).
+	 * Recalculating it could be inefficient.
+	 * The index is reference-counted. Perhaps we could just allocate
+	 * one more time, so it doesn't get freed when changing documents.
 	 */
-	teco_view_ssm(teco_qreg_view, SCI_ALLOCATELINECHARACTERINDEX,
-	              SC_LINECHARACTERINDEX_UTF32, 0);
+	if (!(teco_view_ssm(teco_qreg_view,
+	                    SCI_GETLINECHARACTERINDEX, 0, 0) & SC_LINECHARACTERINDEX_UTF32))
+		teco_view_ssm(teco_qreg_view, SCI_ALLOCATELINECHARACTERINDEX,
+		              SC_LINECHARACTERINDEX_UTF32, 0);
 }
 
 /** @memberof teco_doc_t */
author	Robin Haberkorn <robin.haberkorn@googlemail.com>	2024-09-02 15:33:00 +0200
committer	Robin Haberkorn <robin.haberkorn@googlemail.com>	2024-09-09 18:22:21 +0200
commit	33124e3d469d028f367b5fcd1f1a7197754f8f09 (patch)
tree	bd864a0d11213556c8495c84d2362b70d56d5b18
parent	e466218d6c608ec4456384dc94aefafdb5b60586 (diff)
download	sciteco-33124e3d469d028f367b5fcd1f1a7197754f8f09.tar.gz