From e384e4fde604564a3bc140b89bb8c1556a726464 Mon Sep 17 00:00:00 2001
From: Robin Haberkorn <robin.haberkorn@googlemail.com>
Date: Fri, 30 Aug 2024 16:15:16 +0200
Subject: implemented <EE> and <^E> commands for configuring encodings and
 translating between glyph and byte offsets (refs #5)

* ^E is heavily overloaded and can also be used to check whether a given index is valid
  (it performs the same check that most movement commands do internally).
  Besides that, it is mainly useful for interfacing with Scintilla messages.
* EE takes a code page or 0 for ANSI/ASCII.
  Currently all documents and new registers are UTF-8.
  There will have to be some kind of codepage inheritance and a single-byte-only mode.
---
 lib/fnkeys.tes      |  32 ++++++-------
 sample.teco_ini     |   2 +-
 src/core-commands.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 src/doc.c           |   1 +
 4 files changed, 146 insertions(+), 18 deletions(-)
diff --git a/lib/fnkeys.tes b/lib/fnkeys.tes
index 036445b..081e7d0 100644
--- a/lib/fnkeys.tes
+++ b/lib/fnkeys.tes
@@ -36,79 +36,79 @@
  *!
 
 @[HOME]{
-  .ESLINEFROMPOSITIONESPOSITIONFROMLINEU.p
+  ESLINEFROMPOSITIONESPOSITIONFROMLINE:U.p
   Q.pU.l <Q.l-."U 1; ' Q.l-.AU.c Q.c- "N Q.c-9"N Q.lU.p 1; '' %.l>
-  Q.pESGETCOLUMN,4EJ
+  Q.pESGETCOLUMN,4EJ
   Q.p-.M#c
 }
 @[HOME]{(M[HOME]}
 1U[HOME]
 
 @[END]{
-  .ESLINEFROMPOSITIONESGETLINEENDPOSITIONU.p
+  ESLINEFROMPOSITIONESGETLINEENDPOSITIONU.p
   Q.pESGETCOLUMN,4EJ
-  Q.p-.M#c
+  Q.p:-.M#c
 }
 @[END]{(M[END]}
 1U[END]
 
 @[NPAGE]{
   0,4EJ
-  .ESLINEFROMPOSITION+(ESLINESONSCREEN)
+  ESLINEFROMPOSITION+(ESLINESONSCREEN)
   ESPOSITIONFROMLINEU.p
-  Q.p"< Z | Q.p '-.M#c
+  Q.p"< Z | Q.p: '-.M#c
 }
 @[NPAGE]{(M[NPAGE]}
 1U[NPAGE]
 
 @[PPAGE]{
   0,4EJ
-  .ESLINEFROMPOSITION-(ESLINESONSCREEN)U.l
-  Q.l"< 0 | Q.lESPOSITIONFROMLINE '-.M#c
+  ESLINEFROMPOSITION-(ESLINESONSCREEN)U.l
+  Q.l"< 0 | Q.lESPOSITIONFROMLINE: '-.M#c
 }
 @[PPAGE]{(M[PPAGE]}
 1U[PPAGE]
 
 @[LEFT]{
   ."=0|.-1'U.p
-  Q.pESGETCOLUMN,4EJ
+  Q.pESGETCOLUMN,4EJ
   Q.p-.M#c
 }
 @[LEFT]{(M[LEFT]}
 1U[LEFT]
 
 @[SLEFT]{
-  0,0,.ESWORDSTARTPOSITIONESWORDSTARTPOSITIONU.p
+  0,0,ESWORDSTARTPOSITIONESWORDSTARTPOSITIONU.p
   Q.pESGETCOLUMN,4EJ
-  Q.p-.M#c
+  Q.p:-.M#c
 }
 @[SLEFT]{(M[SLEFT]}
 1U[SLEFT]
 
 @[RIGHT]{
   .-Z"=.|.+1'U.p
-  Q.pESGETCOLUMN,4EJ
+  Q.pESGETCOLUMN,4EJ
   Q.p-.M#c
 }
 @[RIGHT]{(M[RIGHT]}
 1U[RIGHT]
 
 @[SRIGHT]{
-  0,0,.ESWORDENDPOSITIONESWORDENDPOSITIONU.p
+  0,0,ESWORDENDPOSITIONESWORDENDPOSITIONU.p
   Q.pESGETCOLUMN,4EJ
-  Q.p-.M#c
+  Q.p:-.M#c
 }
 @[SRIGHT]{(M[SRIGHT]}
 1U[SRIGHT]
 
 @[UP]{
-  4EJ(.ESLINEFROMPOSITION-1)ESFINDCOLUMN-.M#c
+  4EJ(ESLINEFROMPOSITION-1)ESFINDCOLUMN:-.M#c
 }
 @[UP]{(M[UP]}
 1U[UP]
 
 @[DOWN]{
-  4EJ(.ESLINEFROMPOSITION+1)ESFINDCOLUMN-.M#c
+  4EJ(ESLINEFROMPOSITION+1)ESFINDCOLUMN:-.M#c
 }
 @[DOWN]{(M[DOWN]}
 1U[DOWN]
diff --git a/sample.teco_ini b/sample.teco_ini
index d25b176..1c7e521 100644
--- a/sample.teco_ini
+++ b/sample.teco_ini
@@ -30,7 +30,7 @@ EMQ[$SCITECOPATH]/session.tes
 
   !edit!
     ! Add code here to execute when a document is edited !
-    .ESGETCOLUMN,4EJ
+    ESGETCOLUMN,4EJ
     
 
   !close!
diff --git a/src/core-commands.c b/src/core-commands.c
index 9281d0d..c6a9d5f 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -1761,6 +1761,67 @@ teco_state_control_radix(teco_machine_main_t *ctx, GError **error)
 	}
 }
 
+/*$ ^E glyphs2bytes bytes2glyphs
+ * glyphs^E -> bytes -- Translate between glyph and byte indexes
+ * bytes:^E -> glyphs
+ * ^E -> bytes
+ * :^E -> length
+ *
+ * Translates from glyph/character indexes to byte indexes when
+ * called without a colon.
+ * When colon-modified, it translates from byte indexes
+ * back to glyph indexes instead.
+ * These values can differ in documents with multi-byte
+ * encodings (of which only UTF-8 is supported).
+ * It is especially useful to translate between these indexes
+ * when manually invoking Scintilla messages (\fBES\fP command), as
+ * they almost always take byte positions.
+ *
+ * When called without arguments, \fB^E\fP returns the current
+ * position (dot) in bytes.
+ * This is equivalent to, but faster than, \(lq.^E\(rq.
+ * \fB:^E\fP without arguments returns the length of the current
+ * document in bytes, which is equivalent to, but faster than, \(lqZ^E\(rq.
+ *
+ * When passing in indexes outside of the document's valid area,
+ * -1 is returned, so the return value can also be interpreted
+ * as a TECO boolean, which is true (-1) for invalid indexes.
+ * This provides an elegant and effective way to validate
+ * buffer addresses.
+ */
+static void
+teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error)
+{
+	teco_int_t res;
+
+	if (!teco_expressions_eval(FALSE, error))
+		return;
+	if (!teco_expressions_args()) {
+		/*
+		 * No argument: query Scintilla directly. This is shorter than .^E or
+		 * Z^E and avoids unnecessary glyph to byte index translations.
+		 * On the other hand :^E is inconsistent, as it returns a byte count
+		 * (the document length) instead of a glyph index.
+		 */
+		res = teco_interface_ssm(teco_machine_main_eval_colon(ctx)
+		                         ? SCI_GETLENGTH : SCI_GETCURRENTPOS, 0, 0);
+	} else {
+		teco_int_t pos;
+		if (!teco_expressions_pop_num_calc(&pos, 0, error))
+			return;
+		if (teco_machine_main_eval_colon(ctx)) {
+			/* teco_bytes2glyphs() does not check addresses, so validate the byte range here */
+			res = 0 <= pos && pos <= teco_interface_ssm(SCI_GETLENGTH, 0, 0)
+				? teco_bytes2glyphs(pos) : -1;
+		} else {
+			/* The result is passed down as is, including negative values for invalid indexes. */
+			res = teco_glyphs2bytes(pos);
+		}
+	}
+
+	teco_expressions_push(res);
+}
+
 static teco_state_t *
 teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
 {
@@ -1787,7 +1848,8 @@ teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
 		['C']  = {&teco_state_start, teco_state_control_exit},
 		['O']  = {&teco_state_start, teco_state_control_octal},
 		['D']  = {&teco_state_start, teco_state_control_decimal},
-		['R']  = {&teco_state_start, teco_state_control_radix}
+		['R']  = {&teco_state_start, teco_state_control_radix},
+		['E']  = {&teco_state_start, teco_state_control_glyphs2bytes}
 	};
 
 	/*
@@ -2350,6 +2412,70 @@ teco_state_ecommand_eol(teco_machine_main_t *ctx, GError **error)
 	}
 }
 
+/*$ EE encoding codepage charset
+ * codepageEE -- Edit current document's encoding (codepage/charset)
+ * EE -> codepage
+ *
+ * When called with an argument, it sets the current codepage.
+ * Otherwise it returns the codepage or, if that is 0 (single-byte
+ * encoding), the character set of the default style.
+ * 65001 (UTF-8) is the default for new buffers; 0 (ANSI) is meant for raw bytes.
+ */
+static void
+teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error)
+{
+	if (!teco_expressions_eval(FALSE, error))
+		return;
+
+	sptr_t old_cp = teco_interface_ssm(SCI_GETCODEPAGE, 0, 0);
+
+	if (!teco_expressions_args()) {
+		teco_expressions_push(old_cp ? : teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0));
+		return;
+	}
+
+	/*
+	 * Set code page; first queue the calls that (presumably on undo) restore the old encoding.
+	 */
+	if (teco_current_doc_must_undo()) {
+		if (old_cp == SC_CP_UTF8) {
+			undo__teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
+			                         SC_LINECHARACTERINDEX_UTF32, 0);
+			undo__teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
+		} else {
+			undo__teco_interface_ssm(SCI_SETCODEPAGE, old_cp, 0);
+			undo__teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT,
+			                         teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0));
+			undo__teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+			                         SC_LINECHARACTERINDEX_UTF32, 0);
+		}
+	}
+
+	teco_int_t v;
+	if (!teco_expressions_pop_num_calc(&v, 0, error))
+		return;
+	if (v == SC_CP_UTF8) {
+		teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
+		/*
+		 * UTF-8 documents strictly require the line character index.
+		 * See teco_glyphs2bytes() and teco_bytes2glyphs().
+		 */
+		teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
+		                   SC_LINECHARACTERINDEX_UTF32, 0);
+		return;
+	}
+
+	teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+	                   SC_LINECHARACTERINDEX_UTF32, 0);
+	teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, v);
+	/* 0 is used for ALL single-byte encodings; v only selects the character set above */
+	teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
+	/*
+	 * FIXME: Should we attempt any code page conversion via
+	 * g_iconv()?
+	 */
+}
+
 /*$ EX exit
  * [bool]EX -- Exit program
  * -EX
@@ -2435,6 +2561,7 @@ teco_state_ecommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
 		['D']  = {&teco_state_start, teco_state_ecommand_flags},
 		['J']  = {&teco_state_start, teco_state_ecommand_properties},
 		['L']  = {&teco_state_start, teco_state_ecommand_eol},
+		['E']  = {&teco_state_start, teco_state_ecommand_encoding},
 		['X']  = {&teco_state_start, teco_state_ecommand_exit}
 	};
 
diff --git a/src/doc.c b/src/doc.c
index 0360b43..4e41e8a 100644
--- a/src/doc.c
+++ b/src/doc.c
@@ -60,6 +60,7 @@ teco_doc_edit(teco_doc_t *ctx)
 	 *
 	 * FIXME: This apparently gets reset with every SCI_SETDOCPOINTER.
 	 * Does that mean the index needs to be recalculated repeatedly as well?
+	 * What if the document/register is made non-UTF-8 afterwards?
 	 */
 	teco_view_ssm(teco_qreg_view, SCI_ALLOCATELINECHARACTERINDEX,
 	              SC_LINECHARACTERINDEX_UTF32, 0);
-- 
cgit v1.2.3