aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRobin Haberkorn <robin.haberkorn@googlemail.com>2024-08-30 16:15:16 +0200
committerRobin Haberkorn <robin.haberkorn@googlemail.com>2024-09-09 18:22:21 +0200
commite384e4fde604564a3bc140b89bb8c1556a726464 (patch)
tree35c9a29f8a6d8362213652c863dd1bf77da60486
parent7507ad3e1816f3bc9004dceb39bb303804287438 (diff)
downloadsciteco-e384e4fde604564a3bc140b89bb8c1556a726464.tar.gz
implemented <EE> and <^E> commands for configuring encodings and translating between glyph and byte offsets (refs #5)
* ^E is heavily overloaded and can also be used to check whether a given index is valid (as it is the same check that most movement commands do internally). Besides that, it is mainly useful for interfacing with Scintilla messages. * EE takes a code page or 0 for ANSI/ASCII. Currently all documents and new registers are UTF-8. There will have to be some kind of codepage inheritance and a single-byte-only mode.
-rw-r--r--lib/fnkeys.tes32
-rw-r--r--sample.teco_ini2
-rw-r--r--src/core-commands.c129
-rw-r--r--src/doc.c1
4 files changed, 146 insertions, 18 deletions
diff --git a/lib/fnkeys.tes b/lib/fnkeys.tes
index 036445b..081e7d0 100644
--- a/lib/fnkeys.tes
+++ b/lib/fnkeys.tes
@@ -36,79 +36,79 @@
*!
@[HOME]{
- .ESLINEFROMPOSITIONESPOSITIONFROMLINEU.p
+ ESLINEFROMPOSITIONESPOSITIONFROMLINE:U.p
Q.pU.l <Q.l-."U 1; ' Q.l-.AU.c Q.c- "N Q.c-9"N Q.lU.p 1; '' %.l>
- Q.pESGETCOLUMN,4EJ
+ Q.pESGETCOLUMN,4EJ
Q.p-.M#c
}
@[HOME]{(M[HOME]}
1U[HOME]
@[END]{
- .ESLINEFROMPOSITIONESGETLINEENDPOSITIONU.p
+ ESLINEFROMPOSITIONESGETLINEENDPOSITIONU.p
Q.pESGETCOLUMN,4EJ
- Q.p-.M#c
+ Q.p:-.M#c
}
@[END]{(M[END]}
1U[END]
@[NPAGE]{
0,4EJ
- .ESLINEFROMPOSITION+(ESLINESONSCREEN)
+ ESLINEFROMPOSITION+(ESLINESONSCREEN)
ESPOSITIONFROMLINEU.p
- Q.p"< Z | Q.p '-.M#c
+ Q.p"< Z | Q.p: '-.M#c
}
@[NPAGE]{(M[NPAGE]}
1U[NPAGE]
@[PPAGE]{
0,4EJ
- .ESLINEFROMPOSITION-(ESLINESONSCREEN)U.l
- Q.l"< 0 | Q.lESPOSITIONFROMLINE '-.M#c
+ ESLINEFROMPOSITION-(ESLINESONSCREEN)U.l
+ Q.l"< 0 | Q.lESPOSITIONFROMLINE: '-.M#c
}
@[PPAGE]{(M[PPAGE]}
1U[PPAGE]
@[LEFT]{
."=0|.-1'U.p
- Q.pESGETCOLUMN,4EJ
+ Q.pESGETCOLUMN,4EJ
Q.p-.M#c
}
@[LEFT]{(M[LEFT]}
1U[LEFT]
@[SLEFT]{
- 0,0,.ESWORDSTARTPOSITIONESWORDSTARTPOSITIONU.p
+ 0,0,ESWORDSTARTPOSITIONESWORDSTARTPOSITIONU.p
Q.pESGETCOLUMN,4EJ
- Q.p-.M#c
+ Q.p:-.M#c
}
@[SLEFT]{(M[SLEFT]}
1U[SLEFT]
@[RIGHT]{
.-Z"=.|.+1'U.p
- Q.pESGETCOLUMN,4EJ
+ Q.pESGETCOLUMN,4EJ
Q.p-.M#c
}
@[RIGHT]{(M[RIGHT]}
1U[RIGHT]
@[SRIGHT]{
- 0,0,.ESWORDENDPOSITIONESWORDENDPOSITIONU.p
+ 0,0,ESWORDENDPOSITIONESWORDENDPOSITIONU.p
Q.pESGETCOLUMN,4EJ
- Q.p-.M#c
+ Q.p:-.M#c
}
@[SRIGHT]{(M[SRIGHT]}
1U[SRIGHT]
@[UP]{
- 4EJ(.ESLINEFROMPOSITION-1)ESFINDCOLUMN-.M#c
+ 4EJ(ESLINEFROMPOSITION-1)ESFINDCOLUMN:-.M#c
}
@[UP]{(M[UP]}
1U[UP]
@[DOWN]{
- 4EJ(.ESLINEFROMPOSITION+1)ESFINDCOLUMN-.M#c
+ 4EJ(ESLINEFROMPOSITION+1)ESFINDCOLUMN:-.M#c
}
@[DOWN]{(M[DOWN]}
1U[DOWN]
diff --git a/sample.teco_ini b/sample.teco_ini
index d25b176..1c7e521 100644
--- a/sample.teco_ini
+++ b/sample.teco_ini
@@ -30,7 +30,7 @@ EMQ[$SCITECOPATH]/session.tes
!edit!
! Add code here to execute when a document is edited !
- .ESGETCOLUMN,4EJ
+ ESGETCOLUMN,4EJ

!close!
diff --git a/src/core-commands.c b/src/core-commands.c
index 9281d0d..c6a9d5f 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -1761,6 +1761,67 @@ teco_state_control_radix(teco_machine_main_t *ctx, GError **error)
}
}
+/*$ ^E glyphs2bytes bytes2glyphs
+ * glyphs^E -> bytes -- Translate between glyph and byte indexes
+ * bytes:^E -> glyphs
+ * ^E -> bytes
+ * :^E -> length
+ *
+ * Translates from glyph/character to byte indexes when called
+ * without a colon.
+ * Otherwise, when colon-modified, translates from byte indexes
+ * back to glyph indexes.
+ * These values can differ in documents with multi-byte
+ * encodings (of which only UTF-8 is supported).
+ * It is especially useful to translate between these indexes
+ * when manually invoking Scintilla messages (\fBES\fP command), as
+ * they almost always take byte positions.
+ *
+ * When called without arguments, \fB^E\fP returns the current
+ * position (dot) in bytes.
+ * This is equivalent to, but faster than \(lq.^E\(rq.
+ * \fB:^E\fP without arguments returns the length of the current
+ * document in bytes, which is equivalent to, but faster than \(lqZ^E\(rq.
+ *
+ * When passing in indexes outside of the document's valid area,
+ * -1 is returned, so the return value can also be interpreted
+ * as a TECO boolean, signalling truth/success for invalid indexes.
+ * This provides an elegant and effective way to validate
+ * buffer addresses.
+ */
+static void
+teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error)
+{
+ teco_int_t res; /* result; pushed onto the expression stack at the end */
+
+ if (!teco_expressions_eval(FALSE, error))
+ return; /* evaluation failed; presumably `error` has been set -- confirm */
+ if (!teco_expressions_args()) {
+ /*
+ * No argument on the stack: ask Scintilla directly for the
+ * document length (colon-modified) or the current position.
+ *
+ * This is shorter than .^E or Z^E and avoids unnecessary glyph to
+ * byte index translations.
+ * On the other hand :^E is inconsistent, as it will return a byte
+ * index, instead of glyph index.
+ */
+ res = teco_interface_ssm(teco_machine_main_eval_colon(ctx)
+ ? SCI_GETLENGTH : SCI_GETCURRENTPOS, 0, 0);
+ } else {
+ teco_int_t pos; /* the index to translate, popped from the stack */
+ if (!teco_expressions_pop_num_calc(&pos, 0, error))
+ return;
+ if (teco_machine_main_eval_colon(ctx)) {
+ /*
+ * Bytes to glyphs.
+ * teco_bytes2glyphs() does not check addresses, so byte
+ * offsets outside of [0, document length] are answered
+ * with -1 here without calling it.
+ */
+ res = 0 <= pos && pos <= teco_interface_ssm(SCI_GETLENGTH, 0, 0)
+ ? teco_bytes2glyphs(pos) : -1;
+ } else {
+ /*
+ * Glyphs to bytes.
+ * No range check is done here:
+ * negative values for invalid indexes are passed down.
+ */
+ res = teco_glyphs2bytes(pos);
+ }
+ }
+
+ teco_expressions_push(res);
+}
+
static teco_state_t *
teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
{
@@ -1787,7 +1848,8 @@ teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
['C'] = {&teco_state_start, teco_state_control_exit},
['O'] = {&teco_state_start, teco_state_control_octal},
['D'] = {&teco_state_start, teco_state_control_decimal},
- ['R'] = {&teco_state_start, teco_state_control_radix}
+ ['R'] = {&teco_state_start, teco_state_control_radix},
+ ['E'] = {&teco_state_start, teco_state_control_glyphs2bytes}
};
/*
@@ -2350,6 +2412,70 @@ teco_state_ecommand_eol(teco_machine_main_t *ctx, GError **error)
}
}
+/*$ EE encoding codepage charset
+ * codepageEE -- Edit current document's encoding (codepage/charset)
+ * EE -> codepage
+ *
+ * When called with an argument, it sets the current codepage,
+ * otherwise returns it.
+ * 65001 (UTF-8) is the default for new buffers.
+ * 0 (ANSI) should be used when working with raw bytes.
+ */
+static void
+teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error)
+{
+ if (!teco_expressions_eval(FALSE, error))
+ return;
+
+ sptr_t old_cp = teco_interface_ssm(SCI_GETCODEPAGE, 0, 0); /* code page in effect before this command */
+
+ if (!teco_expressions_args()) {
+ teco_expressions_push(old_cp ? : teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0)); /* GNU `?:` -- a code page of 0 falls back to STYLE_DEFAULT's character set */
+ return;
+ }
+
+ /*
+ * Set code page
+ *
+ * The undo tokens restoring the previous state are recorded first,
+ * and only if the current document requires undo.
+ * Which tokens are recorded depends on whether the document was
+ * UTF-8 before: either the UTF-8 code page and the UTF-32 line
+ * character index are restored, or the old code page and
+ * STYLE_DEFAULT's character set are restored and the index is
+ * released again.
+ * NOTE(review): the tokens are queued before the argument is
+ * popped below -- presumably harmless if popping fails; confirm.
+ */
+ if (teco_current_doc_must_undo()) {
+ if (old_cp == SC_CP_UTF8) {
+ undo__teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
+ SC_LINECHARACTERINDEX_UTF32, 0);
+ undo__teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
+ } else {
+ undo__teco_interface_ssm(SCI_SETCODEPAGE, old_cp, 0);
+ undo__teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT,
+ teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0));
+ undo__teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+ SC_LINECHARACTERINDEX_UTF32, 0);
+ }
+ }
+
+ teco_int_t v; /* the requested code page / character set */
+ if (!teco_expressions_pop_num_calc(&v, 0, error))
+ return;
+ if (v == SC_CP_UTF8) {
+ teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
+ /*
+ * UTF-8 documents strictly require the line character index.
+ * See teco_glyphs2bytes() and teco_bytes2glyphs().
+ */
+ teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
+ SC_LINECHARACTERINDEX_UTF32, 0);
+ return;
+ }
+
+ teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+ SC_LINECHARACTERINDEX_UTF32, 0); /* any other value: drop the UTF-32 index again */
+ teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, v); /* the argument is stored as STYLE_DEFAULT's character set */
+ /* 0 is used for ALL single-byte encodings */
+ teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
+ /*
+ * FIXME: Should we attempt any code page conversion via
+ * g_iconv()?
+ */
+}
+
/*$ EX exit
* [bool]EX -- Exit program
* -EX
@@ -2435,6 +2561,7 @@ teco_state_ecommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
['D'] = {&teco_state_start, teco_state_ecommand_flags},
['J'] = {&teco_state_start, teco_state_ecommand_properties},
['L'] = {&teco_state_start, teco_state_ecommand_eol},
+ ['E'] = {&teco_state_start, teco_state_ecommand_encoding},
['X'] = {&teco_state_start, teco_state_ecommand_exit}
};
diff --git a/src/doc.c b/src/doc.c
index 0360b43..4e41e8a 100644
--- a/src/doc.c
+++ b/src/doc.c
@@ -60,6 +60,7 @@ teco_doc_edit(teco_doc_t *ctx)
*
* FIXME: This apparently gets reset with every SCI_SETDOCPOINTER.
* Does that mean the index needs to be recalculated repeatedly as well?
+ * What if the document/register is made non-UTF-8 afterwards?
*/
teco_view_ssm(teco_qreg_view, SCI_ALLOCATELINECHARACTERINDEX,
SC_LINECHARACTERINDEX_UTF32, 0);