From e384e4fde604564a3bc140b89bb8c1556a726464 Mon Sep 17 00:00:00 2001 From: Robin Haberkorn Date: Fri, 30 Aug 2024 16:15:16 +0200 Subject: implemented <EE> and <^E> commands for configuring encodings and translating between glyph and byte offsets (refs #5) * ^E is heavily overloaded and can also be used to check whether a given index is valid (as it is the same that most movement commands do internally). Besides that, it is mainly useful for interfacing with Scintilla messages. * EE takes a code page or 0 for ANSI/ASCII. Currently all documents and new registers are UTF-8. There will have to be some kind of codepage inheritance and a single-byte-only mode. --- lib/fnkeys.tes | 32 ++++++------- sample.teco_ini | 2 +- src/core-commands.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++- src/doc.c | 1 + 4 files changed, 146 insertions(+), 18 deletions(-) diff --git a/lib/fnkeys.tes b/lib/fnkeys.tes index 036445b..081e7d0 100644 --- a/lib/fnkeys.tes +++ b/lib/fnkeys.tes @@ -36,79 +36,79 @@ *! 
@[HOME]{ - .ESLINEFROMPOSITIONESPOSITIONFROMLINEU.p + ESLINEFROMPOSITIONESPOSITIONFROMLINE:U.p Q.pU.l - Q.pESGETCOLUMN,4EJ + Q.pESGETCOLUMN,4EJ Q.p-.M#c } @[HOME]{(M[HOME]} 1U[HOME] @[END]{ - .ESLINEFROMPOSITIONESGETLINEENDPOSITIONU.p + ESLINEFROMPOSITIONESGETLINEENDPOSITIONU.p Q.pESGETCOLUMN,4EJ - Q.p-.M#c + Q.p:-.M#c } @[END]{(M[END]} 1U[END] @[NPAGE]{ 0,4EJ - .ESLINEFROMPOSITION+(ESLINESONSCREEN) + ESLINEFROMPOSITION+(ESLINESONSCREEN) ESPOSITIONFROMLINEU.p - Q.p"< Z | Q.p '-.M#c + Q.p"< Z | Q.p: '-.M#c } @[NPAGE]{(M[NPAGE]} 1U[NPAGE] @[PPAGE]{ 0,4EJ - .ESLINEFROMPOSITION-(ESLINESONSCREEN)U.l - Q.l"< 0 | Q.lESPOSITIONFROMLINE '-.M#c + ESLINEFROMPOSITION-(ESLINESONSCREEN)U.l + Q.l"< 0 | Q.lESPOSITIONFROMLINE: '-.M#c } @[PPAGE]{(M[PPAGE]} 1U[PPAGE] @[LEFT]{ ."=0|.-1'U.p - Q.pESGETCOLUMN,4EJ + Q.pESGETCOLUMN,4EJ Q.p-.M#c } @[LEFT]{(M[LEFT]} 1U[LEFT] @[SLEFT]{ - 0,0,.ESWORDSTARTPOSITIONESWORDSTARTPOSITIONU.p + 0,0,ESWORDSTARTPOSITIONESWORDSTARTPOSITIONU.p Q.pESGETCOLUMN,4EJ - Q.p-.M#c + Q.p:-.M#c } @[SLEFT]{(M[SLEFT]} 1U[SLEFT] @[RIGHT]{ .-Z"=.|.+1'U.p - Q.pESGETCOLUMN,4EJ + Q.pESGETCOLUMN,4EJ Q.p-.M#c } @[RIGHT]{(M[RIGHT]} 1U[RIGHT] @[SRIGHT]{ - 0,0,.ESWORDENDPOSITIONESWORDENDPOSITIONU.p + 0,0,ESWORDENDPOSITIONESWORDENDPOSITIONU.p Q.pESGETCOLUMN,4EJ - Q.p-.M#c + Q.p:-.M#c } @[SRIGHT]{(M[SRIGHT]} 1U[SRIGHT] @[UP]{ - 4EJ(.ESLINEFROMPOSITION-1)ESFINDCOLUMN-.M#c + 4EJ(ESLINEFROMPOSITION-1)ESFINDCOLUMN:-.M#c } @[UP]{(M[UP]} 1U[UP] @[DOWN]{ - 4EJ(.ESLINEFROMPOSITION+1)ESFINDCOLUMN-.M#c + 4EJ(ESLINEFROMPOSITION+1)ESFINDCOLUMN:-.M#c } @[DOWN]{(M[DOWN]} 1U[DOWN] diff --git a/sample.teco_ini b/sample.teco_ini index d25b176..1c7e521 100644 --- a/sample.teco_ini +++ b/sample.teco_ini @@ -30,7 +30,7 @@ EMQ[$SCITECOPATH]/session.tes !edit! ! Add code here to execute when a document is edited ! - .ESGETCOLUMN,4EJ + ESGETCOLUMN,4EJ  !close! 
diff --git a/src/core-commands.c b/src/core-commands.c index 9281d0d..c6a9d5f 100644 --- a/src/core-commands.c +++ b/src/core-commands.c @@ -1761,6 +1761,67 @@ teco_state_control_radix(teco_machine_main_t *ctx, GError **error) } } +/*$ ^E glyphs2bytes bytes2glyphs + * glyphs^E -> bytes -- Translate between glyph and byte indexes + * bytes:^E -> glyphs + * ^E -> bytes + * :^E -> length + * + * Translates from glyph/character to byte indexes when called + * without a colon. + * Otherwise when colon-modified, translates from byte indexes + * back to glyph indexes. + * These values can differ in documents with multi-byte + * encodings (of which only UTF-8 is supported). + * It is especially useful to translate between these indexes + * when manually invoking Scintilla messages (\fBES\fP command), as + * they almost always take byte positions. + * + * When called without arguments, \fB^E\fP returns the current + * position (dot) in bytes. + * This is equivalent, but faster than \(lq.^E\(rq. + * \fB:^E\fP without arguments returns the length of the current + * document in bytes, which is equivalent but faster than \(lqZ^E\(rq. + * + * When passing in indexes outside of the document's valid area, + * -1 is returned, so the return value can also be interpreted + * as a TECO boolean, signalling truth/success for invalid indexes. + * This provides an elegant and effective way to validate + * buffer addresses. + */ +static void +teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error) +{ + teco_int_t res; + + if (!teco_expressions_eval(FALSE, error)) + return; + if (!teco_expressions_args()) { + /* + * This is shorter than .^E or Z^E and avoids unnecessary glyph to + * byte index translations. + * On the other hand :^E is inconsistent, as it will return a byte + * index, instead of glyph index. + */ + res = teco_interface_ssm(teco_machine_main_eval_colon(ctx) + ? 
SCI_GETLENGTH : SCI_GETCURRENTPOS, 0, 0); + } else { + teco_int_t pos; + if (!teco_expressions_pop_num_calc(&pos, 0, error)) + return; + if (teco_machine_main_eval_colon(ctx)) { + /* teco_bytes2glyphs() does not check addresses */ + res = 0 <= pos && pos <= teco_interface_ssm(SCI_GETLENGTH, 0, 0) + ? teco_bytes2glyphs(pos) : -1; + } else { + /* negative values for invalid indexes are passed down. */ + res = teco_glyphs2bytes(pos); + } + } + + teco_expressions_push(res); +} + static teco_state_t * teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error) { @@ -1787,7 +1848,8 @@ teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error) ['C'] = {&teco_state_start, teco_state_control_exit}, ['O'] = {&teco_state_start, teco_state_control_octal}, ['D'] = {&teco_state_start, teco_state_control_decimal}, - ['R'] = {&teco_state_start, teco_state_control_radix} + ['R'] = {&teco_state_start, teco_state_control_radix}, + ['E'] = {&teco_state_start, teco_state_control_glyphs2bytes} }; /* @@ -2350,6 +2412,70 @@ teco_state_ecommand_eol(teco_machine_main_t *ctx, GError **error) } } +/*$ EE encoding codepage charset + * codepageEE -- Edit current document's encoding (codepage/charset) + * EE -> codepage + * + * When called with an argument, it sets the current codepage, + * otherwise returns it. + * 65001 (UTF-8) is the default for new buffers. + * 0 (ANSI) should be used when working with raw bytes. + */ +static void +teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error) +{ + if (!teco_expressions_eval(FALSE, error)) + return; + + sptr_t old_cp = teco_interface_ssm(SCI_GETCODEPAGE, 0, 0); + + if (!teco_expressions_args()) { + teco_expressions_push(old_cp ? 
: teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0)); + return; + } + + /* + * Set code page + */ + if (teco_current_doc_must_undo()) { + if (old_cp == SC_CP_UTF8) { + undo__teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX, + SC_LINECHARACTERINDEX_UTF32, 0); + undo__teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0); + } else { + undo__teco_interface_ssm(SCI_SETCODEPAGE, old_cp, 0); + undo__teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, + teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0)); + undo__teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX, + SC_LINECHARACTERINDEX_UTF32, 0); + } + } + + teco_int_t v; + if (!teco_expressions_pop_num_calc(&v, 0, error)) + return; + if (v == SC_CP_UTF8) { + teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0); + /* + * UTF-8 documents strictly require the line character index. + * See teco_glyphs2bytes() and teco_bytes2glyphs(). + */ + teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX, + SC_LINECHARACTERINDEX_UTF32, 0); + return; + } + + teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX, + SC_LINECHARACTERINDEX_UTF32, 0); + teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, v); + /* 0 is used for ALL single-byte encodings */ + teco_interface_ssm(SCI_SETCODEPAGE, 0, 0); + /* + * FIXME: Should we attempt any code page conversion via + * g_iconv()? 
+ */ +} + /*$ EX exit * [bool]EX -- Exit program * -EX @@ -2435,6 +2561,7 @@ teco_state_ecommand_input(teco_machine_main_t *ctx, gchar chr, GError **error) ['D'] = {&teco_state_start, teco_state_ecommand_flags}, ['J'] = {&teco_state_start, teco_state_ecommand_properties}, ['L'] = {&teco_state_start, teco_state_ecommand_eol}, + ['E'] = {&teco_state_start, teco_state_ecommand_encoding}, ['X'] = {&teco_state_start, teco_state_ecommand_exit} }; diff --git a/src/doc.c b/src/doc.c index 0360b43..4e41e8a 100644 --- a/src/doc.c +++ b/src/doc.c @@ -60,6 +60,7 @@ teco_doc_edit(teco_doc_t *ctx) * * FIXME: This apparently gets reset with every SCI_SETDOCPOINTER. * Does that mean the index needs to be recalculated repeatedly as well? + * What if the document/register is made non-UTF-8 afterwards? */ teco_view_ssm(teco_qreg_view, SCI_ALLOCATELINECHARACTERINDEX, SC_LINECHARACTERINDEX_UTF32, 0); -- cgit v1.2.3