aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/core-commands.c
diff options
context:
space:
mode:
authorRobin Haberkorn <robin.haberkorn@googlemail.com>2024-08-30 16:15:16 +0200
committerRobin Haberkorn <robin.haberkorn@googlemail.com>2024-09-09 18:22:21 +0200
commite384e4fde604564a3bc140b89bb8c1556a726464 (patch)
tree35c9a29f8a6d8362213652c863dd1bf77da60486 /src/core-commands.c
parent7507ad3e1816f3bc9004dceb39bb303804287438 (diff)
downloadsciteco-e384e4fde604564a3bc140b89bb8c1556a726464.tar.gz
implemented <EE> and <^E> commands for configuring encodings and translating between glyph and byte offsets (refs #5)
* ^E is heavily overloaded and can also be used to check whether a given index is valid (as it performs the same check that most movement commands do internally). Besides that, it is mainly useful for interfacing with Scintilla messages. * EE takes a code page or 0 for ANSI/ASCII. Currently all documents and new registers are UTF-8. There will have to be some kind of codepage inheritance and a single-byte-only mode.
Diffstat (limited to 'src/core-commands.c')
-rw-r--r--src/core-commands.c129
1 files changed, 128 insertions, 1 deletions
diff --git a/src/core-commands.c b/src/core-commands.c
index 9281d0d..c6a9d5f 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -1761,6 +1761,67 @@ teco_state_control_radix(teco_machine_main_t *ctx, GError **error)
}
}
+/*$ ^E glyphs2bytes bytes2glyphs
+ * glyphs^E -> bytes -- Translate between glyph and byte indexes
+ * bytes:^E -> glyphs
+ * ^E -> bytes
+ * :^E -> length
+ *
+ * Translates from glyph/character to byte indexes when called
+ * without a colon.
+ * Otherwise when colon-modified, translates from byte indexes
+ * back to glyph indexes.
+ * These values can differ in documents with multi-byte
+ * encodings (of which only UTF-8 is supported).
+ * It is especially useful to translate between these indexes
+ * when manually invoking Scintilla messages (\fBES\fP command), as
+ * they almost always take byte positions.
+ *
+ * When called without arguments, \fB^E\fP returns the current
+ * position (dot) in bytes.
+ * This is equivalent to, but faster than \(lq.^E\(rq.
+ * \fB:^E\fP without arguments returns the length of the current
+ * document in bytes, which is equivalent to, but faster than \(lqZ^E\(rq.
+ *
+ * When passing in indexes outside of the document's valid area,
+ * -1 is returned, so the return value can also be interpreted
+ * as a TECO boolean, signalling truth/success for invalid indexes.
+ * This provides an elegant and effective way to validate
+ * buffer addresses.
+ */
+static void
+teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error)
+{
+	teco_int_t result;
+
+	if (!teco_expressions_eval(FALSE, error))
+		return;
+
+	if (teco_expressions_args()) {
+		/* An explicit index was given: translate it. */
+		teco_int_t idx;
+
+		if (!teco_expressions_pop_num_calc(&idx, 0, error))
+			return;
+
+		if (!teco_machine_main_eval_colon(ctx)) {
+			/*
+			 * Glyphs to bytes: a negative result for an
+			 * invalid index is handed through unchanged.
+			 */
+			result = teco_glyphs2bytes(idx);
+		} else if (idx < 0 || idx > teco_interface_ssm(SCI_GETLENGTH, 0, 0)) {
+			/*
+			 * Bytes to glyphs: teco_bytes2glyphs() does not
+			 * validate addresses, so reject out-of-range
+			 * byte offsets here.
+			 */
+			result = -1;
+		} else {
+			result = teco_bytes2glyphs(idx);
+		}
+	} else {
+		/*
+		 * No argument: report a byte offset directly.
+		 * This is shorter than .^E or Z^E and saves a glyph to
+		 * byte translation.
+		 * Note that :^E is inconsistent here, since it yields a
+		 * byte index rather than a glyph index.
+		 */
+		if (teco_machine_main_eval_colon(ctx))
+			result = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
+		else
+			result = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
+	}
+
+	teco_expressions_push(result);
+}
+
static teco_state_t *
teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
{
@@ -1787,7 +1848,8 @@ teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
['C'] = {&teco_state_start, teco_state_control_exit},
['O'] = {&teco_state_start, teco_state_control_octal},
['D'] = {&teco_state_start, teco_state_control_decimal},
- ['R'] = {&teco_state_start, teco_state_control_radix}
+ ['R'] = {&teco_state_start, teco_state_control_radix},
+ ['E'] = {&teco_state_start, teco_state_control_glyphs2bytes}
};
/*
@@ -2350,6 +2412,70 @@ teco_state_ecommand_eol(teco_machine_main_t *ctx, GError **error)
}
}
+/*$ EE encoding codepage charset
+ * codepageEE -- Edit current document's encoding (codepage/charset)
+ * EE -> codepage
+ *
+ * When called with an argument, it sets the current codepage,
+ * otherwise returns it.
+ * 65001 (UTF-8) is the default for new buffers.
+ * 0 (ANSI) should be used when working with raw bytes.
+ */
+static void
+teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error)
+{
+	if (!teco_expressions_eval(FALSE, error))
+		return;
+
+	sptr_t old_cp = teco_interface_ssm(SCI_GETCODEPAGE, 0, 0);
+
+	if (!teco_expressions_args()) {
+		/*
+		 * Query: report the code page if one is set, otherwise
+		 * the character set of the default style.
+		 */
+		teco_expressions_push(old_cp ? : teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0));
+		return;
+	}
+
+	/*
+	 * Set code page.
+	 * The argument is popped before anything is registered for undo,
+	 * so a failing pop leaves no undo tokens behind for a change that
+	 * never took place.
+	 */
+	teco_int_t v;
+	if (!teco_expressions_pop_num_calc(&v, 0, error))
+		return;
+
+	gboolean was_utf8 = old_cp == SC_CP_UTF8;
+	gboolean want_utf8 = v == SC_CP_UTF8;
+
+	/*
+	 * The line character index is only allocated or released when the
+	 * document actually switches between UTF-8 and a single-byte
+	 * encoding, keeping these requests balanced.
+	 * NOTE(review): this assumes that the index is allocated exactly
+	 * when the code page is UTF-8 and that Scintilla counts
+	 * allocation requests -- confirm against the Scintilla version
+	 * in use.
+	 */
+	if (was_utf8 && want_utf8)
+		/* already UTF-8, nothing to change or undo */
+		return;
+
+	if (teco_current_doc_must_undo()) {
+		if (was_utf8) {
+			undo__teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
+			                         SC_LINECHARACTERINDEX_UTF32, 0);
+			undo__teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
+		} else {
+			undo__teco_interface_ssm(SCI_SETCODEPAGE, old_cp, 0);
+			undo__teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT,
+			                         teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0));
+			/* only a switch to UTF-8 allocates the index below */
+			if (want_utf8)
+				undo__teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+				                         SC_LINECHARACTERINDEX_UTF32, 0);
+		}
+	}
+
+	if (want_utf8) {
+		teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
+		/*
+		 * UTF-8 documents strictly require the line character index.
+		 * See teco_glyphs2bytes() and teco_bytes2glyphs().
+		 */
+		teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
+		                   SC_LINECHARACTERINDEX_UTF32, 0);
+		return;
+	}
+
+	/* Leaving UTF-8: the index is no longer needed. */
+	if (was_utf8)
+		teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+		                   SC_LINECHARACTERINDEX_UTF32, 0);
+	teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, v);
+	/* 0 is used for ALL single-byte encodings */
+	teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
+	/*
+	 * FIXME: Should we attempt any code page conversion via
+	 * g_iconv()?
+	 */
+}
+
/*$ EX exit
* [bool]EX -- Exit program
* -EX
@@ -2435,6 +2561,7 @@ teco_state_ecommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
['D'] = {&teco_state_start, teco_state_ecommand_flags},
['J'] = {&teco_state_start, teco_state_ecommand_properties},
['L'] = {&teco_state_start, teco_state_ecommand_eol},
+ ['E'] = {&teco_state_start, teco_state_ecommand_encoding},
['X'] = {&teco_state_start, teco_state_ecommand_exit}
};