diff options
Diffstat (limited to 'src/core-commands.c')
-rw-r--r-- | src/core-commands.c | 527 |
1 files changed, 461 insertions, 66 deletions
diff --git a/src/core-commands.c b/src/core-commands.c index 4d5b378..0cde7e0 100644 --- a/src/core-commands.c +++ b/src/core-commands.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2023 Robin Haberkorn + * Copyright (C) 2012-2024 Robin Haberkorn * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -45,7 +45,7 @@ #include "goto-commands.h" #include "core-commands.h" -static teco_state_t *teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error); +static teco_state_t *teco_state_control_input(teco_machine_main_t *ctx, gunichar chr, GError **error); /* * NOTE: This needs some extra code in teco_state_start_input(). @@ -129,7 +129,8 @@ teco_state_start_dot(teco_machine_main_t *ctx, GError **error) { if (!teco_expressions_eval(FALSE, error)) return; - teco_expressions_push(teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0)); + sptr_t pos = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0); + teco_expressions_push(teco_interface_bytes2glyphs(pos)); } /*$ Z size @@ -145,7 +146,8 @@ teco_state_start_zed(teco_machine_main_t *ctx, GError **error) { if (!teco_expressions_eval(FALSE, error)) return; - teco_expressions_push(teco_interface_ssm(SCI_GETLENGTH, 0, 0)); + sptr_t pos = teco_interface_ssm(SCI_GETLENGTH, 0, 0); + teco_expressions_push(teco_interface_bytes2glyphs(pos)); } /*$ H @@ -162,10 +164,11 @@ teco_state_start_range(teco_machine_main_t *ctx, GError **error) if (!teco_expressions_eval(FALSE, error)) return; teco_expressions_push(0); - teco_expressions_push(teco_interface_ssm(SCI_GETLENGTH, 0, 0)); + sptr_t pos = teco_interface_ssm(SCI_GETLENGTH, 0, 0); + teco_expressions_push(teco_interface_bytes2glyphs(pos)); } -/*$ "\\" +/*$ \[rs] * n\\ -- Insert or read ASCII numbers * \\ -> n * @@ -241,6 +244,7 @@ teco_state_start_loop_open(teco_machine_main_t *ctx, GError **error) if (!teco_expressions_eval(FALSE, error) || !teco_expressions_pop_num_calc(&lctx.counter, -1, 
error)) return; + lctx.brace_level = teco_brace_level; lctx.pass_through = teco_machine_main_eval_colon(ctx); if (lctx.counter) { @@ -280,6 +284,14 @@ teco_state_start_loop_close(teco_machine_main_t *ctx, GError **error) teco_loop_context_t *lctx = &g_array_index(teco_loop_stack, teco_loop_context_t, teco_loop_stack->len-1); + + /* only non-pass-through loops increase the brace level */ + if (teco_brace_level != lctx->brace_level + !lctx->pass_through) { + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED, + "Brace left open at loop end command"); + return; + } + gboolean colon_modified = teco_machine_main_eval_colon(ctx); /* @@ -348,7 +360,7 @@ teco_state_start_break(teco_machine_main_t *ctx, GError **error) { if (teco_loop_stack->len <= ctx->loop_stack_fp) { g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED, - "<;> only allowed in iterations"); + "<;> only allowed in loops"); return; } @@ -373,7 +385,7 @@ teco_state_start_break(teco_machine_main_t *ctx, GError **error) if (!teco_expressions_discard_args(error)) return; if (!lctx.pass_through && - !teco_expressions_brace_close(error)) + !teco_expressions_brace_return(lctx.brace_level, 0, error)) return; undo__insert_val__teco_loop_stack(teco_loop_stack->len, lctx); @@ -511,11 +523,12 @@ teco_state_start_jump(teco_machine_main_t *ctx, GError **error) if (!teco_expressions_pop_num_calc(&v, 0, error)) return; - if (teco_validate_pos(v)) { + gssize pos = teco_interface_glyphs2bytes(v); + if (pos >= 0) { if (teco_current_doc_must_undo()) undo__teco_interface_ssm(SCI_GOTOPOS, teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0), 0); - teco_interface_ssm(SCI_GOTOPOS, v, 0); + teco_interface_ssm(SCI_GOTOPOS, pos, 0); if (teco_machine_main_eval_colon(ctx)) teco_expressions_push(TECO_SUCCESS); @@ -531,11 +544,11 @@ static teco_bool_t teco_move_chars(teco_int_t n) { sptr_t pos = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0); - - if (!teco_validate_pos(pos + n)) + gssize next_pos = teco_interface_glyphs2bytes_relative(pos, 
n); + if (next_pos < 0) return TECO_FAILURE; - teco_interface_ssm(SCI_GOTOPOS, pos + n, 0); + teco_interface_ssm(SCI_GOTOPOS, next_pos, 0); if (teco_current_doc_must_undo()) undo__teco_interface_ssm(SCI_GOTOPOS, pos, 0); @@ -879,7 +892,7 @@ static gboolean teco_state_start_kill(teco_machine_main_t *ctx, const gchar *cmd, gboolean by_lines, GError **error) { teco_bool_t rc; - teco_int_t from, len; + gssize from, len; /* in bytes */ if (!teco_expressions_eval(FALSE, error)) return FALSE; @@ -894,20 +907,24 @@ teco_state_start_kill(teco_machine_main_t *ctx, const gchar *cmd, gboolean by_li len = teco_interface_ssm(SCI_POSITIONFROMLINE, line, 0) - from; rc = teco_bool(teco_validate_line(line)); } else { - if (!teco_expressions_pop_num_calc(&len, teco_num_sign, error)) + teco_int_t len_glyphs; + if (!teco_expressions_pop_num_calc(&len_glyphs, teco_num_sign, error)) return FALSE; - rc = teco_bool(teco_validate_pos(from + len)); + gssize to = teco_interface_glyphs2bytes_relative(from, len_glyphs); + rc = teco_bool(to >= 0); + len = to-from; } if (len < 0) { len *= -1; from -= len; } } else { - teco_int_t to = teco_expressions_pop_num(0); - from = teco_expressions_pop_num(0); + teco_int_t to_glyphs = teco_expressions_pop_num(0); + gssize to = teco_interface_glyphs2bytes(to_glyphs); + teco_int_t from_glyphs = teco_expressions_pop_num(0); + from = teco_interface_glyphs2bytes(from_glyphs); len = to - from; - rc = teco_bool(len >= 0 && teco_validate_pos(from) && - teco_validate_pos(to)); + rc = teco_bool(len >= 0 && from >= 0 && to >= 0); } if (teco_machine_main_eval_colon(ctx)) { @@ -1002,6 +1019,9 @@ teco_state_start_delete_chars(teco_machine_main_t *ctx, GError **error) * This can be an ASCII <code> or Unicode codepoint * depending on Scintilla's encoding of the current * buffer. + * Invalid Unicode byte sequences are reported as + * -1 or -2. + * * - If <n> is 0, return the <code> of the character * pointed to by dot. 
* - If <n> is 1, return the <code> of the character @@ -1012,28 +1032,33 @@ teco_state_start_delete_chars(teco_machine_main_t *ctx, GError **error) * * If the position of the queried character is off-page, * the command will yield an error. + * + * If the document is encoded as UTF-8 and there is + * an incomplete sequence at the requested position, + * -1 is returned. + * All other invalid Unicode sequences are returned as -2. */ -/** @todo does Scintilla really return code points??? */ static void teco_state_start_get(teco_machine_main_t *ctx, GError **error) { teco_int_t v; if (!teco_expressions_pop_num_calc(&v, teco_num_sign, error)) return; - v += teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0); - /* - * NOTE: We cannot use teco_validate_pos() here since - * the end of the buffer is not a valid position for <A>. - */ - if (v < 0 || v >= teco_interface_ssm(SCI_GETLENGTH, 0, 0)) { + + sptr_t pos = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0); + gssize get_pos = teco_interface_glyphs2bytes_relative(pos, v); + sptr_t len = teco_interface_ssm(SCI_GETLENGTH, 0, 0); + + if (get_pos < 0 || get_pos == len) { teco_error_range_set(error, "A"); return; } - teco_expressions_push(teco_interface_ssm(SCI_GETCHARAT, v, 0)); + + teco_expressions_push(teco_interface_get_character(get_pos, len)); } static teco_state_t * -teco_state_start_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_start_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { static teco_machine_main_transition_t transitions[] = { /* @@ -1148,7 +1173,7 @@ teco_state_start_input(teco_machine_main_t *ctx, gchar chr, GError **error) * * FIXME: Maybe, there should be a special teco_state_t * for beginnings of command-lines? - * It could also be used for a corresponding FNMACRO mask. + * It could also be used for a corresponding KEYMACRO mask. 
*/ if (teco_cmdline.effective_len == 1 && teco_cmdline.str.data[0] == '*') return &teco_state_save_cmdline; @@ -1244,7 +1269,7 @@ teco_state_start_input(teco_machine_main_t *ctx, gchar chr, GError **error) TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_start, .end_of_macro_cb = NULL, /* Allowed at the end of a macro! */ .is_start = TRUE, - .fnmacro_mask = TECO_FNMACRO_MASK_START + .keymacro_mask = TECO_KEYMACRO_MASK_START | TECO_KEYMACRO_MASK_CASEINSENSITIVE ); /*$ F< @@ -1372,7 +1397,7 @@ teco_state_fcommand_cond_else(teco_machine_main_t *ctx, GError **error) } static teco_state_t * -teco_state_fcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_fcommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { static teco_machine_main_transition_t transitions[] = { /* @@ -1435,7 +1460,7 @@ teco_state_changedir_done(teco_machine_main_t *ctx, const teco_string_t *str, GE teco_qreg_t *qreg = teco_qreg_table_find(&teco_qreg_table_globals, "$HOME", 5); g_assert(qreg != NULL); teco_string_t home; - if (!qreg->vtable->get_string(qreg, &home.data, &home.len, error)) + if (!qreg->vtable->get_string(qreg, &home.data, &home.len, NULL, error)) return NULL; /* @@ -1496,7 +1521,7 @@ teco_state_changedir_done(teco_machine_main_t *ctx, const teco_string_t *str, GE TECO_DEFINE_STATE_EXPECTDIR(teco_state_changedir); static teco_state_t * -teco_state_condcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_condcommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { teco_int_t value = 0; gboolean result = TRUE; @@ -1536,20 +1561,20 @@ teco_state_condcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error break; case 'A': if (ctx->mode == TECO_MODE_NORMAL) - result = g_ascii_isalpha((gchar)value); + result = g_unichar_isalpha(value); break; case 'C': if (ctx->mode == TECO_MODE_NORMAL) - result = g_ascii_isalnum((gchar)value) || + result = g_unichar_isalnum(value) || value == '.' 
|| value == '$' || value == '_'; break; case 'D': if (ctx->mode == TECO_MODE_NORMAL) - result = g_ascii_isdigit((gchar)value); + result = g_unichar_isdigit(value); break; case 'I': if (ctx->mode == TECO_MODE_NORMAL) - result = G_IS_DIR_SEPARATOR((gchar)value); + result = G_IS_DIR_SEPARATOR(value); break; case 'S': case 'T': @@ -1582,15 +1607,15 @@ teco_state_condcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error break; case 'R': if (ctx->mode == TECO_MODE_NORMAL) - result = g_ascii_isalnum((gchar)value); + result = g_unichar_isalnum(value); break; case 'V': if (ctx->mode == TECO_MODE_NORMAL) - result = g_ascii_islower((gchar)value); + result = g_unichar_islower(value); break; case 'W': if (ctx->mode == TECO_MODE_NORMAL) - result = g_ascii_isupper((gchar)value); + result = g_unichar_isupper(value); break; default: g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED, @@ -1720,8 +1745,71 @@ teco_state_control_radix(teco_machine_main_t *ctx, GError **error) } } +/*$ ^E glyphs2bytes bytes2glyphs + * glyphs^E -> bytes -- Translate between glyph and byte indexes + * bytes:^E -> glyphs + * ^E -> bytes + * :^E -> length + * + * Translates from glyph/character to byte indexes when called + * without a colon. + * Otherwise when colon-modified, translates from byte indexes + * back to glyph indexes. + * These values can differ in documents with multi-byte + * encodings (of which only UTF-8 is supported). + * It is especially useful to translate between these indexes + * when manually invoking Scintilla messages (\fBES\fP command), as + * they almost always take byte positions. + * + * When called without arguments, \fB^E\fP returns the current + * position (dot) in bytes. + * This is equivalent, but faster than \(lq.^E\(rq. + * \fB:^E\fP without arguments returns the length of the current + * document in bytes, which is equivalent but faster than \(lqZ^E\(rq. 
+ * + * When passing in indexes outside of the document's valid area, + * -1 is returned, so the return value can also be interpreted + * as a TECO boolean, signalling truth/success for invalid indexes. + * This provides an elegant and effective way to validate + * buffer addresses. + */ +static void +teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error) +{ + teco_int_t res; + + if (!teco_expressions_eval(FALSE, error)) + return; + + gboolean colon_modified = teco_machine_main_eval_colon(ctx); + + if (!teco_expressions_args()) { + /* + * This is shorter than .^E or Z^E and avoids unnecessary glyph to + * byte index translations. + * On the other hand :^E is inconsistent, as it will return a byte + * index, instead of glyph index. + */ + res = teco_interface_ssm(colon_modified ? SCI_GETLENGTH : SCI_GETCURRENTPOS, 0, 0); + } else { + teco_int_t pos; + if (!teco_expressions_pop_num_calc(&pos, 0, error)) + return; + if (colon_modified) { + /* teco_interface_bytes2glyphs() does not check addresses */ + res = 0 <= pos && pos <= teco_interface_ssm(SCI_GETLENGTH, 0, 0) + ? teco_interface_bytes2glyphs(pos) : -1; + } else { + /* negative values for invalid indexes are passed down. 
*/ + res = teco_interface_glyphs2bytes(pos); + } + } + + teco_expressions_push(res); +} + static teco_state_t * -teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_control_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { static teco_machine_main_transition_t transitions[] = { /* @@ -1746,7 +1834,8 @@ teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error) ['C'] = {&teco_state_start, teco_state_control_exit}, ['O'] = {&teco_state_start, teco_state_control_octal}, ['D'] = {&teco_state_start, teco_state_control_decimal}, - ['R'] = {&teco_state_start, teco_state_control_radix} + ['R'] = {&teco_state_start, teco_state_control_radix}, + ['E'] = {&teco_state_start, teco_state_control_glyphs2bytes} }; /* @@ -1761,7 +1850,7 @@ teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error) TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_control); static teco_state_t * -teco_state_ascii_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_ascii_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { if (ctx->mode == TECO_MODE_NORMAL) teco_expressions_push(chr); @@ -1797,7 +1886,7 @@ TECO_DEFINE_STATE(teco_state_ascii); * only be seen when executing the following command. */ static teco_state_t * -teco_state_escape_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_escape_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { /*$ ^[^[ ^[$ $$ terminate return * [a1,a2,...]$$ -- Terminate command line or return from macro @@ -1891,7 +1980,7 @@ TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_escape, * when it comes to function key macro masking. 
*/ .is_start = TRUE, - .fnmacro_mask = TECO_FNMACRO_MASK_START + .keymacro_mask = TECO_KEYMACRO_MASK_START | TECO_KEYMACRO_MASK_CASEINSENSITIVE ); /*$ EF close @@ -1958,6 +2047,11 @@ teco_state_ecommand_close(teco_machine_main_t *ctx, GError **error) * Without any argument ED returns the current flags. * * Currently, the following flags are used by \*(ST: + * - 4: If enabled, prefer raw single-byte ANSI encoding + * for all new buffers and registers. + * This does not change the encoding of any existing + * buffers and any initialized default register when set via + * \fBED\fP, so you might want to launch \*(ST with \fB--8bit\fP. * - 8: Enable/disable automatic folding of case-insensitive * command characters during interactive key translation. * The case of letter keys is inverted, so one or two @@ -1973,14 +2067,17 @@ teco_state_ecommand_close(teco_machine_main_t *ctx, GError **error) * of files. * - 32: Enable/Disable buffer editing hooks * (via execution of macro in global Q-Register \(lqED\(rq) - * - 64: Enable/Disable function key macros * - 128: Enable/Disable enforcement of UNIX98 * \(lq/bin/sh\(rq emulation for operating system command * executions - * - 256: Enable/Disable \fBxterm\fP(1) clipboard support. - * Should only be enabled if XTerm allows the - * \fIGetSelection\fP and \fISetSelection\fP window - * operations. + * - 256: Enable/Disable OSC-52 clipboard support. + * Must only be enabled if the terminal emulator is configured + * properly. + * - 512: Enable/Disable Unicode icons in the Curses UI. + * This requires a capable font, like the ones provided + * by the \(lqNerd Fonts\(rq project. + * Changes to this flag in interactive mode may not become + * effective immediately. * * The features controlled thus are discribed in other sections * of this manual. @@ -2098,6 +2195,12 @@ teco_state_ecommand_flags(teco_machine_main_t *ctx, GError **error) * on exit the author is aware of is \fBxterm\fP(1) and * the Linux console driver. 
* You have been warned. Good luck. + * .IP 4 + * The column after the last horizontal movement. + * This is only used by \fBfnkeys.tes\fP and is similar to the Scintilla-internal + * setting \fBSCI_CHOOSECARETX\fP. + * Unlike most other settings, this is on purpose not restored on rubout, + * so it "survives" command line replacements. */ static void teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error) @@ -2106,9 +2209,12 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error) EJ_USER_INTERFACE = 0, EJ_BUFFERS, EJ_MEMORY_LIMIT, - EJ_INIT_COLOR + EJ_INIT_COLOR, + EJ_CARETX }; + static teco_int_t caret_x = 0; + teco_int_t property; if (!teco_expressions_eval(FALSE, error) || !teco_expressions_pop_num_calc(&property, teco_num_sign, error)) @@ -2144,6 +2250,10 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error) teco_interface_init_color((guint)value, (guint32)color); break; + case EJ_CARETX: + caret_x = value; + break; + default: g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED, "Cannot set property %" TECO_INT_FORMAT " " @@ -2180,6 +2290,10 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error) teco_expressions_push(teco_memory_limit); break; + case EJ_CARETX: + teco_expressions_push(caret_x); + break; + default: g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED, "Invalid property %" TECO_INT_FORMAT " " @@ -2292,6 +2406,252 @@ teco_state_ecommand_eol(teco_machine_main_t *ctx, GError **error) } } +static const gchar * +teco_codepage2str(guint codepage) +{ + /* + * The multi-byte charsets are excluded, since we don't + * support them in SciTECO, even though Scintilla has them. + * Contrary to the Scintilla documentation, Gtk supports + * most of them. + * Those that are supported are tested, so the codepage + * mapping should be definitive (although there could be + * similar related codepages). 
+ */ + switch (codepage) { + case SC_CP_UTF8: return "UTF-8"; + case SC_CHARSET_ANSI: + case SC_CHARSET_DEFAULT: return "ISO-8859-1"; /* LATIN1 */ + case SC_CHARSET_BALTIC: return "ISO-8859-13"; /* LATIN7 */ + //case SC_CHARSET_CHINESEBIG5: return "BIG5"; + case SC_CHARSET_EASTEUROPE: return "ISO-8859-2"; /* LATIN2 */ + //case SC_CHARSET_GB2312: return "GB2312"; + case SC_CHARSET_GREEK: return "ISO-8859-7"; // CP1253??? + //case SC_CHARSET_HANGUL: return "UHC"; + /* unsure whether this is supported on Gtk */ + case SC_CHARSET_MAC: return "MAC"; + /* not supported by Gtk */ + case SC_CHARSET_OEM: return "CP437"; + /* + * Apparently, this can be CP1251 on the native Windows + * port of Scintilla. + */ + case SC_CHARSET_RUSSIAN: return "KOI8-R"; + case SC_CHARSET_OEM866: return "CP866"; + case SC_CHARSET_CYRILLIC: return "CP1251"; + //case SC_CHARSET_SHIFTJIS: return "SHIFT-JIS"; + //case SC_CHARSET_SYMBOL: + case SC_CHARSET_TURKISH: return "ISO-8859-9"; /* LATIN5 */ + //case SC_CHARSET_JOHAB: return "JOHAB"; + case SC_CHARSET_HEBREW: return "ISO-8859-8"; // CP1255? + /* + * FIXME: Some arabic codepage is supported by Gtk, + * but I am not sure which. + */ + case SC_CHARSET_ARABIC: return "ISO-8859-6"; // CP720, CP1256??? + /* apparently not supported by Gtk */ + case SC_CHARSET_VIETNAMESE: return "CP1258"; + case SC_CHARSET_THAI: return "ISO-8859-11"; + case SC_CHARSET_8859_15: return "ISO-8859-15"; /* LATIN9 */ + } + + return NULL; +} + +/*$ EE encoding codepage charset + * codepageEE -- Edit current document's encoding (codepage/charset) + * EE -> codepage + * codepage:EE + * :EE -> codepage + * + * When called with an argument, it sets the current codepage, + * otherwise returns it. 
+ * The following codepages are supported: + * - 0: ANSI (raw bytes) + * - 1: ISO-8859-1 (latin1) + * - 77: Macintosh Latin encoding + * - 161: ISO-8859-7 + * - 162: ISO-8859-9 (latin5) + * - 163: CP1258 + * - 177: ISO-8859-8 + * - 178: ISO-8859-6 + * - 186: ISO-8859-13 (latin7) + * - 204: KOI8-R + * - 222: ISO-8859-11 + * - 238: ISO-8859-2 (latin2) + * - 255: CP437 + * - 866: CP866 + * - 1000: ISO-8859-15 (latin9) + * - 1251: CP1251 + * - 65001: UTF-8 + * + * Displaying characters in the single-byte (non-UTF-8) codepages might + * be supported only with the Gtk UI. + * At least 77, 178, 163 and 255 are not displayed correctly on Gtk. + * 65001 (UTF-8) is the default for new buffers. + * 0 (ANSI) should be used when working with raw bytes, + * but is currently displayed like ISO-8859-1 (latin1). + * + * \fBEE\fP does not change the buffer contents itself by default, only + * how it is displayed and how \*(ST interacts with it. + * This allows fixing up the codepage if it is not in the default UTF-8 + * or if codepage guessing failed. + * + * When colon-modified the \fB:EE\fP command will also additionally convert + * the current buffer contents into the new code page, preserving the + * current position (dot). + * This will fail if the conversion would be lossy. + * Conversions from and to UTF-8 \fIshould\fP always be successful. 
+ */ +static void +teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error) +{ + if (!teco_expressions_eval(FALSE, error)) + return; + + gboolean colon_modified = teco_machine_main_eval_colon(ctx); + + guint old_cp = teco_interface_get_codepage(); + + if (!teco_expressions_args()) { + /* get current code page */ + teco_expressions_push(old_cp); + return; + } + + /* + * Set code page + */ + teco_int_t new_cp; + if (!teco_expressions_pop_num_calc(&new_cp, 0, error)) + return; + + if (old_cp == SC_CP_UTF8 && new_cp == SC_CP_UTF8) + return; + + if (teco_current_doc_must_undo() && teco_undo_enabled) { + if (old_cp == SC_CP_UTF8) { /* new_cp != SC_CP_UTF8 */ + undo__teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX, + SC_LINECHARACTERINDEX_UTF32, 0); + undo__teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0); + } else { + undo__teco_interface_ssm(SCI_SETCODEPAGE, 0, 0); + for (gint style = 0; style <= STYLE_LASTPREDEFINED; style++) + undo__teco_interface_ssm(SCI_STYLESETCHARACTERSET, style, old_cp); + /* + * The index is internally reference-counted and could underflow, + * so don't do it more than necessary. + */ + if (new_cp == SC_CP_UTF8) + undo__teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX, + SC_LINECHARACTERINDEX_UTF32, 0); + } + } + + teco_int_t dot_glyphs; + if (colon_modified) { + sptr_t dot_bytes = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0); + dot_glyphs = teco_interface_bytes2glyphs(dot_bytes); + + /* + * Convert buffer to new codepage. + * + * FIXME: Could be optimized slightly by converting first + * before the gap, inserting the converted text and then + * converting after the gap. 
+ */ + const gchar *to_codepage = teco_codepage2str(new_cp); + const gchar *from_codepage = teco_codepage2str(old_cp); + if (!to_codepage || !from_codepage) { + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED, + "Unknown or unsupported codepage/charset"); + return; + } + + const gchar *buf = (const gchar *)teco_interface_ssm(SCI_GETCHARACTERPOINTER, 0, 0); + gsize len = teco_interface_ssm(SCI_GETLENGTH, 0, 0); + g_autofree gchar *converted; + gsize converted_len; + + /* + * This fails if there is no direct translation. + * If we'd use g_convert_with_fallback(), it would be tricky to choose + * fallback characters that will always work. + */ + converted = g_convert(buf, len, to_codepage, from_codepage, + NULL, &converted_len, error); + if (!converted) + return; + + teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0); + teco_interface_ssm(SCI_CLEARALL, 0, 0); + teco_interface_ssm(SCI_APPENDTEXT, converted_len, (sptr_t)converted); + teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0); + teco_ring_dirtify(); + + if (teco_current_doc_must_undo()) { + undo__teco_interface_ssm(SCI_GOTOPOS, dot_bytes, 0); + undo__teco_interface_ssm(SCI_UNDO, 0, 0); + } + } + + if (new_cp == SC_CP_UTF8) { + teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0); + /* + * UTF-8 documents strictly require the line character index. + * See teco_view_glyphs2bytes() and teco_view_bytes2glyphs(). + */ + g_assert(!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0) + & SC_LINECHARACTERINDEX_UTF32)); + teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX, + SC_LINECHARACTERINDEX_UTF32, 0); + } else { + /* + * The index is NOT released automatically when setting the codepage. + * But it is internally reference-counted and could underflow, + * so don't do it more than necessary. 
+ */ + if (old_cp == SC_CP_UTF8) { + teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX, + SC_LINECHARACTERINDEX_UTF32, 0); + g_assert(!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0) + & SC_LINECHARACTERINDEX_UTF32)); + } + + /* + * Configure a single-byte codepage/charset. + * This requires setting it on all of the possible styles. + * Unfortunately there can theoretically even be 255 (STYLE_MAX) styles. + * This is important only for display purposes - other than that + * all single-byte encodings are handled the same. + * + * FIXME: Should we avoid this if new_cp == 0? + * It will be used for raw byte handling mostly. + */ + if (teco_current_doc_must_undo()) { + /* + * There is a chance the user will see this buffer even if we + * are currently in batch mode. + */ + for (gint style = 0; style <= STYLE_LASTPREDEFINED; style++) + teco_interface_ssm(SCI_STYLESETCHARACTERSET, style, new_cp); + } else { + /* we must still set it, so that <EE> retrieval works */ + teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, new_cp); + } + /* 0 is used for ALL single-byte encodings */ + teco_interface_ssm(SCI_SETCODEPAGE, 0, 0); + } + + if (colon_modified) + /* + * Only now, it will be safe to recalculate dot in the new encoding. + * If the new codepage is UTF-8, the line character index will be + * ready only now. 
+ */ + teco_interface_ssm(SCI_GOTOPOS, teco_interface_glyphs2bytes(dot_glyphs), 0); +} + /*$ EX exit * [bool]EX -- Exit program * -EX @@ -2352,7 +2712,7 @@ teco_state_ecommand_exit(teco_machine_main_t *ctx, GError **error) } static teco_state_t * -teco_state_ecommand_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_ecommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { static teco_machine_main_transition_t transitions[] = { /* @@ -2377,6 +2737,7 @@ teco_state_ecommand_input(teco_machine_main_t *ctx, gchar chr, GError **error) ['D'] = {&teco_state_start, teco_state_ecommand_flags}, ['J'] = {&teco_state_start, teco_state_ecommand_properties}, ['L'] = {&teco_state_start, teco_state_ecommand_eol}, + ['E'] = {&teco_state_start, teco_state_ecommand_encoding}, ['X'] = {&teco_state_start, teco_state_ecommand_exit} }; @@ -2395,26 +2756,61 @@ teco_state_insert_initial(teco_machine_main_t *ctx, GError **error) if (ctx->mode > TECO_MODE_NORMAL) return TRUE; + /* + * Current document's encoding determines the behaviour of + * string building constructs. 
+ */ + teco_machine_stringbuilding_set_codepage(&ctx->expectstring.machine, + teco_interface_get_codepage()); + if (!teco_expressions_eval(FALSE, error)) return FALSE; guint args = teco_expressions_args(); if (!args) return TRUE; - teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0); - for (int i = args; i > 0; i--) { - gchar chr = (gchar)teco_expressions_peek_num(i-1); - teco_interface_ssm(SCI_ADDTEXT, 1, (sptr_t)&chr); + if (teco_interface_ssm(SCI_GETCODEPAGE, 0, 0) == SC_CP_UTF8) { + /* detect possible errors before introducing side effects */ + for (gint i = args; i > 0; i--) { + teco_int_t chr = teco_expressions_peek_num(i-1); + if (chr < 0 || !g_unichar_validate(chr)) { + teco_error_codepoint_set(error, "I"); + return FALSE; + } + } + teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0); + for (gint i = args; i > 0; i--) { + /* 4 bytes should be enough, but we better follow the documentation */ + gchar buf[6]; + gsize len = g_unichar_to_utf8(teco_expressions_peek_num(i-1), buf); + teco_interface_ssm(SCI_ADDTEXT, len, (sptr_t)buf); + } + } else { + /* everything else is a single-byte encoding */ + for (gint i = args; i > 0; i--) { + teco_int_t chr = teco_expressions_peek_num(i-1); + if (chr < 0 || chr > 0xFF) { + teco_error_codepoint_set(error, "I"); + return FALSE; + } + } + teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0); + for (gint i = args; i > 0; i--) { + gchar chr = (gchar)teco_expressions_peek_num(i-1); + teco_interface_ssm(SCI_ADDTEXT, 1, (sptr_t)&chr); + } } - for (int i = args; i > 0; i--) - if (!teco_expressions_pop_num_calc(NULL, 0, error)) - return FALSE; teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0); teco_ring_dirtify(); if (teco_current_doc_must_undo()) undo__teco_interface_ssm(SCI_UNDO, 0, 0); + /* This is done only now because it can _theoretically_ fail. 
*/ + for (gint i = args; i > 0; i--) + if (!teco_expressions_pop_num_calc(NULL, 0, error)) + return FALSE; + return TRUE; } @@ -2451,8 +2847,8 @@ teco_state_insert_process(teco_machine_main_t *ctx, const teco_string_t *str, * Secondly, the command inserts <text>. * In interactive mode, <text> is inserted interactively. * - * String building characters are \fBenabled\fP for the - * I command. + * Unlike in classic TECO dialects, string building characters are + * \fBenabled\fP for the \fBI\fP command. * When editing \*(ST macros, using the \fBEI\fP command * may be better, since it has string building characters * disabled. @@ -2491,10 +2887,9 @@ teco_state_insert_indent_initial(teco_machine_main_t *ctx, GError **error) len -= teco_interface_ssm(SCI_GETCOLUMN, teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0), 0) % len; - gchar spaces[len]; - - memset(spaces, ' ', sizeof(spaces)); - teco_interface_ssm(SCI_ADDTEXT, sizeof(spaces), (sptr_t)spaces); + gchar space = ' '; + while (len-- > 0) + teco_interface_ssm(SCI_ADDTEXT, 1, (sptr_t)&space); } teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0); teco_ring_dirtify(); |