aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/core-commands.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/core-commands.c')
-rw-r--r--src/core-commands.c527
1 files changed, 461 insertions, 66 deletions
diff --git a/src/core-commands.c b/src/core-commands.c
index 4d5b378..0cde7e0 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2012-2023 Robin Haberkorn
+ * Copyright (C) 2012-2024 Robin Haberkorn
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -45,7 +45,7 @@
#include "goto-commands.h"
#include "core-commands.h"
-static teco_state_t *teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error);
+static teco_state_t *teco_state_control_input(teco_machine_main_t *ctx, gunichar chr, GError **error);
/*
* NOTE: This needs some extra code in teco_state_start_input().
@@ -129,7 +129,8 @@ teco_state_start_dot(teco_machine_main_t *ctx, GError **error)
{
if (!teco_expressions_eval(FALSE, error))
return;
- teco_expressions_push(teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0));
+ sptr_t pos = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
+ teco_expressions_push(teco_interface_bytes2glyphs(pos));
}
/*$ Z size
@@ -145,7 +146,8 @@ teco_state_start_zed(teco_machine_main_t *ctx, GError **error)
{
if (!teco_expressions_eval(FALSE, error))
return;
- teco_expressions_push(teco_interface_ssm(SCI_GETLENGTH, 0, 0));
+ sptr_t pos = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
+ teco_expressions_push(teco_interface_bytes2glyphs(pos));
}
/*$ H
@@ -162,10 +164,11 @@ teco_state_start_range(teco_machine_main_t *ctx, GError **error)
if (!teco_expressions_eval(FALSE, error))
return;
teco_expressions_push(0);
- teco_expressions_push(teco_interface_ssm(SCI_GETLENGTH, 0, 0));
+ sptr_t pos = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
+ teco_expressions_push(teco_interface_bytes2glyphs(pos));
}
-/*$ "\\"
+/*$ \[rs]
* n\\ -- Insert or read ASCII numbers
* \\ -> n
*
@@ -241,6 +244,7 @@ teco_state_start_loop_open(teco_machine_main_t *ctx, GError **error)
if (!teco_expressions_eval(FALSE, error) ||
!teco_expressions_pop_num_calc(&lctx.counter, -1, error))
return;
+ lctx.brace_level = teco_brace_level;
lctx.pass_through = teco_machine_main_eval_colon(ctx);
if (lctx.counter) {
@@ -280,6 +284,14 @@ teco_state_start_loop_close(teco_machine_main_t *ctx, GError **error)
teco_loop_context_t *lctx = &g_array_index(teco_loop_stack, teco_loop_context_t,
teco_loop_stack->len-1);
+
+ /* only non-pass-through loops increase the brace level */
+ if (teco_brace_level != lctx->brace_level + !lctx->pass_through) {
+ g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED,
+ "Brace left open at loop end command");
+ return;
+ }
+
gboolean colon_modified = teco_machine_main_eval_colon(ctx);
/*
@@ -348,7 +360,7 @@ teco_state_start_break(teco_machine_main_t *ctx, GError **error)
{
if (teco_loop_stack->len <= ctx->loop_stack_fp) {
g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED,
- "<;> only allowed in iterations");
+ "<;> only allowed in loops");
return;
}
@@ -373,7 +385,7 @@ teco_state_start_break(teco_machine_main_t *ctx, GError **error)
if (!teco_expressions_discard_args(error))
return;
if (!lctx.pass_through &&
- !teco_expressions_brace_close(error))
+ !teco_expressions_brace_return(lctx.brace_level, 0, error))
return;
undo__insert_val__teco_loop_stack(teco_loop_stack->len, lctx);
@@ -511,11 +523,12 @@ teco_state_start_jump(teco_machine_main_t *ctx, GError **error)
if (!teco_expressions_pop_num_calc(&v, 0, error))
return;
- if (teco_validate_pos(v)) {
+ gssize pos = teco_interface_glyphs2bytes(v);
+ if (pos >= 0) {
if (teco_current_doc_must_undo())
undo__teco_interface_ssm(SCI_GOTOPOS,
teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0), 0);
- teco_interface_ssm(SCI_GOTOPOS, v, 0);
+ teco_interface_ssm(SCI_GOTOPOS, pos, 0);
if (teco_machine_main_eval_colon(ctx))
teco_expressions_push(TECO_SUCCESS);
@@ -531,11 +544,11 @@ static teco_bool_t
teco_move_chars(teco_int_t n)
{
sptr_t pos = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
-
- if (!teco_validate_pos(pos + n))
+ gssize next_pos = teco_interface_glyphs2bytes_relative(pos, n);
+ if (next_pos < 0)
return TECO_FAILURE;
- teco_interface_ssm(SCI_GOTOPOS, pos + n, 0);
+ teco_interface_ssm(SCI_GOTOPOS, next_pos, 0);
if (teco_current_doc_must_undo())
undo__teco_interface_ssm(SCI_GOTOPOS, pos, 0);
@@ -879,7 +892,7 @@ static gboolean
teco_state_start_kill(teco_machine_main_t *ctx, const gchar *cmd, gboolean by_lines, GError **error)
{
teco_bool_t rc;
- teco_int_t from, len;
+ gssize from, len; /* in bytes */
if (!teco_expressions_eval(FALSE, error))
return FALSE;
@@ -894,20 +907,24 @@ teco_state_start_kill(teco_machine_main_t *ctx, const gchar *cmd, gboolean by_li
len = teco_interface_ssm(SCI_POSITIONFROMLINE, line, 0) - from;
rc = teco_bool(teco_validate_line(line));
} else {
- if (!teco_expressions_pop_num_calc(&len, teco_num_sign, error))
+ teco_int_t len_glyphs;
+ if (!teco_expressions_pop_num_calc(&len_glyphs, teco_num_sign, error))
return FALSE;
- rc = teco_bool(teco_validate_pos(from + len));
+ gssize to = teco_interface_glyphs2bytes_relative(from, len_glyphs);
+ rc = teco_bool(to >= 0);
+ len = to-from;
}
if (len < 0) {
len *= -1;
from -= len;
}
} else {
- teco_int_t to = teco_expressions_pop_num(0);
- from = teco_expressions_pop_num(0);
+ teco_int_t to_glyphs = teco_expressions_pop_num(0);
+ gssize to = teco_interface_glyphs2bytes(to_glyphs);
+ teco_int_t from_glyphs = teco_expressions_pop_num(0);
+ from = teco_interface_glyphs2bytes(from_glyphs);
len = to - from;
- rc = teco_bool(len >= 0 && teco_validate_pos(from) &&
- teco_validate_pos(to));
+ rc = teco_bool(len >= 0 && from >= 0 && to >= 0);
}
if (teco_machine_main_eval_colon(ctx)) {
@@ -1002,6 +1019,9 @@ teco_state_start_delete_chars(teco_machine_main_t *ctx, GError **error)
* This can be an ASCII <code> or Unicode codepoint
* depending on Scintilla's encoding of the current
* buffer.
+ * Invalid Unicode byte sequences are reported as
+ * -1 or -2.
+ *
* - If <n> is 0, return the <code> of the character
* pointed to by dot.
* - If <n> is 1, return the <code> of the character
@@ -1012,28 +1032,33 @@ teco_state_start_delete_chars(teco_machine_main_t *ctx, GError **error)
*
* If the position of the queried character is off-page,
* the command will yield an error.
+ *
+ * If the document is encoded as UTF-8 and there is
+ * an incomplete sequence at the requested position,
+ * -1 is returned.
+ * All other invalid Unicode sequences are returned as -2.
*/
-/** @todo does Scintilla really return code points??? */
static void
teco_state_start_get(teco_machine_main_t *ctx, GError **error)
{
teco_int_t v;
if (!teco_expressions_pop_num_calc(&v, teco_num_sign, error))
return;
- v += teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
- /*
- * NOTE: We cannot use teco_validate_pos() here since
- * the end of the buffer is not a valid position for <A>.
- */
- if (v < 0 || v >= teco_interface_ssm(SCI_GETLENGTH, 0, 0)) {
+
+ sptr_t pos = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
+ gssize get_pos = teco_interface_glyphs2bytes_relative(pos, v);
+ sptr_t len = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
+
+ if (get_pos < 0 || get_pos == len) {
teco_error_range_set(error, "A");
return;
}
- teco_expressions_push(teco_interface_ssm(SCI_GETCHARAT, v, 0));
+
+ teco_expressions_push(teco_interface_get_character(get_pos, len));
}
static teco_state_t *
-teco_state_start_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_start_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
static teco_machine_main_transition_t transitions[] = {
/*
@@ -1148,7 +1173,7 @@ teco_state_start_input(teco_machine_main_t *ctx, gchar chr, GError **error)
*
* FIXME: Maybe, there should be a special teco_state_t
* for beginnings of command-lines?
- * It could also be used for a corresponding FNMACRO mask.
+ * It could also be used for a corresponding KEYMACRO mask.
*/
if (teco_cmdline.effective_len == 1 && teco_cmdline.str.data[0] == '*')
return &teco_state_save_cmdline;
@@ -1244,7 +1269,7 @@ teco_state_start_input(teco_machine_main_t *ctx, gchar chr, GError **error)
TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_start,
.end_of_macro_cb = NULL, /* Allowed at the end of a macro! */
.is_start = TRUE,
- .fnmacro_mask = TECO_FNMACRO_MASK_START
+ .keymacro_mask = TECO_KEYMACRO_MASK_START | TECO_KEYMACRO_MASK_CASEINSENSITIVE
);
/*$ F<
@@ -1372,7 +1397,7 @@ teco_state_fcommand_cond_else(teco_machine_main_t *ctx, GError **error)
}
static teco_state_t *
-teco_state_fcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_fcommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
static teco_machine_main_transition_t transitions[] = {
/*
@@ -1435,7 +1460,7 @@ teco_state_changedir_done(teco_machine_main_t *ctx, const teco_string_t *str, GE
teco_qreg_t *qreg = teco_qreg_table_find(&teco_qreg_table_globals, "$HOME", 5);
g_assert(qreg != NULL);
teco_string_t home;
- if (!qreg->vtable->get_string(qreg, &home.data, &home.len, error))
+ if (!qreg->vtable->get_string(qreg, &home.data, &home.len, NULL, error))
return NULL;
/*
@@ -1496,7 +1521,7 @@ teco_state_changedir_done(teco_machine_main_t *ctx, const teco_string_t *str, GE
TECO_DEFINE_STATE_EXPECTDIR(teco_state_changedir);
static teco_state_t *
-teco_state_condcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_condcommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
teco_int_t value = 0;
gboolean result = TRUE;
@@ -1536,20 +1561,20 @@ teco_state_condcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error
break;
case 'A':
if (ctx->mode == TECO_MODE_NORMAL)
- result = g_ascii_isalpha((gchar)value);
+ result = g_unichar_isalpha(value);
break;
case 'C':
if (ctx->mode == TECO_MODE_NORMAL)
- result = g_ascii_isalnum((gchar)value) ||
+ result = g_unichar_isalnum(value) ||
value == '.' || value == '$' || value == '_';
break;
case 'D':
if (ctx->mode == TECO_MODE_NORMAL)
- result = g_ascii_isdigit((gchar)value);
+ result = g_unichar_isdigit(value);
break;
case 'I':
if (ctx->mode == TECO_MODE_NORMAL)
- result = G_IS_DIR_SEPARATOR((gchar)value);
+ result = G_IS_DIR_SEPARATOR(value);
break;
case 'S':
case 'T':
@@ -1582,15 +1607,15 @@ teco_state_condcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error
break;
case 'R':
if (ctx->mode == TECO_MODE_NORMAL)
- result = g_ascii_isalnum((gchar)value);
+ result = g_unichar_isalnum(value);
break;
case 'V':
if (ctx->mode == TECO_MODE_NORMAL)
- result = g_ascii_islower((gchar)value);
+ result = g_unichar_islower(value);
break;
case 'W':
if (ctx->mode == TECO_MODE_NORMAL)
- result = g_ascii_isupper((gchar)value);
+ result = g_unichar_isupper(value);
break;
default:
g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED,
@@ -1720,8 +1745,71 @@ teco_state_control_radix(teco_machine_main_t *ctx, GError **error)
}
}
+/*$ ^E glyphs2bytes bytes2glyphs
+ * glyphs^E -> bytes -- Translate between glyph and byte indexes
+ * bytes:^E -> glyphs
+ * ^E -> bytes
+ * :^E -> length
+ *
+ * Translates from glyph/character to byte indexes when called
+ * without a colon.
+ * Otherwise when colon-modified, translates from byte indexes
+ * back to glyph indexes.
+ * These values can differ in documents with multi-byte
+ * encodings (of which only UTF-8 is supported).
+ * It is especially useful to translate between these indexes
+ * when manually invoking Scintilla messages (\fBES\fP command), as
+ * they almost always take byte positions.
+ *
+ * When called without arguments, \fB^E\fP returns the current
+ * position (dot) in bytes.
+ * This is equivalent to, but faster than, \(lq.^E\(rq.
+ * \fB:^E\fP without arguments returns the length of the current
+ * document in bytes, which is equivalent to, but faster than, \(lqZ^E\(rq.
+ *
+ * When passing in indexes outside of the document's valid area,
+ * -1 is returned, so the return value can also be interpreted
+ * as a TECO boolean, signalling truth/success for invalid indexes.
+ * This provides an elegant and effective way to validate
+ * buffer addresses.
+ */
+static void
+teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error)
+{
+ teco_int_t res;
+
+ if (!teco_expressions_eval(FALSE, error))
+ return;
+
+ gboolean colon_modified = teco_machine_main_eval_colon(ctx);
+
+ if (!teco_expressions_args()) {
+ /*
+ * This is shorter than .^E or Z^E and avoids unnecessary glyph to
+ * byte index translations.
+ * On the other hand :^E is inconsistent, as it will return a byte
+ * index, instead of glyph index.
+ */
+ res = teco_interface_ssm(colon_modified ? SCI_GETLENGTH : SCI_GETCURRENTPOS, 0, 0);
+ } else {
+ teco_int_t pos;
+ if (!teco_expressions_pop_num_calc(&pos, 0, error))
+ return;
+ if (colon_modified) {
+ /* teco_interface_bytes2glyphs() does not check addresses */
+ res = 0 <= pos && pos <= teco_interface_ssm(SCI_GETLENGTH, 0, 0)
+ ? teco_interface_bytes2glyphs(pos) : -1;
+ } else {
+ /* negative values for invalid indexes are passed down. */
+ res = teco_interface_glyphs2bytes(pos);
+ }
+ }
+
+ teco_expressions_push(res);
+}
+
static teco_state_t *
-teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_control_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
static teco_machine_main_transition_t transitions[] = {
/*
@@ -1746,7 +1834,8 @@ teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
['C'] = {&teco_state_start, teco_state_control_exit},
['O'] = {&teco_state_start, teco_state_control_octal},
['D'] = {&teco_state_start, teco_state_control_decimal},
- ['R'] = {&teco_state_start, teco_state_control_radix}
+ ['R'] = {&teco_state_start, teco_state_control_radix},
+ ['E'] = {&teco_state_start, teco_state_control_glyphs2bytes}
};
/*
@@ -1761,7 +1850,7 @@ teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_control);
static teco_state_t *
-teco_state_ascii_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_ascii_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
if (ctx->mode == TECO_MODE_NORMAL)
teco_expressions_push(chr);
@@ -1797,7 +1886,7 @@ TECO_DEFINE_STATE(teco_state_ascii);
* only be seen when executing the following command.
*/
static teco_state_t *
-teco_state_escape_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_escape_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
/*$ ^[^[ ^[$ $$ terminate return
* [a1,a2,...]$$ -- Terminate command line or return from macro
@@ -1891,7 +1980,7 @@ TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_escape,
* when it comes to function key macro masking.
*/
.is_start = TRUE,
- .fnmacro_mask = TECO_FNMACRO_MASK_START
+ .keymacro_mask = TECO_KEYMACRO_MASK_START | TECO_KEYMACRO_MASK_CASEINSENSITIVE
);
/*$ EF close
@@ -1958,6 +2047,11 @@ teco_state_ecommand_close(teco_machine_main_t *ctx, GError **error)
* Without any argument ED returns the current flags.
*
* Currently, the following flags are used by \*(ST:
+ * - 4: If enabled, prefer raw single-byte ANSI encoding
+ * for all new buffers and registers.
+ * This does not change the encoding of any existing
+ * buffers and any initialized default register when set via
+ * \fBED\fP, so you might want to launch \*(ST with \fB--8bit\fP.
* - 8: Enable/disable automatic folding of case-insensitive
* command characters during interactive key translation.
* The case of letter keys is inverted, so one or two
@@ -1973,14 +2067,17 @@ teco_state_ecommand_close(teco_machine_main_t *ctx, GError **error)
* of files.
* - 32: Enable/Disable buffer editing hooks
* (via execution of macro in global Q-Register \(lqED\(rq)
- * - 64: Enable/Disable function key macros
* - 128: Enable/Disable enforcement of UNIX98
* \(lq/bin/sh\(rq emulation for operating system command
* executions
- * - 256: Enable/Disable \fBxterm\fP(1) clipboard support.
- * Should only be enabled if XTerm allows the
- * \fIGetSelection\fP and \fISetSelection\fP window
- * operations.
+ * - 256: Enable/Disable OSC-52 clipboard support.
+ * Must only be enabled if the terminal emulator is configured
+ * properly.
+ * - 512: Enable/Disable Unicode icons in the Curses UI.
+ * This requires a capable font, like the ones provided
+ * by the \(lqNerd Fonts\(rq project.
+ * Changes to this flag in interactive mode may not become
+ * effective immediately.
*
* The features controlled thus are described in other sections
* of this manual.
@@ -2098,6 +2195,12 @@ teco_state_ecommand_flags(teco_machine_main_t *ctx, GError **error)
* on exit the author is aware of is \fBxterm\fP(1) and
* the Linux console driver.
* You have been warned. Good luck.
+ * .IP 4
+ * The column after the last horizontal movement.
+ * This is only used by \fBfnkeys.tes\fP and is similar to the Scintilla-internal
+ * setting \fBSCI_CHOOSECARETX\fP.
+ * Unlike most other settings, this is deliberately not restored on rubout,
+ * so it "survives" command line replacements.
*/
static void
teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error)
@@ -2106,9 +2209,12 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error)
EJ_USER_INTERFACE = 0,
EJ_BUFFERS,
EJ_MEMORY_LIMIT,
- EJ_INIT_COLOR
+ EJ_INIT_COLOR,
+ EJ_CARETX
};
+ static teco_int_t caret_x = 0;
+
teco_int_t property;
if (!teco_expressions_eval(FALSE, error) ||
!teco_expressions_pop_num_calc(&property, teco_num_sign, error))
@@ -2144,6 +2250,10 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error)
teco_interface_init_color((guint)value, (guint32)color);
break;
+ case EJ_CARETX:
+ caret_x = value;
+ break;
+
default:
g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED,
"Cannot set property %" TECO_INT_FORMAT " "
@@ -2180,6 +2290,10 @@ teco_state_ecommand_properties(teco_machine_main_t *ctx, GError **error)
teco_expressions_push(teco_memory_limit);
break;
+ case EJ_CARETX:
+ teco_expressions_push(caret_x);
+ break;
+
default:
g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED,
"Invalid property %" TECO_INT_FORMAT " "
@@ -2292,6 +2406,252 @@ teco_state_ecommand_eol(teco_machine_main_t *ctx, GError **error)
}
}
+static const gchar *
+teco_codepage2str(guint codepage)
+{
+ /*
+ * The multi-byte charsets are excluded, since we don't
+ * support them in SciTECO, even though Scintilla has them.
+ * Contrary to the Scintilla documentation, Gtk supports
+ * most of them.
+ * Those that are supported are tested, so the codepage
+ * mapping should be definitive (although there could be
+ * similar related codepages).
+ */
+ switch (codepage) {
+ case SC_CP_UTF8: return "UTF-8";
+ case SC_CHARSET_ANSI:
+ case SC_CHARSET_DEFAULT: return "ISO-8859-1"; /* LATIN1 */
+ case SC_CHARSET_BALTIC: return "ISO-8859-13"; /* LATIN7 */
+ //case SC_CHARSET_CHINESEBIG5: return "BIG5";
+ case SC_CHARSET_EASTEUROPE: return "ISO-8859-2"; /* LATIN2 */
+ //case SC_CHARSET_GB2312: return "GB2312";
+ case SC_CHARSET_GREEK: return "ISO-8859-7"; // CP1253???
+ //case SC_CHARSET_HANGUL: return "UHC";
+ /* unsure whether this is supported on Gtk */
+ case SC_CHARSET_MAC: return "MAC";
+ /* not supported by Gtk */
+ case SC_CHARSET_OEM: return "CP437";
+ /*
+ * Apparently, this can be CP1251 on the native Windows
+ * port of Scintilla.
+ */
+ case SC_CHARSET_RUSSIAN: return "KOI8-R";
+ case SC_CHARSET_OEM866: return "CP866";
+ case SC_CHARSET_CYRILLIC: return "CP1251";
+ //case SC_CHARSET_SHIFTJIS: return "SHIFT-JIS";
+ //case SC_CHARSET_SYMBOL:
+ case SC_CHARSET_TURKISH: return "ISO-8859-9"; /* LATIN5 */
+ //case SC_CHARSET_JOHAB: return "JOHAB";
+ case SC_CHARSET_HEBREW: return "ISO-8859-8"; // CP1255?
+ /*
+ * FIXME: Some arabic codepage is supported by Gtk,
+ * but I am not sure which.
+ */
+ case SC_CHARSET_ARABIC: return "ISO-8859-6"; // CP720, CP1256???
+ /* apparently not supported by Gtk */
+ case SC_CHARSET_VIETNAMESE: return "CP1258";
+ case SC_CHARSET_THAI: return "ISO-8859-11";
+ case SC_CHARSET_8859_15: return "ISO-8859-15"; /* LATIN9 */
+ }
+
+ return NULL;
+}
+
+/*$ EE encoding codepage charset
+ * codepageEE -- Edit current document's encoding (codepage/charset)
+ * EE -> codepage
+ * codepage:EE
+ * :EE -> codepage
+ *
+ * When called with an argument, it sets the current codepage,
+ * otherwise returns it.
+ * The following codepages are supported:
+ * - 0: ANSI (raw bytes)
+ * - 1: ISO-8859-1 (latin1)
+ * - 77: Macintosh Latin encoding
+ * - 161: ISO-8859-7
+ * - 162: ISO-8859-9 (latin5)
+ * - 163: CP1258
+ * - 177: ISO-8859-8
+ * - 178: ISO-8859-6
+ * - 186: ISO-8859-13 (latin7)
+ * - 204: KOI8-R
+ * - 222: ISO-8859-11
+ * - 238: ISO-8859-2 (latin2)
+ * - 255: CP437
+ * - 866: CP866
+ * - 1000: ISO-8859-15 (latin9)
+ * - 1251: CP1251
+ * - 65001: UTF-8
+ *
+ * Displaying characters in the single-byte (non-UTF-8) codepages might
+ * be supported only with the Gtk UI.
+ * At least 77, 178, 163 and 255 are not displayed correctly on Gtk.
+ * 65001 (UTF-8) is the default for new buffers.
+ * 0 (ANSI) should be used when working with raw bytes,
+ * but is currently displayed like ISO-8859-1 (latin1).
+ *
+ * \fBEE\fP does not change the buffer contents itself by default, only
+ * how it is displayed and how \*(ST interacts with it.
+ * This allows fixing up the codepage if it is not in the default UTF-8
+ * or if codepage guessing failed.
+ *
+ * When colon-modified, the \fB:EE\fP command will additionally convert
+ * the current buffer contents into the new code page, preserving the
+ * current position (dot).
+ * This will fail if the conversion would be lossy.
+ * Conversions from and to UTF-8 \fIshould\fP always be successful.
+ */
+static void
+teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error)
+{
+ if (!teco_expressions_eval(FALSE, error))
+ return;
+
+ gboolean colon_modified = teco_machine_main_eval_colon(ctx);
+
+ guint old_cp = teco_interface_get_codepage();
+
+ if (!teco_expressions_args()) {
+ /* get current code page */
+ teco_expressions_push(old_cp);
+ return;
+ }
+
+ /*
+ * Set code page
+ */
+ teco_int_t new_cp;
+ if (!teco_expressions_pop_num_calc(&new_cp, 0, error))
+ return;
+
+ if (old_cp == SC_CP_UTF8 && new_cp == SC_CP_UTF8)
+ return;
+
+ if (teco_current_doc_must_undo() && teco_undo_enabled) {
+ if (old_cp == SC_CP_UTF8) { /* new_cp != SC_CP_UTF8 */
+ undo__teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
+ SC_LINECHARACTERINDEX_UTF32, 0);
+ undo__teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
+ } else {
+ undo__teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
+ for (gint style = 0; style <= STYLE_LASTPREDEFINED; style++)
+ undo__teco_interface_ssm(SCI_STYLESETCHARACTERSET, style, old_cp);
+ /*
+ * The index is internally reference-counted and could underflow,
+ * so don't do it more than necessary.
+ */
+ if (new_cp == SC_CP_UTF8)
+ undo__teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+ SC_LINECHARACTERINDEX_UTF32, 0);
+ }
+ }
+
+ teco_int_t dot_glyphs;
+ if (colon_modified) {
+ sptr_t dot_bytes = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
+ dot_glyphs = teco_interface_bytes2glyphs(dot_bytes);
+
+ /*
+ * Convert buffer to new codepage.
+ *
+ * FIXME: Could be optimized slightly by converting first
+ * before the gap, inserting the converted text and then
+ * converting after the gap.
+ */
+ const gchar *to_codepage = teco_codepage2str(new_cp);
+ const gchar *from_codepage = teco_codepage2str(old_cp);
+ if (!to_codepage || !from_codepage) {
+ g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED,
+ "Unknown or unsupported codepage/charset");
+ return;
+ }
+
+ const gchar *buf = (const gchar *)teco_interface_ssm(SCI_GETCHARACTERPOINTER, 0, 0);
+ gsize len = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
+ g_autofree gchar *converted;
+ gsize converted_len;
+
+ /*
+ * This fails if there is no direct translation.
+ * If we'd use g_convert_with_fallback(), it would be tricky to choose
+ * fallback characters that will always work.
+ */
+ converted = g_convert(buf, len, to_codepage, from_codepage,
+ NULL, &converted_len, error);
+ if (!converted)
+ return;
+
+ teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0);
+ teco_interface_ssm(SCI_CLEARALL, 0, 0);
+ teco_interface_ssm(SCI_APPENDTEXT, converted_len, (sptr_t)converted);
+ teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0);
+ teco_ring_dirtify();
+
+ if (teco_current_doc_must_undo()) {
+ undo__teco_interface_ssm(SCI_GOTOPOS, dot_bytes, 0);
+ undo__teco_interface_ssm(SCI_UNDO, 0, 0);
+ }
+ }
+
+ if (new_cp == SC_CP_UTF8) {
+ teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
+ /*
+ * UTF-8 documents strictly require the line character index.
+ * See teco_view_glyphs2bytes() and teco_view_bytes2glyphs().
+ */
+ g_assert(!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0)
+ & SC_LINECHARACTERINDEX_UTF32));
+ teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
+ SC_LINECHARACTERINDEX_UTF32, 0);
+ } else {
+ /*
+ * The index is NOT released automatically when setting the codepage.
+ * But it is internally reference-counted and could underflow,
+ * so don't do it more than necessary.
+ */
+ if (old_cp == SC_CP_UTF8) {
+ teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+ SC_LINECHARACTERINDEX_UTF32, 0);
+ g_assert(!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0)
+ & SC_LINECHARACTERINDEX_UTF32));
+ }
+
+ /*
+ * Configure a single-byte codepage/charset.
+ * This requires setting it on all of the possible styles.
+ * Unfortunately there can theoretically even be 255 (STYLE_MAX) styles.
+ * This is important only for display purposes - other than that
+ * all single-byte encodings are handled the same.
+ *
+ * FIXME: Should we avoid this if new_cp == 0?
+ * It will be used for raw byte handling mostly.
+ */
+ if (teco_current_doc_must_undo()) {
+ /*
+ * There is a chance the user will see this buffer even if we
+ * are currently in batch mode.
+ */
+ for (gint style = 0; style <= STYLE_LASTPREDEFINED; style++)
+ teco_interface_ssm(SCI_STYLESETCHARACTERSET, style, new_cp);
+ } else {
+ /* we must still set it, so that <EE> retrieval works */
+ teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, new_cp);
+ }
+ /* 0 is used for ALL single-byte encodings */
+ teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
+ }
+
+ if (colon_modified)
+ /*
+ * Only now, it will be safe to recalculate dot in the new encoding.
+ * If the new codepage is UTF-8, the line character index will be
+ * ready only now.
+ */
+ teco_interface_ssm(SCI_GOTOPOS, teco_interface_glyphs2bytes(dot_glyphs), 0);
+}
+
/*$ EX exit
* [bool]EX -- Exit program
* -EX
@@ -2352,7 +2712,7 @@ teco_state_ecommand_exit(teco_machine_main_t *ctx, GError **error)
}
static teco_state_t *
-teco_state_ecommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_ecommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
static teco_machine_main_transition_t transitions[] = {
/*
@@ -2377,6 +2737,7 @@ teco_state_ecommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
['D'] = {&teco_state_start, teco_state_ecommand_flags},
['J'] = {&teco_state_start, teco_state_ecommand_properties},
['L'] = {&teco_state_start, teco_state_ecommand_eol},
+ ['E'] = {&teco_state_start, teco_state_ecommand_encoding},
['X'] = {&teco_state_start, teco_state_ecommand_exit}
};
@@ -2395,26 +2756,61 @@ teco_state_insert_initial(teco_machine_main_t *ctx, GError **error)
if (ctx->mode > TECO_MODE_NORMAL)
return TRUE;
+ /*
+ * Current document's encoding determines the behaviour of
+ * string building constructs.
+ */
+ teco_machine_stringbuilding_set_codepage(&ctx->expectstring.machine,
+ teco_interface_get_codepage());
+
if (!teco_expressions_eval(FALSE, error))
return FALSE;
guint args = teco_expressions_args();
if (!args)
return TRUE;
- teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0);
- for (int i = args; i > 0; i--) {
- gchar chr = (gchar)teco_expressions_peek_num(i-1);
- teco_interface_ssm(SCI_ADDTEXT, 1, (sptr_t)&chr);
+ if (teco_interface_ssm(SCI_GETCODEPAGE, 0, 0) == SC_CP_UTF8) {
+ /* detect possible errors before introducing side effects */
+ for (gint i = args; i > 0; i--) {
+ teco_int_t chr = teco_expressions_peek_num(i-1);
+ if (chr < 0 || !g_unichar_validate(chr)) {
+ teco_error_codepoint_set(error, "I");
+ return FALSE;
+ }
+ }
+ teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0);
+ for (gint i = args; i > 0; i--) {
+ /* 4 bytes should be enough, but we better follow the documentation */
+ gchar buf[6];
+ gsize len = g_unichar_to_utf8(teco_expressions_peek_num(i-1), buf);
+ teco_interface_ssm(SCI_ADDTEXT, len, (sptr_t)buf);
+ }
+ } else {
+ /* everything else is a single-byte encoding */
+ for (gint i = args; i > 0; i--) {
+ teco_int_t chr = teco_expressions_peek_num(i-1);
+ if (chr < 0 || chr > 0xFF) {
+ teco_error_codepoint_set(error, "I");
+ return FALSE;
+ }
+ }
+ teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0);
+ for (gint i = args; i > 0; i--) {
+ gchar chr = (gchar)teco_expressions_peek_num(i-1);
+ teco_interface_ssm(SCI_ADDTEXT, 1, (sptr_t)&chr);
+ }
}
- for (int i = args; i > 0; i--)
- if (!teco_expressions_pop_num_calc(NULL, 0, error))
- return FALSE;
teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0);
teco_ring_dirtify();
if (teco_current_doc_must_undo())
undo__teco_interface_ssm(SCI_UNDO, 0, 0);
+ /* This is done only now because it can _theoretically_ fail. */
+ for (gint i = args; i > 0; i--)
+ if (!teco_expressions_pop_num_calc(NULL, 0, error))
+ return FALSE;
+
return TRUE;
}
@@ -2451,8 +2847,8 @@ teco_state_insert_process(teco_machine_main_t *ctx, const teco_string_t *str,
* Secondly, the command inserts <text>.
* In interactive mode, <text> is inserted interactively.
*
- * String building characters are \fBenabled\fP for the
- * I command.
+ * Unlike in classic TECO dialects, string building characters are
+ * \fBenabled\fP for the \fBI\fP command.
* When editing \*(ST macros, using the \fBEI\fP command
* may be better, since it has string building characters
* disabled.
@@ -2491,10 +2887,9 @@ teco_state_insert_indent_initial(teco_machine_main_t *ctx, GError **error)
len -= teco_interface_ssm(SCI_GETCOLUMN,
teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0), 0) % len;
- gchar spaces[len];
-
- memset(spaces, ' ', sizeof(spaces));
- teco_interface_ssm(SCI_ADDTEXT, sizeof(spaces), (sptr_t)spaces);
+ gchar space = ' ';
+ while (len-- > 0)
+ teco_interface_ssm(SCI_ADDTEXT, 1, (sptr_t)&space);
}
teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0);
teco_ring_dirtify();