diff options
-rw-r--r-- | doc/sciteco.7.template | 20 | ||||
-rw-r--r-- | src/cmdline.c | 79 | ||||
-rw-r--r-- | src/cmdline.h | 12 | ||||
-rw-r--r-- | src/core-commands.c | 25 | ||||
-rw-r--r-- | src/core-commands.h | 2 | ||||
-rw-r--r-- | src/error.h | 4 | ||||
-rw-r--r-- | src/expressions.c | 3 | ||||
-rw-r--r-- | src/expressions.h | 2 | ||||
-rw-r--r-- | src/goto-commands.c | 6 | ||||
-rw-r--r-- | src/help.c | 2 | ||||
-rw-r--r-- | src/interface-curses/interface.c | 24 | ||||
-rw-r--r-- | src/interface-gtk/interface.c | 10 | ||||
-rw-r--r-- | src/parser.c | 157 | ||||
-rw-r--r-- | src/parser.h | 30 | ||||
-rw-r--r-- | src/qreg-commands.c | 9 | ||||
-rw-r--r-- | src/qreg-commands.h | 6 | ||||
-rw-r--r-- | src/qreg.c | 40 | ||||
-rw-r--r-- | src/qreg.h | 2 | ||||
-rw-r--r-- | src/rb3str.c | 8 | ||||
-rw-r--r-- | src/rb3str.h | 2 | ||||
-rw-r--r-- | src/sciteco.h | 2 | ||||
-rw-r--r-- | src/search.c | 8 | ||||
-rw-r--r-- | src/spawn.c | 4 | ||||
-rw-r--r-- | src/string-utils.c | 36 | ||||
-rw-r--r-- | src/string-utils.h | 19 | ||||
-rw-r--r-- | src/symbols.c | 2 | ||||
-rw-r--r-- | src/undo.c | 2 | ||||
-rw-r--r-- | src/undo.h | 4 | ||||
-rw-r--r-- | tests/testsuite.at | 7 |
29 files changed, 325 insertions, 202 deletions
diff --git a/doc/sciteco.7.template b/doc/sciteco.7.template index a5b7f4a..d0574d7 100644 --- a/doc/sciteco.7.template +++ b/doc/sciteco.7.template @@ -87,10 +87,6 @@ regular commands for command-line editing. . When the user presses a key or key-combination it is first translated to an UTF-8 string. -All immediate editing commands and regular \*(ST commands however operate on -a language based solely on -.B ASCII -codes, which is a subset of Unicode. The rules for translating keys are as follows: .RS .IP 1. 4 @@ -138,6 +134,18 @@ This feature is called function key macros and explained in the next subsection. .RE . +.LP +All immediate editing commands and regular \*(ST commands however operate on +a language based solely on +.B ASCII +codes, which is a subset of Unicode. +\# This is because we cannot assume the presence of any particular non-ANSI +\# symbol on a user's keyboard. +Since the \*(ST parser is Unicode-aware, this does not exclude +using Unicode glyphs wherever a single character is expected, +ie. \fB^^\fIx\fR and \fBU\fIq\fR works with arbitrary Unicode glyphs. +All \*(ST macros must be in valid UTF-8. +. .SS Function Key Macros . .SCITECO_TOPIC "function key" @@ -1082,8 +1090,8 @@ Consequently when querying the code at a character position or inserting characters by code, the code may be an Unicode codepoint instead of byte-sized integer. .LP -Currently, \*(ST supports UTF-8 and single-byte ANSI encodings, -that can also be used for editing raw binary files. +Currently, \*(ST supports buffers in UTF-8 and single-byte +ANSI encodings, that can also be used for editing raw binary files. \# You can configure other single-byte code pages with EE, \# but there isn't yet any way to insert characters. UTF-8 is the default codepage for new buffers and Q-Registers diff --git a/src/cmdline.c b/src/cmdline.c index 47ef86f..be7a5b1 100644 --- a/src/cmdline.c +++ b/src/cmdline.c @@ -194,7 +194,7 @@ teco_cmdline_rubin(GError **error) } gboolean -teco_cmdline_keypress_c(gchar key, GError **error) +teco_cmdline_keypress_wc(gunichar key, GError **error) { teco_machine_t *machine = &teco_cmdline.machine.parent; g_autoptr(GError) tmp_error = NULL; @@ -283,6 +283,30 @@ teco_cmdline_keypress_c(gchar key, GError **error) return TRUE; } +/* + * FIXME: If one character causes an error, we should rub out the + * entire string. + * Usually it will be called only with single keys (strings containing + * single codepoints), but especially teco_cmdline_fnmacro() can emulate + * many key presses at once. + */ +gboolean +teco_cmdline_keypress(const gchar *str, gsize len, GError **error) +{ + for (guint i = 0; i < len; i += g_utf8_next_char(str+i) - (str+i)) { + gunichar chr = g_utf8_get_char_validated(str+i, len-i); + if ((gint32)chr < 0) { + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_CODEPOINT, + "Invalid UTF-8 sequence"); + return FALSE; + } + if (!teco_cmdline_keypress_wc(chr, error)) + return FALSE; + } + + return TRUE; +} + gboolean teco_cmdline_fnmacro(const gchar *name, GError **error) { @@ -361,7 +385,7 @@ teco_cmdline_cleanup(void) */ gboolean -teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error) +teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error) { switch (key) { case '\n': /* insert EOL sequence */ @@ -431,23 +455,30 @@ teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gch } teco_interface_popup_clear(); - return teco_cmdline_insert(&key, sizeof(key), error); + + gchar buf[6]; + gsize len = g_unichar_to_utf8(key, buf); + return teco_cmdline_insert(buf, len, error); } gboolean -teco_state_caseinsensitive_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error) +teco_state_caseinsensitive_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error) { + /* + * Auto case folding is for syntactic characters, + * so this could be done by working only with a-z and A-Z. + * However, it's also not speed critical. + */ if (teco_ed & TECO_ED_AUTOCASEFOLD) - /* will not modify non-letter keys */ - key = g_ascii_islower(key) ? g_ascii_toupper(key) - : g_ascii_tolower(key); + key = g_unichar_islower(key) ? g_unichar_toupper(key) + : g_unichar_tolower(key); return teco_state_process_edit_cmd(ctx, parent_ctx, key, error); } gboolean teco_state_stringbuilding_start_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx, - gchar key, GError **error) + gunichar key, GError **error) { teco_state_t *current = ctx->parent.current; @@ -597,7 +628,7 @@ teco_state_stringbuilding_start_process_edit_cmd(teco_machine_stringbuilding_t * gboolean teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx, - gchar chr, GError **error) + gunichar chr, GError **error) { g_assert(ctx->machine_qregspec != NULL); /* We downcast since teco_machine_qregspec_t is private in qreg.c */ @@ -606,7 +637,7 @@ teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuilding_t *c } gboolean -teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error) +teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error) { teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine; teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current; @@ -614,7 +645,7 @@ teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_ } gboolean -teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error) +teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error) { teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine; teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current; @@ -650,7 +681,7 @@ teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *par } gboolean -teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error) +teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error) { teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine; teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current; @@ -720,8 +751,8 @@ teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t gboolean unambiguous = teco_file_auto_complete(ctx->expectstring.string.data, G_FILE_TEST_EXISTS, &new_chars); teco_machine_stringbuilding_escape(stringbuilding_ctx, new_chars.data, new_chars.len, &new_chars_escaped); if (unambiguous && ctx->expectstring.nesting == 1) - teco_string_append_c(&new_chars_escaped, - ctx->expectstring.machine.escape_char == '{' ? '}' : ctx->expectstring.machine.escape_char); + teco_string_append_wc(&new_chars_escaped, + ctx->expectstring.machine.escape_char == '{' ? '}' : ctx->expectstring.machine.escape_char); return teco_cmdline_insert(new_chars_escaped.data, new_chars_escaped.len, error); } @@ -731,7 +762,7 @@ teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t } gboolean -teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error) +teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error) { teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine; teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current; @@ -773,7 +804,7 @@ teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t * } gboolean -teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error) +teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error) { g_assert(ctx->expectqreg != NULL); /* @@ -785,7 +816,7 @@ teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t } gboolean -teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error) +teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error) { switch (key) { case '\t': { /* autocomplete Q-Register name */ @@ -820,7 +851,7 @@ teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_ } gboolean -teco_state_qregspec_string_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error) +teco_state_qregspec_string_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error) { teco_machine_stringbuilding_t *stringbuilding_ctx = teco_machine_qregspec_get_stringbuilding(ctx); teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current; @@ -860,7 +891,7 @@ teco_state_qregspec_string_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_m } gboolean -teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error) +teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error) { teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine; teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current; @@ -905,7 +936,7 @@ teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *pa } gboolean -teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error) +teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error) { teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine; teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current; @@ -950,7 +981,7 @@ teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_mac } gboolean -teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error) +teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error) { teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine; teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current; @@ -997,7 +1028,7 @@ teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *paren } gboolean -teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error) +teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error) { teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine; teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current; @@ -1028,8 +1059,8 @@ teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *paren gboolean unambiguous = teco_help_auto_complete(ctx->expectstring.string.data, &new_chars); teco_machine_stringbuilding_escape(stringbuilding_ctx, new_chars.data, new_chars.len, &new_chars_escaped); if (unambiguous && ctx->expectstring.nesting == 1) - teco_string_append_c(&new_chars_escaped, - ctx->expectstring.machine.escape_char == '{' ? '}' : ctx->expectstring.machine.escape_char); + teco_string_append_wc(&new_chars_escaped, + ctx->expectstring.machine.escape_char == '{' ? '}' : ctx->expectstring.machine.escape_char); return new_chars_escaped.len ? teco_cmdline_insert(new_chars_escaped.data, new_chars_escaped.len, error) : TRUE; } diff --git a/src/cmdline.h b/src/cmdline.h index 7f40b5f..78d101c 100644 --- a/src/cmdline.h +++ b/src/cmdline.h @@ -64,16 +64,8 @@ gboolean teco_cmdline_insert(const gchar *data, gsize len, GError **error); gboolean teco_cmdline_rubin(GError **error); -gboolean teco_cmdline_keypress_c(gchar key, GError **error); - -static inline gboolean -teco_cmdline_keypress(const gchar *str, gsize len, GError **error) -{ - for (guint i = 0; i < len; i++) - if (!teco_cmdline_keypress_c(str[i], error)) - return FALSE; - return TRUE; -} +gboolean teco_cmdline_keypress_wc(gunichar key, GError **error); +gboolean teco_cmdline_keypress(const gchar *str, gsize len, GError **error); gboolean teco_cmdline_fnmacro(const gchar *name, GError **error); diff --git a/src/core-commands.c b/src/core-commands.c index 3686624..ef763d5 100644 --- a/src/core-commands.c +++ b/src/core-commands.c @@ -45,7 +45,7 @@ #include "goto-commands.h" #include "core-commands.h" -static teco_state_t *teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error); +static teco_state_t *teco_state_control_input(teco_machine_main_t *ctx, gunichar chr, GError **error); /* * NOTE: This needs some extra code in teco_state_start_input(). @@ -1049,7 +1049,7 @@ teco_state_start_get(teco_machine_main_t *ctx, GError **error) } static teco_state_t * -teco_state_start_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_start_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { static teco_machine_main_transition_t transitions[] = { /* @@ -1388,7 +1388,7 @@ teco_state_fcommand_cond_else(teco_machine_main_t *ctx, GError **error) } static teco_state_t * -teco_state_fcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_fcommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { static teco_machine_main_transition_t transitions[] = { /* @@ -1512,7 +1512,7 @@ teco_state_changedir_done(teco_machine_main_t *ctx, const teco_string_t *str, GE TECO_DEFINE_STATE_EXPECTDIR(teco_state_changedir); static teco_state_t * -teco_state_condcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_condcommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { teco_int_t value = 0; gboolean result = TRUE; @@ -1800,7 +1800,7 @@ teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error) } static teco_state_t * -teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_control_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { static teco_machine_main_transition_t transitions[] = { /* @@ -1841,10 +1841,10 @@ teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error) TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_control); static teco_state_t * -teco_state_ascii_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_ascii_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { if (ctx->mode == TECO_MODE_NORMAL) - teco_expressions_push((guchar)chr); + teco_expressions_push(chr); return &teco_state_start; } @@ -1877,7 +1877,7 @@ TECO_DEFINE_STATE(teco_state_ascii); * only be seen when executing the following command. */ static teco_state_t * -teco_state_escape_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_escape_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { /*$ ^[^[ ^[$ $$ terminate return * [a1,a2,...]$$ -- Terminate command line or return from macro @@ -2700,7 +2700,7 @@ teco_state_ecommand_exit(teco_machine_main_t *ctx, GError **error) } static teco_state_t * -teco_state_ecommand_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_ecommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { static teco_machine_main_transition_t transitions[] = { /* @@ -2874,10 +2874,9 @@ teco_state_insert_indent_initial(teco_machine_main_t *ctx, GError **error) len -= teco_interface_ssm(SCI_GETCOLUMN, teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0), 0) % len; - gchar spaces[len]; - - memset(spaces, ' ', sizeof(spaces)); - teco_interface_ssm(SCI_ADDTEXT, sizeof(spaces), (sptr_t)spaces); + gchar space = ' '; + while (len-- > 0) + teco_interface_ssm(SCI_ADDTEXT, 1, (sptr_t)&space); } teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0); teco_ring_dirtify(); diff --git a/src/core-commands.h b/src/core-commands.h index 370c7ba..e30770d 100644 --- a/src/core-commands.h +++ b/src/core-commands.h @@ -43,7 +43,7 @@ gboolean teco_state_insert_process(teco_machine_main_t *ctx, const teco_string_t gsize new_chars, GError **error); /* in cmdline.c */ -gboolean teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error); +gboolean teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error); /** * @class TECO_DEFINE_STATE_INSERT diff --git a/src/error.h b/src/error.h index f60be1a..7543d02 100644 --- a/src/error.h +++ b/src/error.h @@ -61,10 +61,10 @@ typedef enum { } teco_error_t; static inline void -teco_error_syntax_set(GError **error, gchar chr) +teco_error_syntax_set(GError **error, gunichar chr) { g_set_error(error, TECO_ERROR, TECO_ERROR_SYNTAX, - "Syntax error \"%c\" (%d)", chr, chr); + "Syntax error \"%C\" (U+%04" G_GINT32_MODIFIER "X)", chr, chr); } static inline void diff --git a/src/expressions.c b/src/expressions.c index ef785e0..1ba8706 100644 --- a/src/expressions.c +++ b/src/expressions.c @@ -114,10 +114,11 @@ teco_expressions_pop_num_calc(teco_int_t *ret, teco_int_t imply, GError **error) } void -teco_expressions_add_digit(gchar digit) +teco_expressions_add_digit(gunichar digit) { teco_int_t n = teco_expressions_args() > 0 ? teco_expressions_pop_num(0) : 0; + /* use g_unichar_digit_value()? */ teco_expressions_push(n*teco_radix + (n < 0 ? -1 : 1)*(digit - '0')); } diff --git a/src/expressions.h b/src/expressions.h index 24c5eff..68d8ddb 100644 --- a/src/expressions.h +++ b/src/expressions.h @@ -123,7 +123,7 @@ teco_int_t teco_expressions_peek_num(guint index); teco_int_t teco_expressions_pop_num(guint index); gboolean teco_expressions_pop_num_calc(teco_int_t *ret, teco_int_t imply, GError **error); -void teco_expressions_add_digit(gchar digit); +void teco_expressions_add_digit(gunichar digit); void teco_expressions_push_op(teco_operator_t op); gboolean teco_expressions_push_calc(teco_operator_t op, GError **error); diff --git a/src/goto-commands.c b/src/goto-commands.c index 2326f64..bf80c0b 100644 --- a/src/goto-commands.c +++ b/src/goto-commands.c @@ -53,7 +53,7 @@ teco_state_label_initial(teco_machine_main_t *ctx, GError **error) * I'm unsure whether !-signs should be allowed within comments. */ static teco_state_t * -teco_state_label_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_label_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { if (chr == '!') { /* @@ -85,7 +85,7 @@ teco_state_label_input(teco_machine_main_t *ctx, gchar chr, GError **error) if (ctx->parent.must_undo) undo__teco_string_truncate(&ctx->goto_label, ctx->goto_label.len); - teco_string_append_c(&ctx->goto_label, chr); + teco_string_append_wc(&ctx->goto_label, chr); return &teco_state_label; } @@ -138,7 +138,7 @@ teco_state_goto_done(teco_machine_main_t *ctx, const teco_string_t *str, GError } /* in cmdline.c */ -gboolean teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error); +gboolean teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error); /*$ O * Olabel$ -- Go to label @@ -314,7 +314,7 @@ teco_state_help_done(teco_machine_main_t *ctx, const teco_string_t *str, GError } /* in cmdline.c */ -gboolean teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error); +gboolean teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error); /*$ "?" help * ?[topic]$ -- Get help for topic diff --git a/src/interface-curses/interface.c b/src/interface-curses/interface.c index 443a903..96254a9 100644 --- a/src/interface-curses/interface.c +++ b/src/interface-curses/interface.c @@ -1582,6 +1582,9 @@ teco_interface_blocking_getch(void) void teco_interface_event_loop_iter(void) { + static gchar keybuf[4]; + static gint keybuf_i = 0; + gint key = g_queue_is_empty(teco_interface.input_queue) ? teco_interface_blocking_getch() : GPOINTER_TO_INT(g_queue_pop_head(teco_interface.input_queue)); @@ -1610,14 +1613,14 @@ teco_interface_event_loop_iter(void) * backspace. * In SciTECO backspace is normalized to ^H. */ - if (!teco_cmdline_keypress_c(TECO_CTL_KEY('H'), - &teco_interface.event_loop_error)) + if (!teco_cmdline_keypress_wc(TECO_CTL_KEY('H'), + &teco_interface.event_loop_error)) return; break; case KEY_ENTER: case '\r': case '\n': - if (!teco_cmdline_keypress_c('\n', &teco_interface.event_loop_error)) + if (!teco_cmdline_keypress_wc('\n', &teco_interface.event_loop_error)) return; break; @@ -1658,8 +1661,19 @@ teco_interface_event_loop_iter(void) * Control keys and keys with printable representation */ default: - if (key <= 0xFF && - !teco_cmdline_keypress_c(key, &teco_interface.event_loop_error)) + if (key > 0xFF) + return; + + /* + * NOTE: There's also wget_wch(), but it requires + * a widechar version of Curses. + */ + keybuf[keybuf_i++] = key; + gunichar cp = g_utf8_get_char_validated(keybuf, keybuf_i); + if (keybuf_i >= sizeof(keybuf) || cp != (gunichar)-2) + keybuf_i = 0; + if ((gint32)cp < 0 || + !teco_cmdline_keypress_wc(cp, &teco_interface.event_loop_error)) return; } diff --git a/src/interface-gtk/interface.c b/src/interface-gtk/interface.c index 2ad8335..9c1ce6a 100644 --- a/src/interface-gtk/interface.c +++ b/src/interface-gtk/interface.c @@ -927,19 +927,19 @@ teco_interface_handle_key_press(GdkEventKey *event, GError **error) switch (event->keyval) { case GDK_KEY_Escape: - if (!teco_cmdline_keypress_c('\e', error)) + if (!teco_cmdline_keypress_wc('\e', error)) return FALSE; break; case GDK_KEY_BackSpace: - if (!teco_cmdline_keypress_c(TECO_CTL_KEY('H'), error)) + if (!teco_cmdline_keypress_wc(TECO_CTL_KEY('H'), error)) return FALSE; break; case GDK_KEY_Tab: - if (!teco_cmdline_keypress_c('\t', error)) + if (!teco_cmdline_keypress_wc('\t', error)) return FALSE; break; case GDK_KEY_Return: - if (!teco_cmdline_keypress_c('\n', error)) + if (!teco_cmdline_keypress_wc('\n', error)) return FALSE; break; @@ -994,7 +994,7 @@ teco_interface_handle_key_press(GdkEventKey *event, GError **error) if ((event->state & (GDK_CONTROL_MASK | GDK_MOD1_MASK)) == GDK_CONTROL_MASK) { gchar c = teco_interface_get_ansi_key(event); if (c) { - if (!teco_cmdline_keypress_c(TECO_CTL_KEY(g_ascii_toupper(c)), error)) + if (!teco_cmdline_keypress_wc(TECO_CTL_KEY(g_ascii_toupper(c)), error)) return FALSE; break; } diff --git a/src/parser.c b/src/parser.c index ed21740..321803a 100644 --- a/src/parser.c +++ b/src/parser.c @@ -59,7 +59,7 @@ teco_loop_stack_cleanup(void) } gboolean -teco_machine_input(teco_machine_t *ctx, gchar chr, GError **error) +teco_machine_input(teco_machine_t *ctx, gunichar chr, GError **error) { teco_state_t *next = ctx->current->input_cb(ctx, chr, error); if (!next) @@ -86,10 +86,20 @@ teco_state_end_of_macro(teco_machine_t *ctx, GError **error) } /** + * Execute macro from current PC to stop position. + * * Handles all expected exceptions and preparing them for stack frame insertion. + * + * @param ctx State machine. + * @param macro The macro to execute. + * It does not have to be complete. + * It must consist only of validated UTF-8 sequences, though. + * @param stop_pos Where to stop execution in bytes. + * @param error Location to store error. + * @return FALSE if an error occurred. */ gboolean -teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gint stop_pos, GError **error) +teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gsize stop_pos, GError **error) { while (ctx->macro_pc < stop_pos) { #ifdef DEBUG @@ -110,9 +120,13 @@ teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gint stop_p if (!teco_memory_check(0, error)) goto error_attach; - if (!teco_machine_input(&ctx->parent, macro[ctx->macro_pc], error)) + /* UTF-8 sequences are already validated */ + gunichar chr = g_utf8_get_char(macro+ctx->macro_pc); + + if (!teco_machine_input(&ctx->parent, chr, error)) goto error_attach; - ctx->macro_pc++; + + ctx->macro_pc = g_utf8_next_char(macro+ctx->macro_pc) - macro; } /* @@ -145,6 +159,20 @@ teco_execute_macro(const gchar *macro, gsize macro_len, teco_qreg_table_t *qreg_table_locals, GError **error) { /* + * Validate UTF-8, but accept null bytes. + * NOTE: there is g_utf8_validate_len() in Glib 2.60 + */ + const gchar *p = macro; + while (!g_utf8_validate(p, macro_len - (p - macro), &p) && !*p) + p++; + if (p - macro < macro_len) { + g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT, + "Invalid UTF-8 byte sequence at %" G_GSIZE_FORMAT, + p - macro); + return FALSE; + } + + /* * This is not auto-cleaned up, so it can be initialized * on demand. */ @@ -309,26 +337,26 @@ teco_machine_main_eval_colon(teco_machine_main_t *ctx) teco_state_t * teco_machine_main_transition_input(teco_machine_main_t *ctx, teco_machine_main_transition_t *transitions, - guint len, gchar chr, GError **error) + guint len, gunichar chr, GError **error) { - if (chr < 0 || chr >= len || !transitions[(guint)chr].next) { + if (chr >= len || !transitions[chr].next) { teco_error_syntax_set(error, chr); return NULL; } - if (ctx->mode == TECO_MODE_NORMAL && transitions[(guint)chr].transition_cb) { + if (ctx->mode == TECO_MODE_NORMAL && transitions[chr].transition_cb) { /* * NOTE: We could also just let transition_cb return a boolean... */ GError *tmp_error = NULL; - transitions[(guint)chr].transition_cb(ctx, &tmp_error); + transitions[chr].transition_cb(ctx, &tmp_error); if (tmp_error) { g_propagate_error(error, tmp_error); return NULL; } } - return transitions[(guint)chr].next; + return transitions[chr].next; } void @@ -342,11 +370,11 @@ teco_machine_main_clear(teco_machine_main_t *ctx) * FIXME: All teco_state_stringbuilding_* states could be static? */ static teco_state_t *teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, - gchar chr, GError **error); + gunichar chr, GError **error); TECO_DECLARE_STATE(teco_state_stringbuilding_ctl); static teco_state_t *teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, - gchar chr, GError **error); + gunichar chr, GError **error); TECO_DECLARE_STATE(teco_state_stringbuilding_escaped); TECO_DECLARE_STATE(teco_state_stringbuilding_lower); @@ -360,7 +388,7 @@ TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_quote); TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_n); static teco_state_t * -teco_state_stringbuilding_start_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_start_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { if (chr == '^') return &teco_state_stringbuilding_ctl; @@ -372,7 +400,7 @@ teco_state_stringbuilding_start_input(teco_machine_stringbuilding_t *ctx, gchar /* in cmdline.c */ gboolean teco_state_stringbuilding_start_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx, - gchar key, GError **error); + gunichar key, GError **error); TECO_DEFINE_STATE(teco_state_stringbuilding_start, .is_start = TRUE, @@ -381,7 +409,7 @@ TECO_DEFINE_STATE(teco_state_stringbuilding_start, ); static teco_state_t * -teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { chr = teco_ascii_toupper(chr); @@ -396,40 +424,50 @@ teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gchar ch chr = TECO_CTL_KEY(chr); } + /* + * Source code is always in UTF-8, so it does not + * make sense to handle ctx->codepage != SC_CP_UTF8 + * separately. + */ if (ctx->result) - teco_string_append_c(ctx->result, chr); + teco_string_append_wc(ctx->result, chr); return &teco_state_stringbuilding_start; } TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_stringbuilding_ctl); static teco_state_t * -teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { if (!ctx->result) /* parse-only mode */ return &teco_state_stringbuilding_start; - /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */ + /* + * The subtle difference between UTF-8 and single-byte targets + * is that we don't try to casefold non-ANSI characters in single-byte mode. + */ switch (ctx->mode) { case TECO_STRINGBUILDING_MODE_UPPER: - chr = g_ascii_toupper(chr); + chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80 + ? g_unichar_toupper(chr) : chr; break; case TECO_STRINGBUILDING_MODE_LOWER: - chr = g_ascii_tolower(chr); + chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80 + ? g_unichar_tolower(chr) : chr; break; default: break; } - teco_string_append_c(ctx->result, chr); + teco_string_append_wc(ctx->result, chr); return &teco_state_stringbuilding_start; } TECO_DEFINE_STATE(teco_state_stringbuilding_escaped); static teco_state_t * -teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { if (!ctx->result) /* parse-only mode */ @@ -443,8 +481,9 @@ teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar teco_undo_guint(ctx->mode); ctx->mode = TECO_STRINGBUILDING_MODE_LOWER; } else { - /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */ - teco_string_append_c(ctx->result, g_ascii_tolower(chr)); + chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80 + ? g_unichar_tolower(chr) : chr; + teco_string_append_wc(ctx->result, chr); } return &teco_state_stringbuilding_start; @@ -453,7 +492,7 @@ teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar TECO_DEFINE_STATE(teco_state_stringbuilding_lower); static teco_state_t * -teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { if (!ctx->result) /* parse-only mode */ @@ -467,8 +506,9 @@ teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar teco_undo_guint(ctx->mode); ctx->mode = TECO_STRINGBUILDING_MODE_UPPER; } else { - /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */ - teco_string_append_c(ctx->result, g_ascii_toupper(chr)); + chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80 + ? g_unichar_toupper(chr) : chr; + teco_string_append_wc(ctx->result, chr); } return &teco_state_stringbuilding_start; @@ -477,7 +517,7 @@ teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar TECO_DEFINE_STATE(teco_state_stringbuilding_upper); static teco_state_t * -teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { teco_state_t *next; @@ -489,8 +529,9 @@ teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gchar c case 'N': next = &teco_state_stringbuilding_ctle_n; break; default: if (ctx->result) { - gchar buf[] = {TECO_CTL_KEY('E'), chr}; - teco_string_append(ctx->result, buf, sizeof(buf)); + gchar buf[1+6] = {TECO_CTL_KEY('E')}; + gsize len = g_unichar_to_utf8(chr, buf+1); + teco_string_append(ctx->result, buf, 1+len); } return &teco_state_stringbuilding_start; } @@ -508,7 +549,7 @@ TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_stringbuilding_ctle); /* in cmdline.c */ gboolean teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx, - gchar chr, GError **error); + gunichar chr, GError **error); /** * @interface TECO_DEFINE_STATE_STRINGBUILDING_QREG @@ -523,7 +564,7 @@ gboolean teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuil ) static teco_state_t * -teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { teco_qreg_t *qreg; @@ -558,7 +599,7 @@ teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gch TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_num); static teco_state_t * -teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { teco_qreg_t *qreg; @@ -583,10 +624,7 @@ teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar if (ctx->codepage == SC_CP_UTF8) { if (value < 0 || !g_unichar_validate(value)) goto error_codepoint; - /* 4 bytes should be enough, but we better follow the documentation */ - gchar buf[6]; - gsize len = g_unichar_to_utf8(value, buf); - teco_string_append(ctx->result, buf, len); + teco_string_append_wc(ctx->result, value); } else { if (value < 0 || value > 0xFF) goto error_codepoint; @@ -606,7 +644,7 @@ error_codepoint: { TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_u); static teco_state_t * -teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { teco_qreg_t *qreg; @@ -637,7 +675,7 @@ teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gchar TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_q); static teco_state_t * -teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { teco_qreg_t *qreg; teco_qreg_table_t *table; @@ -680,7 +718,7 @@ teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, g TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_quote); static teco_state_t * -teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { teco_qreg_t *qreg; teco_qreg_table_t *table; @@ -717,7 +755,7 @@ teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gchar TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_n); void -teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escape_char, +teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gunichar escape_char, teco_qreg_table_t *locals, gboolean must_undo) { memset(ctx, 0, sizeof(*ctx)); @@ -738,6 +776,10 @@ teco_machine_stringbuilding_reset(teco_machine_stringbuilding_t *ctx) ctx->mode = TECO_STRINGBUILDING_MODE_NORMAL; } +/* + * If we case folded only ANSI characters as in teco_ascii_toupper(), + * this could be simplified. + */ void teco_machine_stringbuilding_escape(teco_machine_stringbuilding_t *ctx, const gchar *str, gsize len, teco_string_t *target) @@ -745,12 +787,18 @@ teco_machine_stringbuilding_escape(teco_machine_stringbuilding_t *ctx, const gch target->data = g_malloc(len*2+1); target->len = 0; - for (guint i = 0; i < len; i++) { - if (teco_ascii_toupper(str[i]) == ctx->escape_char || - (ctx->escape_char == '[' && str[i] == ']') || - (ctx->escape_char == '{' && str[i] == '}')) + for (guint i = 0; i < len; ) { + gunichar chr = g_utf8_get_char(str+i); + + if (g_unichar_toupper(chr) == ctx->escape_char || + (ctx->escape_char == '[' && chr == ']') || + (ctx->escape_char == '{' && chr == '}')) target->data[target->len++] = TECO_CTL_KEY('Q'); - target->data[target->len++] = str[i]; + + gsize lenc = g_utf8_next_char(str+i) - (str+i); + memcpy(target->data+target->len, str+i, lenc); + target->len += lenc; + i += lenc; } target->data[target->len] = '\0'; @@ -772,7 +820,7 @@ teco_state_expectstring_initial(teco_machine_main_t *ctx, GError **error) } teco_state_t * -teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_expectstring_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { teco_state_t *current = ctx->parent.current; @@ -789,13 +837,18 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro /* * FIXME: Exclude setting at least whitespace characters as the * new string escape character to avoid accidental errors? + * + * FIXME: Should we perhaps restrict case folding escape characters + * to the ANSI range (teco_ascii_toupper())? + * This would be faster than case folding each and every character + * of a string argument to check against the escape char. */ switch (ctx->expectstring.machine.escape_char) { case '\e': case '{': if (ctx->parent.must_undo) - teco_undo_gchar(ctx->expectstring.machine.escape_char); - ctx->expectstring.machine.escape_char = teco_ascii_toupper(chr); + teco_undo_gunichar(ctx->expectstring.machine.escape_char); + ctx->expectstring.machine.escape_char = g_unichar_toupper(chr); return current; } } @@ -819,7 +872,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro ctx->expectstring.nesting--; break; } - } else if (teco_ascii_toupper(chr) == ctx->expectstring.machine.escape_char) { + } else if (g_unichar_toupper(chr) == ctx->expectstring.machine.escape_char) { if (ctx->parent.must_undo) teco_undo_gint(ctx->expectstring.nesting); ctx->expectstring.nesting--; @@ -849,7 +902,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro if (current->expectstring.last) { if (ctx->parent.must_undo) - teco_undo_gchar(ctx->expectstring.machine.escape_char); + teco_undo_gunichar(ctx->expectstring.machine.escape_char); ctx->expectstring.machine.escape_char = '\e'; } ctx->expectstring.nesting = 1; @@ -880,7 +933,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro if (!teco_machine_stringbuilding_input(&ctx->expectstring.machine, chr, str, error)) return NULL; } else if (ctx->mode == TECO_MODE_NORMAL) { - teco_string_append_c(&ctx->expectstring.string, chr); + teco_string_append_wc(&ctx->expectstring.string, chr); } /* @@ -924,7 +977,7 @@ teco_state_expectfile_process(teco_machine_main_t *ctx, const teco_string_t *str g_assert(str->data != NULL); /* - * Null-chars must not ocur in filename/path strings and at some point + * Null-chars must not occur in filename/path strings and at some point * teco_string_t has to be converted to a null-terminated C string * as all the glib filename functions rely on null-terminated strings. * Doing it here ensures that teco_file_expand_path() can be safely called diff --git a/src/parser.h b/src/parser.h index 09ec483..ae2cb9b 100644 --- a/src/parser.h +++ b/src/parser.h @@ -101,11 +101,11 @@ typedef const struct { } teco_state_expectqreg_t; typedef gboolean (*teco_state_initial_cb_t)(teco_machine_t *ctx, GError **error); -typedef teco_state_t *(*teco_state_input_cb_t)(teco_machine_t *ctx, gchar chr, GError **error); +typedef teco_state_t *(*teco_state_input_cb_t)(teco_machine_t *ctx, gunichar chr, GError **error); typedef gboolean (*teco_state_refresh_cb_t)(teco_machine_t *ctx, GError **error); typedef gboolean (*teco_state_end_of_macro_cb_t)(teco_machine_t *ctx, GError **error); typedef gboolean (*teco_state_process_edit_cmd_cb_t)(teco_machine_t *ctx, teco_machine_t *parent_ctx, - gchar key, GError **error); + gunichar key, GError **error); typedef enum { TECO_FNMACRO_MASK_START = (1 << 0), @@ -225,7 +225,7 @@ struct teco_state_t { gboolean teco_state_end_of_macro(teco_machine_t *ctx, GError **error); /* in cmdline.c */ -gboolean teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error); +gboolean teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error); /** * @interface TECO_DEFINE_STATE @@ -254,7 +254,7 @@ gboolean teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent extern teco_state_t NAME /* in cmdline.c */ -gboolean teco_state_caseinsensitive_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error); +gboolean teco_state_caseinsensitive_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error); /** * @interface TECO_DEFINE_STATE_CASEINSENSITIVE @@ -308,7 +308,7 @@ teco_machine_reset(teco_machine_t *ctx, teco_state_t *initial) teco_undo_ptr(ctx->current) = initial; } -gboolean teco_machine_input(teco_machine_t *ctx, gchar chr, GError **error); +gboolean teco_machine_input(teco_machine_t *ctx, gunichar chr, GError **error); typedef enum { TECO_STRINGBUILDING_MODE_NORMAL = 0, @@ -336,7 +336,7 @@ typedef struct teco_machine_stringbuilding_t { * If this is `[` or `{`, it is assumed that `]` and `}` must * be escaped as well by teco_machine_stringbuilding_escape(). */ - gchar escape_char; + gunichar escape_char; /** * Q-Register table for local registers. @@ -366,7 +366,7 @@ typedef struct teco_machine_stringbuilding_t { guint codepage; } teco_machine_stringbuilding_t; -void teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escape_char, +void teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gunichar escape_char, teco_qreg_table_t *locals, gboolean must_undo); void teco_machine_stringbuilding_reset(teco_machine_stringbuilding_t *ctx); @@ -381,7 +381,7 @@ void teco_machine_stringbuilding_reset(teco_machine_stringbuilding_t *ctx); * @return FALSE in case of error. */ static inline gboolean -teco_machine_stringbuilding_input(teco_machine_stringbuilding_t *ctx, gchar chr, +teco_machine_stringbuilding_input(teco_machine_stringbuilding_t *ctx, gunichar chr, teco_string_t *result, GError **error) { ctx->result = result; @@ -497,7 +497,7 @@ void teco_machine_main_init(teco_machine_main_t *ctx, gboolean teco_machine_main_eval_colon(teco_machine_main_t *ctx); gboolean teco_machine_main_step(teco_machine_main_t *ctx, - const gchar *macro, gint stop_pos, GError **error); + const gchar *macro, gsize stop_pos, GError **error); gboolean teco_execute_macro(const gchar *macro, gsize macro_len, teco_qreg_table_t *qreg_table_locals, GError **error); @@ -516,18 +516,18 @@ typedef const struct { */ teco_state_t *teco_machine_main_transition_input(teco_machine_main_t *ctx, teco_machine_main_transition_t *transitions, - guint len, gchar chr, GError **error); + guint len, gunichar chr, GError **error); void teco_machine_main_clear(teco_machine_main_t *ctx); G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(teco_machine_main_t, teco_machine_main_clear); gboolean teco_state_expectstring_initial(teco_machine_main_t *ctx, GError **error); -teco_state_t *teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **error); +teco_state_t *teco_state_expectstring_input(teco_machine_main_t *ctx, gunichar chr, GError **error); gboolean teco_state_expectstring_refresh(teco_machine_main_t *ctx, GError **error); /* in cmdline.c */ -gboolean teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error); +gboolean teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error); /** * @interface TECO_DEFINE_STATE_EXPECTSTRING @@ -543,7 +543,7 @@ gboolean teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco */ #define TECO_DEFINE_STATE_EXPECTSTRING(NAME, ...) \ static teco_state_t * \ - NAME##_input(teco_machine_main_t *ctx, gchar chr, GError **error) \ + NAME##_input(teco_machine_main_t *ctx, gunichar chr, GError **error) \ { \ return teco_state_expectstring_input(ctx, chr, error); \ } \ @@ -564,7 +564,7 @@ gboolean teco_state_expectfile_process(teco_machine_main_t *ctx, const teco_stri gsize new_chars, GError **error); /* in cmdline.c */ -gboolean teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error); +gboolean teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error); /** * @interface TECO_DEFINE_STATE_EXPECTFILE @@ -580,7 +580,7 @@ gboolean teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_m ) /* in cmdline.c */ -gboolean teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error); +gboolean teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error); /** * @interface TECO_DEFINE_STATE_EXPECTDIR diff --git a/src/qreg-commands.c b/src/qreg-commands.c index f248ced..8d28e7d 100644 --- a/src/qreg-commands.c +++ b/src/qreg-commands.c @@ -50,7 +50,7 @@ teco_state_expectqreg_initial(teco_machine_main_t *ctx, GError **error) } teco_state_t * -teco_state_expectqreg_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_expectqreg_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { teco_state_t *current = ctx->parent.current; @@ -680,6 +680,10 @@ teco_state_macro_got_register(teco_machine_main_t *ctx, teco_qreg_t *qreg, * Note that the string of <q> will be copied upon macro execution, * so subsequent changes to Q-Register <q> from inside the macro do * not modify the executed code. + * + * While \fBM\fP does not check the register's configured encoding + * (as reported by \fBEE\fP), its contents must be and are checked to be in + * valid UTF-8. */ TECO_DEFINE_STATE_EXPECTQREG(teco_state_macro); @@ -714,6 +718,9 @@ teco_state_macrofile_done(teco_machine_main_t *ctx, const teco_string_t *str, GE * It is otherwise similar to the \(lqM\(rq command. * * If <file> could not be read, the command yields an error. + * + * As all \*(ST code, the contents of <file> must be in valid UTF-8 + * even if operating in the \(lqdefault ANSI\(rq mode as configured by \fBED\fP. */ TECO_DEFINE_STATE_EXPECTFILE(teco_state_macrofile); diff --git a/src/qreg-commands.h b/src/qreg-commands.h index b190e9f..27a6a5c 100644 --- a/src/qreg-commands.h +++ b/src/qreg-commands.h @@ -33,10 +33,10 @@ teco_state_expectqreg_reset(teco_machine_main_t *ctx) gboolean teco_state_expectqreg_initial(teco_machine_main_t *ctx, GError **error); -teco_state_t *teco_state_expectqreg_input(teco_machine_main_t *ctx, gchar chr, GError **error); +teco_state_t *teco_state_expectqreg_input(teco_machine_main_t *ctx, gunichar chr, GError **error); /* in cmdline.c */ -gboolean teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error); +gboolean teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error); /** * @interface TECO_DEFINE_STATE_EXPECTQREG @@ -47,7 +47,7 @@ gboolean teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_m */ #define TECO_DEFINE_STATE_EXPECTQREG(NAME, ...) \ static teco_state_t * \ - NAME##_input(teco_machine_main_t *ctx, gchar chr, GError **error) \ + NAME##_input(teco_machine_main_t *ctx, gunichar chr, GError **error) \ { \ return teco_state_expectqreg_input(ctx, chr, error); \ } \ @@ -84,10 +84,9 @@ teco_qreg_execute(teco_qreg_t *qreg, teco_qreg_table_t *qreg_table_locals, GErro g_auto(teco_string_t) macro = {NULL, 0}; /* - * FIXME: Once we have an Unicode-aware parser, - * we should probably check the encoding of the register. - * On the other hand, we will have to validate the - * UTF-8 codepoints before execution anyway. + * SciTECO macros must be in UTF-8, but we don't check the encoding, + * so as not to complicate TECO_ED_DEFAULT_ANSI mode. + * The UTF-8 byte sequences are checked anyway. */ if (!qreg->vtable->get_string(qreg, ¯o.data, ¯o.len, NULL, error) || !teco_execute_macro(macro.data, macro.len, qreg_table_locals, error)) { @@ -1220,7 +1219,7 @@ TECO_DECLARE_STATE(teco_state_qregspec_secondchar); TECO_DECLARE_STATE(teco_state_qregspec_string); static teco_state_t *teco_state_qregspec_start_global_input(teco_machine_qregspec_t *ctx, - gchar chr, GError **error); + gunichar chr, GError **error); static teco_state_t * teco_state_qregspec_done(teco_machine_qregspec_t *ctx, GError **error) @@ -1255,7 +1254,7 @@ teco_state_qregspec_done(teco_machine_qregspec_t *ctx, GError **error) } static teco_state_t * -teco_state_qregspec_start_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error) +teco_state_qregspec_start_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error) { /* * FIXME: We're using teco_state_qregspec_start as a success condition, @@ -1272,7 +1271,7 @@ teco_state_qregspec_start_input(teco_machine_qregspec_t *ctx, gchar chr, GError } /* in cmdline.c */ -gboolean teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error); +gboolean teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error); TECO_DEFINE_STATE(teco_state_qregspec_start, .is_start = TRUE, @@ -1280,7 +1279,7 @@ TECO_DEFINE_STATE(teco_state_qregspec_start, ); static teco_state_t * -teco_state_qregspec_start_global_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error) +teco_state_qregspec_start_global_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error) { /* * FIXME: Disallow space characters? @@ -1299,8 +1298,7 @@ teco_state_qregspec_start_global_input(teco_machine_qregspec_t *ctx, gchar chr, if (!ctx->parse_only) { if (ctx->parent.must_undo) undo__teco_string_truncate(&ctx->name, ctx->name.len); - /* FIXME: g_unicode_toupper() once we have an Unicode-conforming parser */ - teco_string_append_c(&ctx->name, g_ascii_toupper(chr)); + teco_string_append_wc(&ctx->name, g_unichar_toupper(chr)); } return teco_state_qregspec_done(ctx, error); } @@ -1316,7 +1314,7 @@ TECO_DEFINE_STATE(teco_state_qregspec_start_global, ); static teco_state_t * -teco_state_qregspec_firstchar_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error) +teco_state_qregspec_firstchar_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error) { /* * FIXME: Disallow space characters? @@ -1324,8 +1322,7 @@ teco_state_qregspec_firstchar_input(teco_machine_qregspec_t *ctx, gchar chr, GEr if (!ctx->parse_only) { if (ctx->parent.must_undo) undo__teco_string_truncate(&ctx->name, ctx->name.len); - /* FIXME: g_unicode_toupper() once we have an Unicode-conforming parser */ - teco_string_append_c(&ctx->name, g_ascii_toupper(chr)); + teco_string_append_wc(&ctx->name, g_unichar_toupper(chr)); } return &teco_state_qregspec_secondchar; } @@ -1335,7 +1332,7 @@ TECO_DEFINE_STATE(teco_state_qregspec_firstchar, ); static teco_state_t * -teco_state_qregspec_secondchar_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error) +teco_state_qregspec_secondchar_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error) { /* * FIXME: Disallow space characters? @@ -1343,8 +1340,7 @@ teco_state_qregspec_secondchar_input(teco_machine_qregspec_t *ctx, gchar chr, GE if (!ctx->parse_only) { if (ctx->parent.must_undo) undo__teco_string_truncate(&ctx->name, ctx->name.len); - /* FIXME: g_unicode_toupper() once we have an Unicode-conforming parser */ - teco_string_append_c(&ctx->name, g_ascii_toupper(chr)); + teco_string_append_wc(&ctx->name, g_unichar_toupper(chr)); } return teco_state_qregspec_done(ctx, error); } @@ -1354,7 +1350,7 @@ TECO_DEFINE_STATE(teco_state_qregspec_secondchar, ); static teco_state_t * -teco_state_qregspec_string_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error) +teco_state_qregspec_string_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error) { /* * Makes sure that braces within string building constructs do not have to be @@ -1395,7 +1391,7 @@ teco_state_qregspec_string_input(teco_machine_qregspec_t *ctx, gchar chr, GError /* in cmdline.c */ gboolean teco_state_qregspec_string_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, - gchar key, GError **error); + gunichar key, GError **error); TECO_DEFINE_STATE(teco_state_qregspec_string, .process_edit_cmd_cb = (teco_state_process_edit_cmd_cb_t)teco_state_qregspec_string_process_edit_cmd @@ -1456,7 +1452,7 @@ teco_machine_qregspec_get_stringbuilding(teco_machine_qregspec_t *ctx) * @memberof teco_machine_qregspec_t */ teco_machine_qregspec_status_t -teco_machine_qregspec_input(teco_machine_qregspec_t *ctx, gchar chr, +teco_machine_qregspec_input(teco_machine_qregspec_t *ctx, gunichar chr, teco_qreg_t **result, teco_qreg_table_t **result_table, GError **error) { ctx->parse_only = result == NULL; @@ -1484,7 +1480,7 @@ teco_machine_qregspec_get_results(teco_machine_qregspec_t *ctx, gboolean teco_machine_qregspec_auto_complete(teco_machine_qregspec_t *ctx, teco_string_t *insert) { - gsize restrict_len = 0; + guint restrict_len = 0; /* * NOTE: We could have separate process_edit_cmd_cb() for @@ -1499,6 +1495,10 @@ teco_machine_qregspec_auto_complete(teco_machine_qregspec_t *ctx, teco_string_t /* two-letter Q-Reg */ restrict_len = 2; + /* + * FIXME: This is not quite right as it will propose even + * lower case single or two-letter Q-Register names. + */ return teco_rb3str_auto_complete(&ctx->result_table->tree, !restrict_len, ctx->name.data, ctx->name.len, restrict_len, insert) && ctx->nesting == 1; @@ -227,7 +227,7 @@ void teco_machine_qregspec_reset(teco_machine_qregspec_t *ctx); */ struct teco_machine_stringbuilding_t *teco_machine_qregspec_get_stringbuilding(teco_machine_qregspec_t *ctx); -teco_machine_qregspec_status_t teco_machine_qregspec_input(teco_machine_qregspec_t *ctx, gchar chr, +teco_machine_qregspec_status_t teco_machine_qregspec_input(teco_machine_qregspec_t *ctx, gunichar chr, teco_qreg_t **result, teco_qreg_table_t **result_table, GError **error); diff --git a/src/rb3str.c b/src/rb3str.c index 72cf444..d51ac5d 100644 --- a/src/rb3str.c +++ b/src/rb3str.c @@ -95,7 +95,7 @@ teco_rb3str_nfind(teco_rb3str_tree_t *tree, gboolean case_sensitive, const gchar * @param case_sensitive Whether to match case-sensitive. * @param str String to complete (not necessarily null-terminated). * @param str_len Length of characters in `str`. - * @param restrict_len Limit completions to this size. + * @param restrict_len Limit completions to this size (in characters). * @param insert String to set with characters that can be autocompleted. * @return TRUE if the completion was unambiguous, else FALSE. * @@ -103,7 +103,7 @@ teco_rb3str_nfind(teco_rb3str_tree_t *tree, gboolean case_sensitive, const gchar */ gboolean teco_rb3str_auto_complete(teco_rb3str_tree_t *tree, gboolean case_sensitive, - const gchar *str, gsize str_len, gsize restrict_len, teco_string_t *insert) + const gchar *str, gsize str_len, guint restrict_len, teco_string_t *insert) { memset(insert, 0, sizeof(*insert)); @@ -115,7 +115,7 @@ teco_rb3str_auto_complete(teco_rb3str_tree_t *tree, gboolean case_sensitive, for (teco_rb3str_head_t *cur = teco_rb3str_nfind(tree, case_sensitive, str, str_len); cur && cur->key.len >= str_len && diff(&cur->key, str, str_len) == str_len; cur = teco_rb3str_get_next(cur)) { - if (restrict_len && cur->key.len != restrict_len) + if (restrict_len && g_utf8_strlen(cur->key.data, cur->key.len) != restrict_len) continue; if (G_UNLIKELY(!first)) { @@ -136,7 +136,7 @@ teco_rb3str_auto_complete(teco_rb3str_tree_t *tree, gboolean case_sensitive, for (teco_rb3str_head_t *cur = first; cur && cur->key.len >= str_len && diff(&cur->key, str, str_len) == str_len; cur = teco_rb3str_get_next(cur)) { - if (restrict_len && cur->key.len != restrict_len) + if (restrict_len && g_utf8_strlen(cur->key.data, cur->key.len) != restrict_len) continue; teco_interface_popup_add(TECO_POPUP_PLAIN, diff --git a/src/rb3str.h b/src/rb3str.h index 74b3a37..adf5f89 100644 --- a/src/rb3str.h +++ b/src/rb3str.h @@ -65,5 +65,5 @@ teco_rb3str_head_t *teco_rb3str_nfind(teco_rb3str_tree_t *tree, gboolean case_se const gchar *str, gsize len); gboolean teco_rb3str_auto_complete(teco_rb3str_tree_t *tree, gboolean case_sensitive, - const gchar *str, gsize str_len, gsize restrict_len, + const gchar *str, gsize str_len, guint restrict_len, teco_string_t *insert); diff --git a/src/sciteco.h b/src/sciteco.h index 09dea3b..02eed97 100644 --- a/src/sciteco.h +++ b/src/sciteco.h @@ -71,7 +71,7 @@ teco_is_failure(teco_bool_t x) #endif /** TRUE if C is a control character */ -#define TECO_IS_CTL(C) ((guchar)(C) < ' ') +#define TECO_IS_CTL(C) ((gunichar)(C) < ' ') /** ASCII character to echo control character C */ #define TECO_CTL_ECHO(C) ((C) | 0x40) /** diff --git a/src/search.c b/src/search.c index e146def..43a2936 100644 --- a/src/search.c +++ b/src/search.c @@ -308,14 +308,6 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error do { /* - * FIXME: Currently we are fed single bytes, so there - * could be an incomplete UTF-8 sequence at the end of the pattern. - * This should not be necessary once we have an Unicode-aware parser. - */ - if (pattern->len > 0 && (gint32)g_utf8_get_char_validated(pattern->data, -1) < 0) - break; - - /* * First check whether it is a class. * This will not treat individual characters * as classes, so we do not convert them to regexp diff --git a/src/spawn.c b/src/spawn.c index 044b8de..445acc5 100644 --- a/src/spawn.c +++ b/src/spawn.c @@ -417,7 +417,7 @@ cleanup: } /* in cmdline.c */ -gboolean teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error); +gboolean teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error); /*$ EC pipe filter * ECcommand$ -- Execute operating system command and filter buffer contents @@ -642,7 +642,7 @@ teco_spawn_stdin_watch_cb(GIOChannel *chan, GIOCondition condition, gpointer dat gssize bytes_written = teco_eol_writer_convert(&teco_spawn_ctx.stdin_writer, buffer, convert_len, &teco_spawn_ctx.error); if (bytes_written < 0) { - /* GError ocurred */ + /* GError occurred */ g_main_loop_quit(teco_spawn_ctx.mainloop); return G_SOURCE_REMOVE; } diff --git a/src/string-utils.c b/src/string-utils.c index ac5835b..d9b12e0 100644 --- a/src/string-utils.c +++ b/src/string-utils.c @@ -78,7 +78,17 @@ teco_string_get_coord(const gchar *str, guint pos, guint *line, guint *column) } } -/** @memberof teco_string_t */ +/** + * Get the length of the prefix common to two strings. + * Works with UTF-8 and single-byte encodings. + * + * @param a Left string. + * @param b Right string. + * @param b_len Length of right string. + * @return Length of the common prefix in bytes. + * + * @memberof teco_string_t + */ gsize teco_string_diff(const teco_string_t *a, const gchar *b, gsize b_len) { @@ -92,14 +102,16 @@ teco_string_diff(const teco_string_t *a, const gchar *b, gsize b_len) } /** - * Get the length of the prefix common to two strings + * Get the length of the prefix common to two UTF-8 strings * without considering case. * - * @fixme This is currently only used for symbols and one/two letter - * Q-Register names, which cannot be UTF-8. - * If we rewrote this to perform Unicode case folding, we would - * also have to check for character validity. - * Once our parser is Unicode-aware, this is not necessary. + * The UTF-8 strings must be validated, which should be the case + * for help labels and short Q-Register names. + * + * @param a Left UTF-8 string. + * @param b Right UTF-8 string. + * @param b_len Length of right UTF-8 string. + * @return Length of the common prefix in bytes. * * @memberof teco_string_t */ @@ -108,9 +120,13 @@ teco_string_casediff(const teco_string_t *a, const gchar *b, gsize b_len) { gsize len = 0; - while (len < a->len && len < b_len && - g_ascii_tolower(a->data[len]) == g_ascii_tolower(b[len])) - len++; + while (len < a->len && len < b_len) { + gunichar a_chr = g_utf8_get_char(a->data+len); + gunichar b_chr = g_utf8_get_char(b+len); + if (g_unichar_tolower(a_chr) != g_unichar_tolower(b_chr)) + break; + len = g_utf8_next_char(b+len) - b; + } return len; } diff --git a/src/string-utils.h b/src/string-utils.h index bb9ed37..1b4957f 100644 --- a/src/string-utils.h +++ b/src/string-utils.h @@ -26,11 +26,11 @@ /** * Upper-case SciTECO command character. * - * There are implementations in glib (g_ascii_toupper) and libc, + * There are implementations in glib (g_ascii_toupper() and g_unichar_toupper()) and libc, * but this implementation is sufficient for all letters used by SciTECO commands. */ -static inline gchar -teco_ascii_toupper(gchar chr) +static inline gunichar +teco_ascii_toupper(gunichar chr) { return chr >= 'a' && chr <= 'z' ? chr & ~0x20 : chr; } @@ -52,6 +52,7 @@ teco_strv_remove(gchar **strv, guint i) * and the allocation length is not stored. * Just like GString, teco_string_t are always null-terminated but at the * same time 8-bit clean (can contain null-characters). + * It may or may not contain UTF-8 byte sequences. * * The API is designed such that teco_string_t operations operate on plain * (null-terminated) C strings, a single character or character array as well as @@ -74,7 +75,7 @@ typedef struct { * The pointer is guaranteed to be non-NULL after initialization. */ gchar *data; - /** Length of `data` without the trailing null-byte. */ + /** Length of `data` without the trailing null-byte in bytes. */ gsize len; } teco_string_t; @@ -128,6 +129,16 @@ teco_string_append_c(teco_string_t *str, gchar chr) teco_string_append(str, &chr, sizeof(chr)); } +/** @memberof teco_string_t */ +static inline void +teco_string_append_wc(teco_string_t *target, gunichar chr) +{ + /* 4 bytes should be enough, but we better follow the documentation */ + target->data = g_realloc(target->data, target->len + 6 + 1); + target->len += g_unichar_to_utf8(chr, target->data+target->len); + target->data[target->len] = '\0'; +} + /** * @fixme Should this also realloc str->data? * diff --git a/src/symbols.c b/src/symbols.c index ba407cc..feead76 100644 --- a/src/symbols.c +++ b/src/symbols.c @@ -251,7 +251,7 @@ teco_state_scintilla_symbols_done(teco_machine_main_t *ctx, const teco_string_t } /* in cmdline.c */ -gboolean teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error); +gboolean teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error); /*$ ES scintilla message * -- Send Scintilla message @@ -30,7 +30,7 @@ //#define DEBUG -TECO_DEFINE_UNDO_SCALAR(gchar); +TECO_DEFINE_UNDO_SCALAR(gunichar); TECO_DEFINE_UNDO_SCALAR(gint); TECO_DEFINE_UNDO_SCALAR(guint); TECO_DEFINE_UNDO_SCALAR(gsize); @@ -164,8 +164,8 @@ gpointer teco_undo_push_size(teco_undo_action_t action_cb, gsize size) * significantly improves batch-mode performance. */ -TECO_DECLARE_UNDO_SCALAR(gchar); -#define teco_undo_gchar(VAR) (*teco_undo_object_gchar_push(&(VAR))) +TECO_DECLARE_UNDO_SCALAR(gunichar); +#define teco_undo_gunichar(VAR) (*teco_undo_object_gunichar_push(&(VAR))) TECO_DECLARE_UNDO_SCALAR(gint); #define teco_undo_gint(VAR) (*teco_undo_object_gint_push(&(VAR))) diff --git a/tests/testsuite.at b/tests/testsuite.at index 4749b13..0733d2a 100644 --- a/tests/testsuite.at +++ b/tests/testsuite.at @@ -84,8 +84,6 @@ AT_CHECK([$SCITECO -e "0@I//J 0A\"N(0/0)' :@S/^@/\"F(0/0)'"], 0, ignore, ignore) AT_CHECK([$SCITECO -e "@EQa//0EE 1U*0EE 0:@EUa/f^@^@/ :Qa-4\"N(0/0)' Ga Z-4\"N(0/0)'"], 0, ignore, ignore) AT_CHECK([$SCITECO -e "0EE 129@I// -A-129\"N(0/0)' HXa @EQa// EE\"N(0/0)'"], 0, ignore, ignore) AT_CHECK([$SCITECO -8e "129@:^Ua// 0Qa-129\"N(0/0)'"], 0, ignore, ignore) -# FIXME: This will fail once we have an UTF-8-only parser. -AT_CHECK([$SCITECO -8e "@:^Ua/^^/ 129:@^Ua// Ma-129\"N(0/0)'"], 0, ignore, ignore) AT_CHECK([$SCITECO -e "1EE 167Ua @I/^EUa/ .-1\"N(0/0)'"], 0, ignore, ignore) AT_CLEANUP @@ -95,6 +93,8 @@ AT_CHECK([$SCITECO -e "8594@^Ua/Здравствуй, мир!/ :Qa-17\"N(0/0)' 0 AT_CHECK([$SCITECO -e "@I/Здравствуй, мир!/ JW .-10\"N(0/0)' ^E-20\"N(0/0)' 204:EE .-10\"N(0/0)'"], 0, ignore, ignore) AT_CHECK([$SCITECO -e "@I/TEST/ @EW/юникод.txt/"], 0, ignore, ignore) AT_CHECK([test -f юникод.txt], 0, ignore, ignore) +AT_CHECK([$SCITECO -e "^^ß-223\"N(0/0) 23Uъ Q[Ъ]-23\"N(0/0)'"], 0, ignore, ignore) +AT_CHECK([$SCITECO -e "@O/метка/ !метка!"], 0, ignore, ignore) AT_CLEANUP AT_SETUP([Automatic EOL normalization]) @@ -207,8 +207,7 @@ AT_CLEANUP AT_SETUP([Unicode glitches]) # While TECO code must always be UTF-8, strings after string building # can be in single-byte encodings as well. -# This might already work after introducing the Unicode-aware parser. -# If not, it should be fixed. +# It must be possible to search for single bytes in single-byte encodings. AT_CHECK([$SCITECO -8e "164Ua Ga@I//J :@S/^EUa/\"F(0/0)'"], 0, ignore, ignore) AT_XFAIL_IF(true) AT_CLEANUP |