diff options
Diffstat (limited to 'src/parser.c')
-rw-r--r-- | src/parser.c | 341 |
1 file changed, 255 insertions, 86 deletions
diff --git a/src/parser.c b/src/parser.c index 910fc7f..b1aa06e 100644 --- a/src/parser.c +++ b/src/parser.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2023 Robin Haberkorn + * Copyright (C) 2012-2024 Robin Haberkorn * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -52,16 +52,14 @@ teco_loop_stack_init(void) TECO_DEFINE_ARRAY_UNDO_INSERT_VAL(teco_loop_stack, teco_loop_context_t); TECO_DEFINE_ARRAY_UNDO_REMOVE_INDEX(teco_loop_stack); -#ifndef NDEBUG -static void __attribute__((destructor)) +static void TECO_DEBUG_CLEANUP teco_loop_stack_cleanup(void) { g_array_free(teco_loop_stack, TRUE); } -#endif gboolean -teco_machine_input(teco_machine_t *ctx, gchar chr, GError **error) +teco_machine_input(teco_machine_t *ctx, gunichar chr, GError **error) { teco_state_t *next = ctx->current->input_cb(ctx, chr, error); if (!next) @@ -88,18 +86,22 @@ teco_state_end_of_macro(teco_machine_t *ctx, GError **error) } /** + * Execute macro from current PC to stop position. + * * Handles all expected exceptions and preparing them for stack frame insertion. + * + * @param ctx State machine. + * @param macro The macro to execute. + * It does not have to be complete. + * It must consist only of validated UTF-8 sequences, though. + * @param stop_pos Where to stop execution in bytes. + * @param error Location to store error. + * @return FALSE if an error occurred. 
*/ gboolean -teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gint stop_pos, GError **error) +teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gsize stop_pos, GError **error) { while (ctx->macro_pc < stop_pos) { -#ifdef DEBUG - g_printf("EXEC(%d): input='%c'/%x, state=%p, mode=%d\n", - ctx->macro_pc, macro[ctx->macro_pc], macro[ctx->macro_pc], - ctx->parent.current, ctx->mode); -#endif - if (G_UNLIKELY(teco_interface_is_interrupted())) { teco_error_interrupted_set(error); goto error_attach; @@ -112,9 +114,18 @@ teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gint stop_p if (!teco_memory_check(0, error)) goto error_attach; - if (!teco_machine_input(&ctx->parent, macro[ctx->macro_pc], error)) + /* UTF-8 sequences are already validated */ + gunichar chr = g_utf8_get_char(macro+ctx->macro_pc); + +#ifdef DEBUG + g_printf("EXEC(%d): input='%C' (U+%04" G_GINT32_MODIFIER "X), state=%p, mode=%d\n", + ctx->macro_pc, chr, chr, ctx->parent.current, ctx->mode); +#endif + + if (!teco_machine_input(&ctx->parent, chr, error)) goto error_attach; - ctx->macro_pc++; + + ctx->macro_pc = g_utf8_next_char(macro+ctx->macro_pc) - macro; } /* @@ -146,6 +157,14 @@ gboolean teco_execute_macro(const gchar *macro, gsize macro_len, teco_qreg_table_t *qreg_table_locals, GError **error) { + const teco_string_t str = {(gchar *)macro, macro_len}; + + if (!teco_string_validate_utf8(&str)) { + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_CODEPOINT, + "Invalid UTF-8 byte sequence in macro"); + return FALSE; + } + /* * This is not auto-cleaned up, so it can be initialized * on demand. 
@@ -311,26 +330,26 @@ teco_machine_main_eval_colon(teco_machine_main_t *ctx) teco_state_t * teco_machine_main_transition_input(teco_machine_main_t *ctx, teco_machine_main_transition_t *transitions, - guint len, gchar chr, GError **error) + guint len, gunichar chr, GError **error) { - if (chr < 0 || chr >= len || !transitions[(guint)chr].next) { + if (chr >= len || !transitions[chr].next) { teco_error_syntax_set(error, chr); return NULL; } - if (ctx->mode == TECO_MODE_NORMAL && transitions[(guint)chr].transition_cb) { + if (ctx->mode == TECO_MODE_NORMAL && transitions[chr].transition_cb) { /* * NOTE: We could also just let transition_cb return a boolean... */ GError *tmp_error = NULL; - transitions[(guint)chr].transition_cb(ctx, &tmp_error); + transitions[chr].transition_cb(ctx, &tmp_error); if (tmp_error) { g_propagate_error(error, tmp_error); return NULL; } } - return transitions[(guint)chr].next; + return transitions[chr].next; } void @@ -340,15 +359,40 @@ teco_machine_main_clear(teco_machine_main_t *ctx) teco_machine_stringbuilding_clear(&ctx->expectstring.machine); } +/** Append string to result with case folding. */ +static void +teco_machine_stringbuilding_append(teco_machine_stringbuilding_t *ctx, const gchar *str, gsize len) +{ + g_assert(ctx->result != NULL); + + switch (ctx->mode) { + case TECO_STRINGBUILDING_MODE_NORMAL: + teco_string_append(ctx->result, str, len); + break; + case TECO_STRINGBUILDING_MODE_UPPER: { + g_autofree gchar *folded = ctx->codepage == SC_CP_UTF8 + ? g_utf8_strup(str, len) : g_ascii_strup(str, len); + teco_string_append(ctx->result, folded, strlen(folded)); + break; + } + case TECO_STRINGBUILDING_MODE_LOWER: { + g_autofree gchar *folded = ctx->codepage == SC_CP_UTF8 + ? g_utf8_strdown(str, len) : g_ascii_strdown(str, len); + teco_string_append(ctx->result, folded, strlen(folded)); + break; + } + } +} + /* * FIXME: All teco_state_stringbuilding_* states could be static? 
*/ static teco_state_t *teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, - gchar chr, GError **error); + gunichar chr, GError **error); TECO_DECLARE_STATE(teco_state_stringbuilding_ctl); static teco_state_t *teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, - gchar chr, GError **error); + gunichar chr, GError **error); TECO_DECLARE_STATE(teco_state_stringbuilding_escaped); TECO_DECLARE_STATE(teco_state_stringbuilding_lower); @@ -362,19 +406,29 @@ TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_quote); TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_n); static teco_state_t * -teco_state_stringbuilding_start_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_start_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { - if (chr == '^') + switch (chr) { + case '^': return &teco_state_stringbuilding_ctl; - if (TECO_IS_CTL(chr)) - return teco_state_stringbuilding_ctl_input(ctx, TECO_CTL_ECHO(chr), error); + case TECO_CTL_KEY('^'): + /* + * Ctrl+^ is inserted verbatim as code 30. + * Otherwise it would expand to a single caret + * just like caret+caret (^^). 
+ */ + break; + default: + if (TECO_IS_CTL(chr)) + return teco_state_stringbuilding_ctl_input(ctx, TECO_CTL_ECHO(chr), error); + } return teco_state_stringbuilding_escaped_input(ctx, chr, error); } /* in cmdline.c */ gboolean teco_state_stringbuilding_start_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx, - gchar key, GError **error); + gunichar key, GError **error); TECO_DEFINE_STATE(teco_state_stringbuilding_start, .is_start = TRUE, @@ -383,12 +437,19 @@ TECO_DEFINE_STATE(teco_state_stringbuilding_start, ); static teco_state_t * -teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { chr = teco_ascii_toupper(chr); switch (chr) { - case '^': break; + case '^': + /* + * Double-caret expands to a single caret. + * Ctrl+^ (30) is handled separately and inserts code 30. + * The special handling of the double-caret should perhaps + * be abolished altogether. + */ + break; case 'Q': case 'R': return &teco_state_stringbuilding_escaped; case 'V': return &teco_state_stringbuilding_lower; @@ -398,85 +459,139 @@ teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gchar ch chr = TECO_CTL_KEY(chr); } + /* + * Source code is always in UTF-8, so it does not + * make sense to handle ctx->codepage != SC_CP_UTF8 + * separately. 
+ */ if (ctx->result) - teco_string_append_c(ctx->result, chr); + teco_string_append_wc(ctx->result, chr); return &teco_state_stringbuilding_start; } TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_stringbuilding_ctl); static teco_state_t * -teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { if (!ctx->result) /* parse-only mode */ return &teco_state_stringbuilding_start; + /* + * The subtle difference between UTF-8 and single-byte targets + * is that we don't try to casefold non-ANSI characters in single-byte mode. + */ switch (ctx->mode) { + case TECO_STRINGBUILDING_MODE_NORMAL: + break; case TECO_STRINGBUILDING_MODE_UPPER: - chr = g_ascii_toupper(chr); + chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80 + ? g_unichar_toupper(chr) : chr; break; case TECO_STRINGBUILDING_MODE_LOWER: - chr = g_ascii_tolower(chr); - break; - default: + chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80 + ? 
g_unichar_tolower(chr) : chr; break; } - teco_string_append_c(ctx->result, chr); + teco_string_append_wc(ctx->result, chr); return &teco_state_stringbuilding_start; } -TECO_DEFINE_STATE(teco_state_stringbuilding_escaped); +/* in cmdline.c */ +gboolean teco_state_stringbuilding_escaped_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx, + gunichar key, GError **error); + +TECO_DEFINE_STATE(teco_state_stringbuilding_escaped, + .process_edit_cmd_cb = (teco_state_process_edit_cmd_cb_t) + teco_state_stringbuilding_escaped_process_edit_cmd +); static teco_state_t * -teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_lower_ctl_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { if (!ctx->result) /* parse-only mode */ return &teco_state_stringbuilding_start; - /* - * FIXME: This does not handle ^V^V typed with up-carets. - */ - if (chr == TECO_CTL_KEY('V')) { + chr = teco_ascii_toupper(chr); + + if (chr == 'V') { if (ctx->parent.must_undo) teco_undo_guint(ctx->mode); ctx->mode = TECO_STRINGBUILDING_MODE_LOWER; } else { - teco_string_append_c(ctx->result, g_ascii_tolower(chr)); + /* control keys cannot be case folded */ + teco_string_append_wc(ctx->result, TECO_CTL_KEY(chr)); } return &teco_state_stringbuilding_start; } +TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_stringbuilding_lower_ctl); + +static teco_state_t * +teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) +{ + if (chr == '^') + return &teco_state_stringbuilding_lower_ctl; + if (TECO_IS_CTL(chr)) + return teco_state_stringbuilding_lower_ctl_input(ctx, TECO_CTL_ECHO(chr), error); + + if (ctx->result) { + chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80 + ? 
g_unichar_tolower(chr) : chr; + teco_string_append_wc(ctx->result, chr); + } + return &teco_state_stringbuilding_start; +} + TECO_DEFINE_STATE(teco_state_stringbuilding_lower); static teco_state_t * -teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_upper_ctl_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { if (!ctx->result) /* parse-only mode */ return &teco_state_stringbuilding_start; - /* - * FIXME: This does not handle ^W^W typed with up-carets. - */ - if (chr == TECO_CTL_KEY('W')) { + chr = teco_ascii_toupper(chr); + + if (chr == 'W') { if (ctx->parent.must_undo) teco_undo_guint(ctx->mode); ctx->mode = TECO_STRINGBUILDING_MODE_UPPER; } else { - teco_string_append_c(ctx->result, g_ascii_toupper(chr)); + /* control keys cannot be case folded */ + teco_string_append_wc(ctx->result, TECO_CTL_KEY(chr)); } return &teco_state_stringbuilding_start; } +TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_stringbuilding_upper_ctl); + +static teco_state_t * +teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) +{ + if (chr == '^') + return &teco_state_stringbuilding_upper_ctl; + if (TECO_IS_CTL(chr)) + return teco_state_stringbuilding_upper_ctl_input(ctx, TECO_CTL_ECHO(chr), error); + + if (ctx->result) { + chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80 + ? 
g_unichar_toupper(chr) : chr; + teco_string_append_wc(ctx->result, chr); + } + return &teco_state_stringbuilding_start; +} + TECO_DEFINE_STATE(teco_state_stringbuilding_upper); static teco_state_t * -teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { teco_state_t *next; @@ -488,8 +603,10 @@ teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gchar c case 'N': next = &teco_state_stringbuilding_ctle_n; break; default: if (ctx->result) { - gchar buf[] = {TECO_CTL_KEY('E'), chr}; - teco_string_append(ctx->result, buf, sizeof(buf)); + /* also makes sure that search patterns can start with ^E */ + gchar buf[1+6] = {TECO_CTL_KEY('E')}; + gsize len = g_unichar_to_utf8(chr, buf+1); + teco_machine_stringbuilding_append(ctx, buf, 1+len); } return &teco_state_stringbuilding_start; } @@ -507,7 +624,7 @@ TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_stringbuilding_ctle); /* in cmdline.c */ gboolean teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx, - gchar chr, GError **error); + gunichar chr, GError **error); /** * @interface TECO_DEFINE_STATE_STRINGBUILDING_QREG @@ -522,7 +639,7 @@ gboolean teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuil ) static teco_state_t * -teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { teco_qreg_t *qreg; @@ -549,7 +666,7 @@ teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gch */ gchar buffer[TECO_EXPRESSIONS_FORMAT_LEN]; const gchar *num = teco_expressions_format(buffer, value); - teco_string_append(ctx->result, num, strlen(num)); + teco_machine_stringbuilding_append(ctx, num, strlen(num)); return 
&teco_state_stringbuilding_start; } @@ -557,7 +674,7 @@ teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gch TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_num); static teco_state_t * -teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { teco_qreg_t *qreg; @@ -578,21 +695,51 @@ teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar teco_int_t value; if (!qreg->vtable->get_integer(qreg, &value, error)) return NULL; - if (value < 0 || value > 0xFF) { - g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len); - g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED, - "Q-Register \"%s\" does not contain a valid character", name_printable); - return NULL; + + if (ctx->codepage == SC_CP_UTF8) { + if (value < 0 || !g_unichar_validate(value)) + goto error_codepoint; + switch (ctx->mode) { + case TECO_STRINGBUILDING_MODE_NORMAL: + break; + case TECO_STRINGBUILDING_MODE_UPPER: + value = g_unichar_toupper(value); + break; + case TECO_STRINGBUILDING_MODE_LOWER: + value = g_unichar_tolower(value); + break; + } + teco_string_append_wc(ctx->result, value); + } else { + if (value < 0 || value > 0xFF) + goto error_codepoint; + switch (ctx->mode) { + case TECO_STRINGBUILDING_MODE_NORMAL: + break; + case TECO_STRINGBUILDING_MODE_UPPER: + value = g_ascii_toupper(value); + break; + case TECO_STRINGBUILDING_MODE_LOWER: + value = g_ascii_tolower(value); + break; + } + teco_string_append_c(ctx->result, value); } - teco_string_append_c(ctx->result, (gchar)value); return &teco_state_stringbuilding_start; + +error_codepoint: { + g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len); + g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT, + "Q-Register \"%s\" does not contain a valid 
codepoint", name_printable); + return NULL; +} } TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_u); static teco_state_t * -teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { teco_qreg_t *qreg; @@ -610,20 +757,17 @@ teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gchar /* parse-only mode */ return &teco_state_stringbuilding_start; - /* - * FIXME: Should we have a special teco_qreg_get_string_append() function? - */ g_auto(teco_string_t) str = {NULL, 0}; - if (!qreg->vtable->get_string(qreg, &str.data, &str.len, error)) + if (!qreg->vtable->get_string(qreg, &str.data, &str.len, NULL, error)) return NULL; - teco_string_append(ctx->result, str.data, str.len); + teco_machine_stringbuilding_append(ctx, str.data, str.len); return &teco_state_stringbuilding_start; } TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_q); static teco_state_t * -teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { teco_qreg_t *qreg; teco_qreg_table_t *table; @@ -643,7 +787,7 @@ teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, g return &teco_state_stringbuilding_start; g_auto(teco_string_t) str = {NULL, 0}; - if (!qreg->vtable->get_string(qreg, &str.data, &str.len, error)) + if (!qreg->vtable->get_string(qreg, &str.data, &str.len, NULL, error)) return NULL; /* * NOTE: g_shell_quote() expects a null-terminated string, so it is @@ -658,7 +802,7 @@ teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, g return NULL; } g_autofree gchar *str_quoted = g_shell_quote(str.data ? 
: ""); - teco_string_append(ctx->result, str_quoted, strlen(str_quoted)); + teco_machine_stringbuilding_append(ctx, str_quoted, strlen(str_quoted)); return &teco_state_stringbuilding_start; } @@ -666,7 +810,7 @@ teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, g TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_quote); static teco_state_t * -teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error) +teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) { teco_qreg_t *qreg; teco_qreg_table_t *table; @@ -686,7 +830,7 @@ teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gchar return &teco_state_stringbuilding_start; g_auto(teco_string_t) str = {NULL, 0}; - if (!qreg->vtable->get_string(qreg, &str.data, &str.len, error)) + if (!qreg->vtable->get_string(qreg, &str.data, &str.len, NULL, error)) return NULL; if (teco_string_contains(&str, '\0')) { teco_error_qregcontainsnull_set(error, qreg->head.name.data, qreg->head.name.len, @@ -695,7 +839,7 @@ teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gchar } g_autofree gchar *str_escaped = teco_globber_escape_pattern(str.data); - teco_string_append(ctx->result, str_escaped, strlen(str_escaped)); + teco_machine_stringbuilding_append(ctx, str_escaped, strlen(str_escaped)); return &teco_state_stringbuilding_start; } @@ -703,13 +847,14 @@ teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gchar TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_n); void -teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escape_char, +teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gunichar escape_char, teco_qreg_table_t *locals, gboolean must_undo) { memset(ctx, 0, sizeof(*ctx)); teco_machine_init(&ctx->parent, &teco_state_stringbuilding_start, must_undo); 
ctx->escape_char = escape_char; ctx->qreg_table_locals = locals; + ctx->codepage = teco_default_codepage(); } void @@ -723,6 +868,10 @@ teco_machine_stringbuilding_reset(teco_machine_stringbuilding_t *ctx) ctx->mode = TECO_STRINGBUILDING_MODE_NORMAL; } +/* + * If we case folded only ANSI characters as in teco_ascii_toupper(), + * this could be simplified. + */ void teco_machine_stringbuilding_escape(teco_machine_stringbuilding_t *ctx, const gchar *str, gsize len, teco_string_t *target) @@ -730,12 +879,18 @@ teco_machine_stringbuilding_escape(teco_machine_stringbuilding_t *ctx, const gch target->data = g_malloc(len*2+1); target->len = 0; - for (guint i = 0; i < len; i++) { - if (teco_ascii_toupper(str[i]) == ctx->escape_char || - (ctx->escape_char == '[' && str[i] == ']') || - (ctx->escape_char == '{' && str[i] == '}')) + for (guint i = 0; i < len; ) { + gunichar chr = g_utf8_get_char(str+i); + + if (g_unichar_toupper(chr) == ctx->escape_char || + (ctx->escape_char == '[' && chr == ']') || + (ctx->escape_char == '{' && chr == '}')) target->data[target->len++] = TECO_CTL_KEY('Q'); - target->data[target->len++] = str[i]; + + gsize lenc = g_utf8_next_char(str+i) - (str+i); + memcpy(target->data+target->len, str+i, lenc); + target->len += lenc; + i += lenc; } target->data[target->len] = '\0'; @@ -748,8 +903,17 @@ teco_machine_stringbuilding_clear(teco_machine_stringbuilding_t *ctx) teco_machine_qregspec_free(ctx->machine_qregspec); } +gboolean +teco_state_expectstring_initial(teco_machine_main_t *ctx, GError **error) +{ + if (ctx->mode == TECO_MODE_NORMAL) + teco_machine_stringbuilding_set_codepage(&ctx->expectstring.machine, + teco_default_codepage()); + return TRUE; +} + teco_state_t * -teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **error) +teco_state_expectstring_input(teco_machine_main_t *ctx, gunichar chr, GError **error) { teco_state_t *current = ctx->parent.current; @@ -766,13 +930,18 @@ 
teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro /* * FIXME: Exclude setting at least whitespace characters as the * new string escape character to avoid accidental errors? + * + * FIXME: Should we perhaps restrict case folding escape characters + * to the ANSI range (teco_ascii_toupper())? + * This would be faster than case folding each and every character + * of a string argument to check against the escape char. */ switch (ctx->expectstring.machine.escape_char) { case '\e': case '{': if (ctx->parent.must_undo) - teco_undo_gchar(ctx->expectstring.machine.escape_char); - ctx->expectstring.machine.escape_char = teco_ascii_toupper(chr); + teco_undo_gunichar(ctx->expectstring.machine.escape_char); + ctx->expectstring.machine.escape_char = g_unichar_toupper(chr); return current; } } @@ -796,7 +965,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro ctx->expectstring.nesting--; break; } - } else if (teco_ascii_toupper(chr) == ctx->expectstring.machine.escape_char) { + } else if (g_unichar_toupper(chr) == ctx->expectstring.machine.escape_char) { if (ctx->parent.must_undo) teco_undo_gint(ctx->expectstring.nesting); ctx->expectstring.nesting--; @@ -826,7 +995,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro if (current->expectstring.last) { if (ctx->parent.must_undo) - teco_undo_gchar(ctx->expectstring.machine.escape_char); + teco_undo_gunichar(ctx->expectstring.machine.escape_char); ctx->expectstring.machine.escape_char = '\e'; } ctx->expectstring.nesting = 1; @@ -857,7 +1026,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro if (!teco_machine_stringbuilding_input(&ctx->expectstring.machine, chr, str, error)) return NULL; } else if (ctx->mode == TECO_MODE_NORMAL) { - teco_string_append_c(&ctx->expectstring.string, chr); + teco_string_append_wc(&ctx->expectstring.string, chr); } /* @@ -901,7 +1070,7 @@ 
teco_state_expectfile_process(teco_machine_main_t *ctx, const teco_string_t *str g_assert(str->data != NULL); /* - * Null-chars must not ocur in filename/path strings and at some point + * Null-chars must not occur in filename/path strings and at some point * teco_string_t has to be converted to a null-terminated C string * as all the glib filename functions rely on null-terminated strings. * Doing it here ensures that teco_file_expand_path() can be safely called |