aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/parser.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/parser.c')
-rw-r--r--src/parser.c341
1 files changed, 255 insertions, 86 deletions
diff --git a/src/parser.c b/src/parser.c
index 910fc7f..b1aa06e 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2012-2023 Robin Haberkorn
+ * Copyright (C) 2012-2024 Robin Haberkorn
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -52,16 +52,14 @@ teco_loop_stack_init(void)
TECO_DEFINE_ARRAY_UNDO_INSERT_VAL(teco_loop_stack, teco_loop_context_t);
TECO_DEFINE_ARRAY_UNDO_REMOVE_INDEX(teco_loop_stack);
-#ifndef NDEBUG
-static void __attribute__((destructor))
+static void TECO_DEBUG_CLEANUP
teco_loop_stack_cleanup(void)
{
g_array_free(teco_loop_stack, TRUE);
}
-#endif
gboolean
-teco_machine_input(teco_machine_t *ctx, gchar chr, GError **error)
+teco_machine_input(teco_machine_t *ctx, gunichar chr, GError **error)
{
teco_state_t *next = ctx->current->input_cb(ctx, chr, error);
if (!next)
@@ -88,18 +86,22 @@ teco_state_end_of_macro(teco_machine_t *ctx, GError **error)
}
/**
+ * Execute macro from current PC to stop position.
+ *
* Handles all expected exceptions and preparing them for stack frame insertion.
+ *
+ * @param ctx State machine.
+ * @param macro The macro to execute.
+ * It does not have to be complete.
+ * It must consist only of validated UTF-8 sequences, though.
+ * @param stop_pos Byte offset into macro at which execution stops.
+ * @param error Location to store error.
+ * @return FALSE if an error occurred.
*/
gboolean
-teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gint stop_pos, GError **error)
+teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gsize stop_pos, GError **error)
{
while (ctx->macro_pc < stop_pos) {
-#ifdef DEBUG
- g_printf("EXEC(%d): input='%c'/%x, state=%p, mode=%d\n",
- ctx->macro_pc, macro[ctx->macro_pc], macro[ctx->macro_pc],
- ctx->parent.current, ctx->mode);
-#endif
-
if (G_UNLIKELY(teco_interface_is_interrupted())) {
teco_error_interrupted_set(error);
goto error_attach;
@@ -112,9 +114,18 @@ teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gint stop_p
if (!teco_memory_check(0, error))
goto error_attach;
- if (!teco_machine_input(&ctx->parent, macro[ctx->macro_pc], error))
+ /* UTF-8 sequences are already validated */
+ gunichar chr = g_utf8_get_char(macro+ctx->macro_pc);
+
+#ifdef DEBUG
+ g_printf("EXEC(%d): input='%C' (U+%04" G_GINT32_MODIFIER "X), state=%p, mode=%d\n",
+ ctx->macro_pc, chr, chr, ctx->parent.current, ctx->mode);
+#endif
+
+ if (!teco_machine_input(&ctx->parent, chr, error))
goto error_attach;
- ctx->macro_pc++;
+
+ ctx->macro_pc = g_utf8_next_char(macro+ctx->macro_pc) - macro;
}
/*
@@ -146,6 +157,14 @@ gboolean
teco_execute_macro(const gchar *macro, gsize macro_len,
teco_qreg_table_t *qreg_table_locals, GError **error)
{
+ const teco_string_t str = {(gchar *)macro, macro_len};
+
+ if (!teco_string_validate_utf8(&str)) {
+ g_set_error_literal(error, TECO_ERROR, TECO_ERROR_CODEPOINT,
+ "Invalid UTF-8 byte sequence in macro");
+ return FALSE;
+ }
+
/*
* This is not auto-cleaned up, so it can be initialized
* on demand.
@@ -311,26 +330,26 @@ teco_machine_main_eval_colon(teco_machine_main_t *ctx)
teco_state_t *
teco_machine_main_transition_input(teco_machine_main_t *ctx,
teco_machine_main_transition_t *transitions,
- guint len, gchar chr, GError **error)
+ guint len, gunichar chr, GError **error)
{
- if (chr < 0 || chr >= len || !transitions[(guint)chr].next) {
+ if (chr >= len || !transitions[chr].next) {
teco_error_syntax_set(error, chr);
return NULL;
}
- if (ctx->mode == TECO_MODE_NORMAL && transitions[(guint)chr].transition_cb) {
+ if (ctx->mode == TECO_MODE_NORMAL && transitions[chr].transition_cb) {
/*
* NOTE: We could also just let transition_cb return a boolean...
*/
GError *tmp_error = NULL;
- transitions[(guint)chr].transition_cb(ctx, &tmp_error);
+ transitions[chr].transition_cb(ctx, &tmp_error);
if (tmp_error) {
g_propagate_error(error, tmp_error);
return NULL;
}
}
- return transitions[(guint)chr].next;
+ return transitions[chr].next;
}
void
@@ -340,15 +359,40 @@ teco_machine_main_clear(teco_machine_main_t *ctx)
teco_machine_stringbuilding_clear(&ctx->expectstring.machine);
}
+/** Append string to ctx->result, case folding it according to ctx->mode and ctx->codepage. */
+static void
+teco_machine_stringbuilding_append(teco_machine_stringbuilding_t *ctx, const gchar *str, gsize len)
+{
+ g_assert(ctx->result != NULL);
+
+ switch (ctx->mode) {
+ case TECO_STRINGBUILDING_MODE_NORMAL:
+ teco_string_append(ctx->result, str, len);
+ break;
+ case TECO_STRINGBUILDING_MODE_UPPER: {
+ g_autofree gchar *folded = ctx->codepage == SC_CP_UTF8
+ ? g_utf8_strup(str, len) : g_ascii_strup(str, len);
+ teco_string_append(ctx->result, folded, strlen(folded));
+ break;
+ }
+ case TECO_STRINGBUILDING_MODE_LOWER: {
+ g_autofree gchar *folded = ctx->codepage == SC_CP_UTF8
+ ? g_utf8_strdown(str, len) : g_ascii_strdown(str, len);
+ teco_string_append(ctx->result, folded, strlen(folded));
+ break;
+ }
+ }
+}
+
/*
* FIXME: All teco_state_stringbuilding_* states could be static?
*/
static teco_state_t *teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx,
- gchar chr, GError **error);
+ gunichar chr, GError **error);
TECO_DECLARE_STATE(teco_state_stringbuilding_ctl);
static teco_state_t *teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx,
- gchar chr, GError **error);
+ gunichar chr, GError **error);
TECO_DECLARE_STATE(teco_state_stringbuilding_escaped);
TECO_DECLARE_STATE(teco_state_stringbuilding_lower);
@@ -362,19 +406,29 @@ TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_quote);
TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_n);
static teco_state_t *
-teco_state_stringbuilding_start_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_start_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
- if (chr == '^')
+ switch (chr) {
+ case '^':
return &teco_state_stringbuilding_ctl;
- if (TECO_IS_CTL(chr))
- return teco_state_stringbuilding_ctl_input(ctx, TECO_CTL_ECHO(chr), error);
+ case TECO_CTL_KEY('^'):
+ /*
+ * Ctrl+^ is inserted verbatim as code 30.
+ * Otherwise it would expand to a single caret
+ * just like caret+caret (^^).
+ */
+ break;
+ default:
+ if (TECO_IS_CTL(chr))
+ return teco_state_stringbuilding_ctl_input(ctx, TECO_CTL_ECHO(chr), error);
+ }
return teco_state_stringbuilding_escaped_input(ctx, chr, error);
}
/* in cmdline.c */
gboolean teco_state_stringbuilding_start_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx,
- gchar key, GError **error);
+ gunichar key, GError **error);
TECO_DEFINE_STATE(teco_state_stringbuilding_start,
.is_start = TRUE,
@@ -383,12 +437,19 @@ TECO_DEFINE_STATE(teco_state_stringbuilding_start,
);
static teco_state_t *
-teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
chr = teco_ascii_toupper(chr);
switch (chr) {
- case '^': break;
+ case '^':
+ /*
+ * Double-caret expands to a single caret.
+ * Ctrl+^ (30) is handled separately and inserts code 30.
+ * The special handling of the double-caret should perhaps
+ * be abolished altogether.
+ */
+ break;
case 'Q':
case 'R': return &teco_state_stringbuilding_escaped;
case 'V': return &teco_state_stringbuilding_lower;
@@ -398,85 +459,139 @@ teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gchar ch
chr = TECO_CTL_KEY(chr);
}
+ /*
+ * Source code is always in UTF-8, so it does not
+ * make sense to handle ctx->codepage != SC_CP_UTF8
+ * separately.
+ */
if (ctx->result)
- teco_string_append_c(ctx->result, chr);
+ teco_string_append_wc(ctx->result, chr);
return &teco_state_stringbuilding_start;
}
TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_stringbuilding_ctl);
static teco_state_t *
-teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
if (!ctx->result)
/* parse-only mode */
return &teco_state_stringbuilding_start;
+ /*
+ * The subtle difference between UTF-8 and single-byte targets
+ * is that we don't try to casefold non-ASCII characters (codes >= 0x80) in single-byte mode.
+ */
switch (ctx->mode) {
+ case TECO_STRINGBUILDING_MODE_NORMAL:
+ break;
case TECO_STRINGBUILDING_MODE_UPPER:
- chr = g_ascii_toupper(chr);
+ chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80
+ ? g_unichar_toupper(chr) : chr;
break;
case TECO_STRINGBUILDING_MODE_LOWER:
- chr = g_ascii_tolower(chr);
- break;
- default:
+ chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80
+ ? g_unichar_tolower(chr) : chr;
break;
}
- teco_string_append_c(ctx->result, chr);
+ teco_string_append_wc(ctx->result, chr);
return &teco_state_stringbuilding_start;
}
-TECO_DEFINE_STATE(teco_state_stringbuilding_escaped);
+/* in cmdline.c */
+gboolean teco_state_stringbuilding_escaped_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx,
+ gunichar key, GError **error);
+
+TECO_DEFINE_STATE(teco_state_stringbuilding_escaped,
+ .process_edit_cmd_cb = (teco_state_process_edit_cmd_cb_t)
+ teco_state_stringbuilding_escaped_process_edit_cmd
+);
static teco_state_t *
-teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_lower_ctl_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
if (!ctx->result)
/* parse-only mode */
return &teco_state_stringbuilding_start;
- /*
- * FIXME: This does not handle ^V^V typed with up-carets.
- */
- if (chr == TECO_CTL_KEY('V')) {
+ chr = teco_ascii_toupper(chr);
+
+ if (chr == 'V') {
if (ctx->parent.must_undo)
teco_undo_guint(ctx->mode);
ctx->mode = TECO_STRINGBUILDING_MODE_LOWER;
} else {
- teco_string_append_c(ctx->result, g_ascii_tolower(chr));
+ /* control keys cannot be case folded */
+ teco_string_append_wc(ctx->result, TECO_CTL_KEY(chr));
}
return &teco_state_stringbuilding_start;
}
+TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_stringbuilding_lower_ctl);
+
+static teco_state_t *
+teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
+{
+ if (chr == '^')
+ return &teco_state_stringbuilding_lower_ctl;
+ if (TECO_IS_CTL(chr))
+ return teco_state_stringbuilding_lower_ctl_input(ctx, TECO_CTL_ECHO(chr), error);
+
+ if (ctx->result) {
+ chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80
+ ? g_unichar_tolower(chr) : chr;
+ teco_string_append_wc(ctx->result, chr);
+ }
+ return &teco_state_stringbuilding_start;
+}
+
TECO_DEFINE_STATE(teco_state_stringbuilding_lower);
static teco_state_t *
-teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_upper_ctl_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
if (!ctx->result)
/* parse-only mode */
return &teco_state_stringbuilding_start;
- /*
- * FIXME: This does not handle ^W^W typed with up-carets.
- */
- if (chr == TECO_CTL_KEY('W')) {
+ chr = teco_ascii_toupper(chr);
+
+ if (chr == 'W') {
if (ctx->parent.must_undo)
teco_undo_guint(ctx->mode);
ctx->mode = TECO_STRINGBUILDING_MODE_UPPER;
} else {
- teco_string_append_c(ctx->result, g_ascii_toupper(chr));
+ /* control keys cannot be case folded */
+ teco_string_append_wc(ctx->result, TECO_CTL_KEY(chr));
}
return &teco_state_stringbuilding_start;
}
+TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_stringbuilding_upper_ctl);
+
+static teco_state_t *
+teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
+{
+ if (chr == '^')
+ return &teco_state_stringbuilding_upper_ctl;
+ if (TECO_IS_CTL(chr))
+ return teco_state_stringbuilding_upper_ctl_input(ctx, TECO_CTL_ECHO(chr), error);
+
+ if (ctx->result) {
+ chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80
+ ? g_unichar_toupper(chr) : chr;
+ teco_string_append_wc(ctx->result, chr);
+ }
+ return &teco_state_stringbuilding_start;
+}
+
TECO_DEFINE_STATE(teco_state_stringbuilding_upper);
static teco_state_t *
-teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
teco_state_t *next;
@@ -488,8 +603,10 @@ teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gchar c
case 'N': next = &teco_state_stringbuilding_ctle_n; break;
default:
if (ctx->result) {
- gchar buf[] = {TECO_CTL_KEY('E'), chr};
- teco_string_append(ctx->result, buf, sizeof(buf));
+ /* also makes sure that search patterns can start with ^E */
+ gchar buf[1+6] = {TECO_CTL_KEY('E')};
+ gsize len = g_unichar_to_utf8(chr, buf+1);
+ teco_machine_stringbuilding_append(ctx, buf, 1+len);
}
return &teco_state_stringbuilding_start;
}
@@ -507,7 +624,7 @@ TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_stringbuilding_ctle);
/* in cmdline.c */
gboolean teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx,
- gchar chr, GError **error);
+ gunichar chr, GError **error);
/**
* @interface TECO_DEFINE_STATE_STRINGBUILDING_QREG
@@ -522,7 +639,7 @@ gboolean teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuil
)
static teco_state_t *
-teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
teco_qreg_t *qreg;
@@ -549,7 +666,7 @@ teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gch
*/
gchar buffer[TECO_EXPRESSIONS_FORMAT_LEN];
const gchar *num = teco_expressions_format(buffer, value);
- teco_string_append(ctx->result, num, strlen(num));
+ teco_machine_stringbuilding_append(ctx, num, strlen(num));
return &teco_state_stringbuilding_start;
}
@@ -557,7 +674,7 @@ teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gch
TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_num);
static teco_state_t *
-teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
teco_qreg_t *qreg;
@@ -578,21 +695,51 @@ teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar
teco_int_t value;
if (!qreg->vtable->get_integer(qreg, &value, error))
return NULL;
- if (value < 0 || value > 0xFF) {
- g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len);
- g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED,
- "Q-Register \"%s\" does not contain a valid character", name_printable);
- return NULL;
+
+ if (ctx->codepage == SC_CP_UTF8) {
+ if (value < 0 || !g_unichar_validate(value))
+ goto error_codepoint;
+ switch (ctx->mode) {
+ case TECO_STRINGBUILDING_MODE_NORMAL:
+ break;
+ case TECO_STRINGBUILDING_MODE_UPPER:
+ value = g_unichar_toupper(value);
+ break;
+ case TECO_STRINGBUILDING_MODE_LOWER:
+ value = g_unichar_tolower(value);
+ break;
+ }
+ teco_string_append_wc(ctx->result, value);
+ } else {
+ if (value < 0 || value > 0xFF)
+ goto error_codepoint;
+ switch (ctx->mode) {
+ case TECO_STRINGBUILDING_MODE_NORMAL:
+ break;
+ case TECO_STRINGBUILDING_MODE_UPPER:
+ value = g_ascii_toupper(value);
+ break;
+ case TECO_STRINGBUILDING_MODE_LOWER:
+ value = g_ascii_tolower(value);
+ break;
+ }
+ teco_string_append_c(ctx->result, value);
}
- teco_string_append_c(ctx->result, (gchar)value);
return &teco_state_stringbuilding_start;
+
+error_codepoint: {
+ g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len);
+ g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT,
+ "Q-Register \"%s\" does not contain a valid codepoint", name_printable);
+ return NULL;
+}
}
TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_u);
static teco_state_t *
-teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
teco_qreg_t *qreg;
@@ -610,20 +757,17 @@ teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gchar
/* parse-only mode */
return &teco_state_stringbuilding_start;
- /*
- * FIXME: Should we have a special teco_qreg_get_string_append() function?
- */
g_auto(teco_string_t) str = {NULL, 0};
- if (!qreg->vtable->get_string(qreg, &str.data, &str.len, error))
+ if (!qreg->vtable->get_string(qreg, &str.data, &str.len, NULL, error))
return NULL;
- teco_string_append(ctx->result, str.data, str.len);
+ teco_machine_stringbuilding_append(ctx, str.data, str.len);
return &teco_state_stringbuilding_start;
}
TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_q);
static teco_state_t *
-teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
teco_qreg_t *qreg;
teco_qreg_table_t *table;
@@ -643,7 +787,7 @@ teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, g
return &teco_state_stringbuilding_start;
g_auto(teco_string_t) str = {NULL, 0};
- if (!qreg->vtable->get_string(qreg, &str.data, &str.len, error))
+ if (!qreg->vtable->get_string(qreg, &str.data, &str.len, NULL, error))
return NULL;
/*
* NOTE: g_shell_quote() expects a null-terminated string, so it is
@@ -658,7 +802,7 @@ teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, g
return NULL;
}
g_autofree gchar *str_quoted = g_shell_quote(str.data ? : "");
- teco_string_append(ctx->result, str_quoted, strlen(str_quoted));
+ teco_machine_stringbuilding_append(ctx, str_quoted, strlen(str_quoted));
return &teco_state_stringbuilding_start;
}
@@ -666,7 +810,7 @@ teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, g
TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_quote);
static teco_state_t *
-teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
teco_qreg_t *qreg;
teco_qreg_table_t *table;
@@ -686,7 +830,7 @@ teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gchar
return &teco_state_stringbuilding_start;
g_auto(teco_string_t) str = {NULL, 0};
- if (!qreg->vtable->get_string(qreg, &str.data, &str.len, error))
+ if (!qreg->vtable->get_string(qreg, &str.data, &str.len, NULL, error))
return NULL;
if (teco_string_contains(&str, '\0')) {
teco_error_qregcontainsnull_set(error, qreg->head.name.data, qreg->head.name.len,
@@ -695,7 +839,7 @@ teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gchar
}
g_autofree gchar *str_escaped = teco_globber_escape_pattern(str.data);
- teco_string_append(ctx->result, str_escaped, strlen(str_escaped));
+ teco_machine_stringbuilding_append(ctx, str_escaped, strlen(str_escaped));
return &teco_state_stringbuilding_start;
}
@@ -703,13 +847,14 @@ teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gchar
TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_n);
void
-teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escape_char,
+teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gunichar escape_char,
teco_qreg_table_t *locals, gboolean must_undo)
{
memset(ctx, 0, sizeof(*ctx));
teco_machine_init(&ctx->parent, &teco_state_stringbuilding_start, must_undo);
ctx->escape_char = escape_char;
ctx->qreg_table_locals = locals;
+ ctx->codepage = teco_default_codepage();
}
void
@@ -723,6 +868,10 @@ teco_machine_stringbuilding_reset(teco_machine_stringbuilding_t *ctx)
ctx->mode = TECO_STRINGBUILDING_MODE_NORMAL;
}
+/*
+ * If we case folded only ASCII characters as in teco_ascii_toupper(),
+ * this could be simplified.
+ */
void
teco_machine_stringbuilding_escape(teco_machine_stringbuilding_t *ctx, const gchar *str, gsize len,
teco_string_t *target)
@@ -730,12 +879,18 @@ teco_machine_stringbuilding_escape(teco_machine_stringbuilding_t *ctx, const gch
target->data = g_malloc(len*2+1);
target->len = 0;
- for (guint i = 0; i < len; i++) {
- if (teco_ascii_toupper(str[i]) == ctx->escape_char ||
- (ctx->escape_char == '[' && str[i] == ']') ||
- (ctx->escape_char == '{' && str[i] == '}'))
+ for (guint i = 0; i < len; ) {
+ gunichar chr = g_utf8_get_char(str+i);
+
+ if (g_unichar_toupper(chr) == ctx->escape_char ||
+ (ctx->escape_char == '[' && chr == ']') ||
+ (ctx->escape_char == '{' && chr == '}'))
target->data[target->len++] = TECO_CTL_KEY('Q');
- target->data[target->len++] = str[i];
+
+ gsize lenc = g_utf8_next_char(str+i) - (str+i);
+ memcpy(target->data+target->len, str+i, lenc);
+ target->len += lenc;
+ i += lenc;
}
target->data[target->len] = '\0';
@@ -748,8 +903,17 @@ teco_machine_stringbuilding_clear(teco_machine_stringbuilding_t *ctx)
teco_machine_qregspec_free(ctx->machine_qregspec);
}
+gboolean
+teco_state_expectstring_initial(teco_machine_main_t *ctx, GError **error)
+{
+ if (ctx->mode == TECO_MODE_NORMAL)
+ teco_machine_stringbuilding_set_codepage(&ctx->expectstring.machine,
+ teco_default_codepage());
+ return TRUE;
+}
+
teco_state_t *
-teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_expectstring_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
teco_state_t *current = ctx->parent.current;
@@ -766,13 +930,18 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro
/*
* FIXME: Exclude setting at least whitespace characters as the
* new string escape character to avoid accidental errors?
+ *
+ * FIXME: Should we perhaps restrict case folding escape characters
+ * to the ASCII range (teco_ascii_toupper())?
+ * This would be faster than case folding each and every character
+ * of a string argument to check against the escape char.
*/
switch (ctx->expectstring.machine.escape_char) {
case '\e':
case '{':
if (ctx->parent.must_undo)
- teco_undo_gchar(ctx->expectstring.machine.escape_char);
- ctx->expectstring.machine.escape_char = teco_ascii_toupper(chr);
+ teco_undo_gunichar(ctx->expectstring.machine.escape_char);
+ ctx->expectstring.machine.escape_char = g_unichar_toupper(chr);
return current;
}
}
@@ -796,7 +965,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro
ctx->expectstring.nesting--;
break;
}
- } else if (teco_ascii_toupper(chr) == ctx->expectstring.machine.escape_char) {
+ } else if (g_unichar_toupper(chr) == ctx->expectstring.machine.escape_char) {
if (ctx->parent.must_undo)
teco_undo_gint(ctx->expectstring.nesting);
ctx->expectstring.nesting--;
@@ -826,7 +995,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro
if (current->expectstring.last) {
if (ctx->parent.must_undo)
- teco_undo_gchar(ctx->expectstring.machine.escape_char);
+ teco_undo_gunichar(ctx->expectstring.machine.escape_char);
ctx->expectstring.machine.escape_char = '\e';
}
ctx->expectstring.nesting = 1;
@@ -857,7 +1026,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro
if (!teco_machine_stringbuilding_input(&ctx->expectstring.machine, chr, str, error))
return NULL;
} else if (ctx->mode == TECO_MODE_NORMAL) {
- teco_string_append_c(&ctx->expectstring.string, chr);
+ teco_string_append_wc(&ctx->expectstring.string, chr);
}
/*
@@ -901,7 +1070,7 @@ teco_state_expectfile_process(teco_machine_main_t *ctx, const teco_string_t *str
g_assert(str->data != NULL);
/*
- * Null-chars must not ocur in filename/path strings and at some point
+ * Null-chars must not occur in filename/path strings and at some point
* teco_string_t has to be converted to a null-terminated C string
* as all the glib filename functions rely on null-terminated strings.
* Doing it here ensures that teco_file_expand_path() can be safely called