From b31b88717172e22b49c0493185f603b8f84989ec Mon Sep 17 00:00:00 2001 From: Robin Haberkorn Date: Wed, 4 Sep 2024 12:49:29 +0200 Subject: the ^EUq string building escape now respects the encoding (can insert bytes or codepoints) (refs #5) * This is trickier than it sounds because there isn't one single place to consult. It depends on the context. If the string argument relates to buffer contents - as in I, S, etc. - the buffer's encoding is consulted. If it goes into a register (EU), the register's encoding is consulted. Everything else (O, EN, EC, ES...) expects only Unicode codepoints. * This is communicated through a new field teco_machine_stringbuilding_t::codepage which must be set in the states' initial callback. * Seems overkill just for ^EUq, but it can be used for context-sensitive processing of all the other string building constructs as well. * ^V and ^W cannot be supported for Unicode characters for the time being without a Unicode-aware parser --- src/parser.c | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) (limited to 'src/parser.c') diff --git a/src/parser.c b/src/parser.c index 8d3cc92..29519b0 100644 --- a/src/parser.c +++ b/src/parser.c @@ -410,6 +410,7 @@ teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gcha /* parse-only mode */ return &teco_state_stringbuilding_start; + /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */ switch (ctx->mode) { case TECO_STRINGBUILDING_MODE_UPPER: chr = g_ascii_toupper(chr); @@ -442,6 +443,7 @@ teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar teco_undo_guint(ctx->mode); ctx->mode = TECO_STRINGBUILDING_MODE_LOWER; } else { + /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */ teco_string_append_c(ctx->result, g_ascii_tolower(chr)); } @@ -465,6 +467,7 @@ teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar teco_undo_guint(ctx->mode); 
ctx->mode = TECO_STRINGBUILDING_MODE_UPPER; } else { + /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */ teco_string_append_c(ctx->result, g_ascii_toupper(chr)); } @@ -576,15 +579,28 @@ teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar teco_int_t value; if (!qreg->vtable->get_integer(qreg, &value, error)) return NULL; - if (value < 0 || value > 0xFF) { - g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len); - g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED, - "Q-Register \"%s\" does not contain a valid character", name_printable); - return NULL; + + if (ctx->codepage == SC_CP_UTF8) { + if (value < 0 || !g_unichar_validate(value)) + goto error_codepoint; + /* 4 bytes should be enough, but we better follow the documentation */ + gchar buf[6]; + gsize len = g_unichar_to_utf8(value, buf); + teco_string_append(ctx->result, buf, len); + } else { + if (value < 0 || value > 0xFF) + goto error_codepoint; + teco_string_append_c(ctx->result, (gchar)value); } - teco_string_append_c(ctx->result, (gchar)value); return &teco_state_stringbuilding_start; + +error_codepoint: { + g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len); + g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT, + "Q-Register \"%s\" does not contain a valid codepoint", name_printable); + return NULL; +} } TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_u); @@ -708,6 +724,7 @@ teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escap teco_machine_init(&ctx->parent, &teco_state_stringbuilding_start, must_undo); ctx->escape_char = escape_char; ctx->qreg_table_locals = locals; + ctx->codepage = SC_CP_UTF8; } void @@ -746,6 +763,14 @@ teco_machine_stringbuilding_clear(teco_machine_stringbuilding_t *ctx) teco_machine_qregspec_free(ctx->machine_qregspec); } +gboolean +teco_state_expectstring_initial(teco_machine_main_t *ctx, GError 
**error) +{ + if (ctx->mode == TECO_MODE_NORMAL) + teco_undo_guint(ctx->expectstring.machine.codepage) = SC_CP_UTF8; + return TRUE; +} + teco_state_t * teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **error) { -- cgit v1.2.3