diff options
author | Robin Haberkorn <robin.haberkorn@googlemail.com> | 2024-09-04 12:49:29 +0200 |
---|---|---|
committer | Robin Haberkorn <robin.haberkorn@googlemail.com> | 2024-09-09 18:22:21 +0200 |
commit | b31b88717172e22b49c0493185f603b8f84989ec (patch) | |
tree | 43850d7d04e721987b89c37c68f24e657b5cb9c6 /src/parser.c | |
parent | b85edaa0021c06d63fee6d8904fc822815e8b933 (diff) | |
download | sciteco-b31b88717172e22b49c0493185f603b8f84989ec.tar.gz |
the ^EUq string building escape now respects the encoding (can insert bytes or codepoints) (refs #5)
* This is trickier than it sounds because there is no single place to consult for the encoding;
it depends on the context.
If the string argument relates to buffer contents - as in <I>, <S>, <FR> etc. -
the buffer's encoding is consulted.
If it goes into a register (EU), the register's encoding is consulted.
Everything else (O, EN, EC, ES...) expects only Unicode codepoints.
* This is communicated through a new field teco_machine_stringbuilding_t::codepage
which must be set in the states' initial callback.
* This seems like overkill just for ^EUq, but the new field can also be used for context-sensitive
processing of all the other string building constructs.
* ^V and ^W cannot support Unicode characters for the time being, as that would require a Unicode-aware parser.
Diffstat (limited to 'src/parser.c')
-rw-r--r-- | src/parser.c | 37 |
1 files changed, 31 insertions, 6 deletions
diff --git a/src/parser.c b/src/parser.c index 8d3cc92..29519b0 100644 --- a/src/parser.c +++ b/src/parser.c @@ -410,6 +410,7 @@ teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gcha /* parse-only mode */ return &teco_state_stringbuilding_start; + /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */ switch (ctx->mode) { case TECO_STRINGBUILDING_MODE_UPPER: chr = g_ascii_toupper(chr); @@ -442,6 +443,7 @@ teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar teco_undo_guint(ctx->mode); ctx->mode = TECO_STRINGBUILDING_MODE_LOWER; } else { + /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */ teco_string_append_c(ctx->result, g_ascii_tolower(chr)); } @@ -465,6 +467,7 @@ teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar teco_undo_guint(ctx->mode); ctx->mode = TECO_STRINGBUILDING_MODE_UPPER; } else { + /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */ teco_string_append_c(ctx->result, g_ascii_toupper(chr)); } @@ -576,15 +579,28 @@ teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar teco_int_t value; if (!qreg->vtable->get_integer(qreg, &value, error)) return NULL; - if (value < 0 || value > 0xFF) { - g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len); - g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED, - "Q-Register \"%s\" does not contain a valid character", name_printable); - return NULL; + + if (ctx->codepage == SC_CP_UTF8) { + if (value < 0 || !g_unichar_validate(value)) + goto error_codepoint; + /* 4 bytes should be enough, but we better follow the documentation */ + gchar buf[6]; + gsize len = g_unichar_to_utf8(value, buf); + teco_string_append(ctx->result, buf, len); + } else { + if (value < 0 || value > 0xFF) + goto error_codepoint; + teco_string_append_c(ctx->result, (gchar)value); } - teco_string_append_c(ctx->result, (gchar)value); 
return &teco_state_stringbuilding_start; + +error_codepoint: { + g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len); + g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT, + "Q-Register \"%s\" does not contain a valid codepoint", name_printable); + return NULL; +} } TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_u); @@ -708,6 +724,7 @@ teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escap teco_machine_init(&ctx->parent, &teco_state_stringbuilding_start, must_undo); ctx->escape_char = escape_char; ctx->qreg_table_locals = locals; + ctx->codepage = SC_CP_UTF8; } void @@ -746,6 +763,14 @@ teco_machine_stringbuilding_clear(teco_machine_stringbuilding_t *ctx) teco_machine_qregspec_free(ctx->machine_qregspec); } +gboolean +teco_state_expectstring_initial(teco_machine_main_t *ctx, GError **error) +{ + if (ctx->mode == TECO_MODE_NORMAL) + teco_undo_guint(ctx->expectstring.machine.codepage) = SC_CP_UTF8; + return TRUE; +} + teco_state_t * teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **error) { |