aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/parser.c
diff options
context:
space:
mode:
authorRobin Haberkorn <robin.haberkorn@googlemail.com>2024-09-04 12:49:29 +0200
committerRobin Haberkorn <robin.haberkorn@googlemail.com>2024-09-09 18:22:21 +0200
commitb31b88717172e22b49c0493185f603b8f84989ec (patch)
tree43850d7d04e721987b89c37c68f24e657b5cb9c6 /src/parser.c
parentb85edaa0021c06d63fee6d8904fc822815e8b933 (diff)
downloadsciteco-b31b88717172e22b49c0493185f603b8f84989ec.tar.gz
the ^EUq string building escape now respects the encoding (can insert bytes or codepoints) (refs #5)
* This is trickier than it sounds because there isn't one single place to consult. It depends on the context. If the string argument relates to buffer contents - as in <I>, <S>, <FR> etc. - the buffer's encoding is consulted. If it goes into a register (EU), the register's encoding is consulted. Everything else (O, EN, EC, ES...) expects only Unicode codepoints. * This is communicated through a new field teco_machine_stringbuilding_t::codepage, which must be set in the states' initial callback. * This seems like overkill just for ^EUq, but it can be used for context-sensitive processing of all the other string building constructs as well. * ^V and ^W cannot be supported for Unicode characters for the time being without a Unicode-aware parser.
Diffstat (limited to 'src/parser.c')
-rw-r--r--src/parser.c37
1 files changed, 31 insertions, 6 deletions
diff --git a/src/parser.c b/src/parser.c
index 8d3cc92..29519b0 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -410,6 +410,7 @@ teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gcha
/* parse-only mode */
return &teco_state_stringbuilding_start;
+ /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */
switch (ctx->mode) {
case TECO_STRINGBUILDING_MODE_UPPER:
chr = g_ascii_toupper(chr);
@@ -442,6 +443,7 @@ teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar
teco_undo_guint(ctx->mode);
ctx->mode = TECO_STRINGBUILDING_MODE_LOWER;
} else {
+ /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */
teco_string_append_c(ctx->result, g_ascii_tolower(chr));
}
@@ -465,6 +467,7 @@ teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar
teco_undo_guint(ctx->mode);
ctx->mode = TECO_STRINGBUILDING_MODE_UPPER;
} else {
+ /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */
teco_string_append_c(ctx->result, g_ascii_toupper(chr));
}
@@ -576,15 +579,28 @@ teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar
teco_int_t value;
if (!qreg->vtable->get_integer(qreg, &value, error))
return NULL;
- if (value < 0 || value > 0xFF) {
- g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len);
- g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED,
- "Q-Register \"%s\" does not contain a valid character", name_printable);
- return NULL;
+
+ if (ctx->codepage == SC_CP_UTF8) {
+ if (value < 0 || !g_unichar_validate(value))
+ goto error_codepoint;
+ /* 4 bytes should be enough, but we better follow the documentation */
+ gchar buf[6];
+ gsize len = g_unichar_to_utf8(value, buf);
+ teco_string_append(ctx->result, buf, len);
+ } else {
+ if (value < 0 || value > 0xFF)
+ goto error_codepoint;
+ teco_string_append_c(ctx->result, (gchar)value);
}
- teco_string_append_c(ctx->result, (gchar)value);
return &teco_state_stringbuilding_start;
+
+error_codepoint: {
+ g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len);
+ g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT,
+ "Q-Register \"%s\" does not contain a valid codepoint", name_printable);
+ return NULL;
+}
}
TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_u);
@@ -708,6 +724,7 @@ teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escap
teco_machine_init(&ctx->parent, &teco_state_stringbuilding_start, must_undo);
ctx->escape_char = escape_char;
ctx->qreg_table_locals = locals;
+ ctx->codepage = SC_CP_UTF8;
}
void
@@ -746,6 +763,14 @@ teco_machine_stringbuilding_clear(teco_machine_stringbuilding_t *ctx)
teco_machine_qregspec_free(ctx->machine_qregspec);
}
+gboolean
+teco_state_expectstring_initial(teco_machine_main_t *ctx, GError **error)
+{
+ if (ctx->mode == TECO_MODE_NORMAL)
+ teco_undo_guint(ctx->expectstring.machine.codepage) = SC_CP_UTF8;
+ return TRUE;
+}
+
teco_state_t *
teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **error)
{