From 893a0a6ad85411a57c1225af03260b34561377c7 Mon Sep 17 00:00:00 2001 From: Robin Haberkorn Date: Wed, 4 Sep 2024 18:26:00 +0200 Subject: leave some comments on what to do when converting the parser to Unicode (refs #5) --- src/qreg.c | 9 +++++++++ src/string-utils.c | 13 ++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/qreg.c b/src/qreg.c index 2c2b6ad..10aaa86 100644 --- a/src/qreg.c +++ b/src/qreg.c @@ -83,6 +83,12 @@ teco_qreg_execute(teco_qreg_t *qreg, teco_qreg_table_t *qreg_table_locals, GErro { g_auto(teco_string_t) macro = {NULL, 0}; + /* + * FIXME: Once we have an Unicode-aware parser, + * we should probably check the encoding of the register. + * On the other hand, we will have to validate the + * UTF-8 codepoints before execution anyway. + */ if (!qreg->vtable->get_string(qreg, &macro.data, &macro.len, error) || !teco_execute_macro(macro.data, macro.len, qreg_table_locals, error)) { teco_error_add_frame_qreg(qreg->head.name.data, qreg->head.name.len); @@ -1310,6 +1316,7 @@ teco_state_qregspec_start_global_input(teco_machine_qregspec_t *ctx, gchar chr, if (!ctx->parse_only) { if (ctx->parent.must_undo) undo__teco_string_truncate(&ctx->name, ctx->name.len); + /* FIXME: g_unicode_toupper() once we have an Unicode-conforming parser */ teco_string_append_c(&ctx->name, g_ascii_toupper(chr)); } return teco_state_qregspec_done(ctx, error); @@ -1334,6 +1341,7 @@ teco_state_qregspec_firstchar_input(teco_machine_qregspec_t *ctx, gchar chr, GEr if (!ctx->parse_only) { if (ctx->parent.must_undo) undo__teco_string_truncate(&ctx->name, ctx->name.len); + /* FIXME: g_unicode_toupper() once we have an Unicode-conforming parser */ teco_string_append_c(&ctx->name, g_ascii_toupper(chr)); } return &teco_state_qregspec_secondchar; @@ -1352,6 +1360,7 @@ teco_state_qregspec_secondchar_input(teco_machine_qregspec_t *ctx, gchar chr, GE if (!ctx->parse_only) { if (ctx->parent.must_undo) undo__teco_string_truncate(&ctx->name, ctx->name.len); + /* FIXME: 
g_unicode_toupper() once we have an Unicode-conforming parser */ teco_string_append_c(&ctx->name, g_ascii_toupper(chr)); } return teco_state_qregspec_done(ctx, error); diff --git a/src/string-utils.c b/src/string-utils.c index e1013f5..ac5835b 100644 --- a/src/string-utils.c +++ b/src/string-utils.c @@ -91,7 +91,18 @@ teco_string_diff(const teco_string_t *a, const gchar *b, gsize b_len) return len; } -/** @memberof teco_string_t */ +/** + * Get the length of the prefix common to two strings + * without considering case. + * + * @fixme This is currently only used for symbols and one/two letter + * Q-Register names, which cannot be UTF-8. + * If we rewrote this to perform Unicode case folding, we would + * also have to check for character validity. + * Once our parser is Unicode-aware, this is not necessary. + * + * @memberof teco_string_t + */ gsize teco_string_casediff(const teco_string_t *a, const gchar *b, gsize b_len) { -- cgit v1.2.3