diff options
author | Robin Haberkorn <robin.haberkorn@googlemail.com> | 2025-07-03 15:21:37 +0200 |
---|---|---|
committer | Robin Haberkorn <robin.haberkorn@googlemail.com> | 2025-07-03 16:04:01 +0200 |
commit | 7bc7662f3cd1ceaf55e00f3d5f84e9772574afc8 (patch) | |
tree | 1f68eb72c387ee25a2f7d67225538252b6b17ef5 /src/parser.c | |
parent | 9e3746a4e42f98cad7dfe5b41779ffa81bd4366a (diff) | |
download | sciteco-7bc7662f3cd1ceaf55e00f3d5f84e9772574afc8.tar.gz |
Implemented ^E<code> string-building constructs for embedding bytes and codepoints; the code is parsed in a strtoul()-like manner (the numeric base is inferred from the code's prefix).
Diffstat (limited to 'src/parser.c')
-rw-r--r-- | src/parser.c | 133 |
1 files changed, 103 insertions, 30 deletions
diff --git a/src/parser.c b/src/parser.c index 8ef7a06..d7f9433 100644 --- a/src/parser.c +++ b/src/parser.c @@ -19,6 +19,7 @@ #include "config.h" #endif +#include <errno.h> #include <string.h> #include <glib.h> @@ -385,6 +386,7 @@ teco_machine_main_clear(teco_machine_main_t *ctx) teco_goto_table_clear(&ctx->goto_table); teco_string_clear(&ctx->expectstring.string); teco_machine_stringbuilding_clear(&ctx->expectstring.machine); + // FIXME: Could leak ctx->goto_label, but it's in an union } /** Append string to result with case folding. */ @@ -412,6 +414,48 @@ teco_machine_stringbuilding_append(teco_machine_stringbuilding_t *ctx, const gch } } +/** + * Append codepoint to result string with case folding. + * + * This also takes the target encoding into account and checks the value + * range accordingly. + * + * @returns FALSE if the codepoint is not valid in the target encoding. + */ +static gboolean +teco_machine_stringbuilding_append_c(teco_machine_stringbuilding_t *ctx, teco_int_t value) +{ + g_assert(ctx->result != NULL); + + if (ctx->codepage == SC_CP_UTF8) { + if (value < 0 || !g_unichar_validate(value)) + return FALSE; + switch (ctx->mode) { + case TECO_STRINGBUILDING_MODE_UPPER: + value = g_unichar_toupper(value); + break; + case TECO_STRINGBUILDING_MODE_LOWER: + value = g_unichar_tolower(value); + break; + } + teco_string_append_wc(ctx->result, value); + } else { + if (value < 0 || value > 0xFF) + return FALSE; + switch (ctx->mode) { + case TECO_STRINGBUILDING_MODE_UPPER: + value = g_ascii_toupper(value); + break; + case TECO_STRINGBUILDING_MODE_LOWER: + value = g_ascii_tolower(value); + break; + } + teco_string_append_c(ctx->result, value); + } + + return TRUE; +} + /* * FIXME: All teco_state_stringbuilding_* states could be static? 
*/ @@ -429,6 +473,7 @@ TECO_DECLARE_STATE(teco_state_stringbuilding_upper); TECO_DECLARE_STATE(teco_state_stringbuilding_ctle); TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_num); TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_u); +TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_code); TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_q); TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_quote); TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_n); @@ -643,6 +688,7 @@ teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gunicha switch (teco_ascii_toupper(chr)) { case '\\': next = &teco_state_stringbuilding_ctle_num; break; case 'U': next = &teco_state_stringbuilding_ctle_u; break; + case '<': next = &teco_state_stringbuilding_ctle_code; break; case 'Q': next = &teco_state_stringbuilding_ctle_q; break; case '@': next = &teco_state_stringbuilding_ctle_quote; break; case 'N': next = &teco_state_stringbuilding_ctle_n; break; @@ -741,43 +787,69 @@ teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gunic if (!qreg->vtable->get_integer(qreg, &value, error)) return NULL; - if (ctx->codepage == SC_CP_UTF8) { - if (value < 0 || !g_unichar_validate(value)) - goto error_codepoint; - switch (ctx->mode) { - case TECO_STRINGBUILDING_MODE_UPPER: - value = g_unichar_toupper(value); - break; - case TECO_STRINGBUILDING_MODE_LOWER: - value = g_unichar_tolower(value); - break; + if (!teco_machine_stringbuilding_append_c(ctx, value)) { + g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len); + g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT, + "Q-Register \"%s\" does not contain a valid codepoint", name_printable); + return NULL; + } + + return &teco_state_stringbuilding_start; +} + +TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_u); + +static teco_state_t * +teco_state_stringbuilding_ctle_code_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) +{ 
+ if (chr == '>') { + if (!ctx->result) + /* parse-only mode */ + return &teco_state_stringbuilding_start; + + if (!ctx->code.data) { + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_CODEPOINT, + "Invalid empty ^E<> specified"); + return NULL; } - teco_string_append_wc(ctx->result, value); - } else { - if (value < 0 || value > 0xFF) - goto error_codepoint; - switch (ctx->mode) { - case TECO_STRINGBUILDING_MODE_UPPER: - value = g_ascii_toupper(value); - break; - case TECO_STRINGBUILDING_MODE_LOWER: - value = g_ascii_tolower(value); - break; + + /* + * FIXME: Once we support hexadecimal constants in the SciTECO + * language itself, we might support this syntax as well. + * Or should we perhaps always consider the current radix? + */ + gchar *endp = ctx->code.data; + errno = 0; + gint64 code = g_ascii_strtoll(ctx->code.data, &endp, 0); + if (errno || endp - ctx->code.data != ctx->code.len || + !teco_machine_stringbuilding_append_c(ctx, code)) { + /* will also catch embedded nulls */ + g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT, + "Invalid code ^E<%s> specified", ctx->code.data); + return NULL; } - teco_string_append_c(ctx->result, value); + + if (ctx->parent.must_undo) + teco_undo_string_own(ctx->code); + else + teco_string_clear(&ctx->code); + memset(&ctx->code, 0, sizeof(ctx->code)); + + return &teco_state_stringbuilding_start; } - return &teco_state_stringbuilding_start; + if (!ctx->result) + /* parse-only mode */ + return &teco_state_stringbuilding_ctle_code; -error_codepoint: { - g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len); - g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT, - "Q-Register \"%s\" does not contain a valid codepoint", name_printable); - return NULL; -} + if (ctx->parent.must_undo) + undo__teco_string_truncate(&ctx->code, ctx->code.len); + teco_string_append_wc(&ctx->code, chr); + + return &teco_state_stringbuilding_ctle_code; } 
-TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_u); +TECO_DEFINE_STATE(teco_state_stringbuilding_ctle_code); static teco_state_t * teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error) @@ -942,6 +1014,7 @@ teco_machine_stringbuilding_clear(teco_machine_stringbuilding_t *ctx) { if (ctx->machine_qregspec) teco_machine_qregspec_free(ctx->machine_qregspec); + teco_string_clear(&ctx->code); } gboolean |