From b31b88717172e22b49c0493185f603b8f84989ec Mon Sep 17 00:00:00 2001 From: Robin Haberkorn Date: Wed, 4 Sep 2024 12:49:29 +0200 Subject: the ^EUq string building escape now respects the encoding (can insert bytes or codepoints) (refs #5) * This is trickier than it sounds because there isn't one single place to consult. It depends on the context. If the string argument relates to buffer contents - as in I, S, etc. - the buffer's encoding is consulted. If it goes into a register (EU), the register's encoding is consulted. Everything else (O, EN, EC, ES...) expects only Unicode codepoints. * This is communicated through a new field teco_machine_stringbuilding_t::codepage which must be set in the states' initial callback. * Seems overkill just for ^EUq, but it can be used for context-sensitive processing of all the other string building constructs as well. * ^V and ^W cannot be supported for Unicode characters for the time being without a Unicode-aware parser --- doc/sciteco.7.template | 6 ++++++ src/core-commands.c | 9 +++++++-- src/interface.h | 6 ++++++ src/parser.c | 37 +++++++++++++++++++++++++++++++------ src/parser.h | 12 +++++++++--- src/qreg-commands.c | 18 ++++++++++++++++++ src/qreg.c | 6 +++--- src/qreg.h | 2 +- src/search.c | 12 +++++++++++- src/spawn.c | 5 +++++ src/view.h | 7 +++++++ 11 files changed, 104 insertions(+), 16 deletions(-) diff --git a/doc/sciteco.7.template b/doc/sciteco.7.template index a6cca40..ca23c93 100644 --- a/doc/sciteco.7.template +++ b/doc/sciteco.7.template @@ -1647,6 +1647,12 @@ Expands to the character whose code is stored in the numeric part of Q-Register \fIq\fP. For instance if register \(lqA\(rq contains the code 66, \(lq^EUa\(rq expands to the character \(lqB\(rq. +The interpretation of this code depends on the context. +Within inserts and searches (\fBI\fP, \fBS\fP, etc.) bytes or Unicode codepoints +are expected depending on the buffer's encoding. 
+Operations on registers (\fBEU\fP) similarly consult the +register's encoding. +Everything else expects Unicode codepoints. .TP .SCITECO_TOPIC ^EQ ^EQq .BI ^EQ q diff --git a/src/core-commands.c b/src/core-commands.c index ef4621f..a84d0ef 100644 --- a/src/core-commands.c +++ b/src/core-commands.c @@ -2494,8 +2494,7 @@ teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error) gboolean colon_modified = teco_machine_main_eval_colon(ctx); - sptr_t old_cp = teco_interface_ssm(SCI_GETCODEPAGE, 0, 0) - ? : teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0); + guint old_cp = teco_interface_get_codepage(); if (!teco_expressions_args()) { /* get current code page */ @@ -2745,6 +2744,12 @@ teco_state_insert_initial(teco_machine_main_t *ctx, GError **error) if (ctx->mode > TECO_MODE_NORMAL) return TRUE; + /* + * Current document's encoding determines the behaviour of + * string building constructs. + */ + teco_undo_guint(ctx->expectstring.machine.codepage) = teco_interface_get_codepage(); + if (!teco_expressions_eval(FALSE, error)) return FALSE; guint args = teco_expressions_args(); diff --git a/src/interface.h b/src/interface.h index cbe10bd..bbefe88 100644 --- a/src/interface.h +++ b/src/interface.h @@ -154,6 +154,12 @@ void teco_interface_process_notify(SCNotification *notify); /** @pure */ void teco_interface_cleanup(void); +static inline guint +teco_interface_get_codepage(void) +{ + return teco_view_get_codepage(teco_interface_current_view); +} + static inline gssize teco_glyphs2bytes(teco_int_t pos) { diff --git a/src/parser.c b/src/parser.c index 8d3cc92..29519b0 100644 --- a/src/parser.c +++ b/src/parser.c @@ -410,6 +410,7 @@ teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gcha /* parse-only mode */ return &teco_state_stringbuilding_start; + /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */ switch (ctx->mode) { case TECO_STRINGBUILDING_MODE_UPPER: chr = g_ascii_toupper(chr); @@ -442,6 
+443,7 @@ teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar teco_undo_guint(ctx->mode); ctx->mode = TECO_STRINGBUILDING_MODE_LOWER; } else { + /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */ teco_string_append_c(ctx->result, g_ascii_tolower(chr)); } @@ -465,6 +467,7 @@ teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar teco_undo_guint(ctx->mode); ctx->mode = TECO_STRINGBUILDING_MODE_UPPER; } else { + /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */ teco_string_append_c(ctx->result, g_ascii_toupper(chr)); } @@ -576,15 +579,28 @@ teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar teco_int_t value; if (!qreg->vtable->get_integer(qreg, &value, error)) return NULL; - if (value < 0 || value > 0xFF) { - g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len); - g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED, - "Q-Register \"%s\" does not contain a valid character", name_printable); - return NULL; + + if (ctx->codepage == SC_CP_UTF8) { + if (value < 0 || !g_unichar_validate(value)) + goto error_codepoint; + /* 4 bytes should be enough, but we better follow the documentation */ + gchar buf[6]; + gsize len = g_unichar_to_utf8(value, buf); + teco_string_append(ctx->result, buf, len); + } else { + if (value < 0 || value > 0xFF) + goto error_codepoint; + teco_string_append_c(ctx->result, (gchar)value); } - teco_string_append_c(ctx->result, (gchar)value); return &teco_state_stringbuilding_start; + +error_codepoint: { + g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len); + g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT, + "Q-Register \"%s\" does not contain a valid codepoint", name_printable); + return NULL; +} } TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_u); @@ -708,6 +724,7 @@ 
teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escap teco_machine_init(&ctx->parent, &teco_state_stringbuilding_start, must_undo); ctx->escape_char = escape_char; ctx->qreg_table_locals = locals; + ctx->codepage = SC_CP_UTF8; } void @@ -746,6 +763,14 @@ teco_machine_stringbuilding_clear(teco_machine_stringbuilding_t *ctx) teco_machine_qregspec_free(ctx->machine_qregspec); } +gboolean +teco_state_expectstring_initial(teco_machine_main_t *ctx, GError **error) +{ + if (ctx->mode == TECO_MODE_NORMAL) + teco_undo_guint(ctx->expectstring.machine.codepage) = SC_CP_UTF8; + return TRUE; +} + teco_state_t * teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **error) { diff --git a/src/parser.h b/src/parser.h index 4b4a3a0..ba6054f 100644 --- a/src/parser.h +++ b/src/parser.h @@ -309,9 +309,6 @@ typedef enum { /** * A stringbuilding state machine. * - * @fixme Should contain the escape char (currently in teco_machine_expectstring_t), - * so that we can escape it via ^Q. - * * @extends teco_machine_t */ typedef struct teco_machine_stringbuilding_t { @@ -350,6 +347,13 @@ typedef struct teco_machine_stringbuilding_t { * (see teco_state_stringbuilding_start_process_edit_cmd()). */ teco_string_t *result; + + /** + * Encoding of string in `result`. + * This is inherited from the embedding command and may depend on + * the buffer's or Q-Register's encoding. 
+ */ + guint codepage; } teco_machine_stringbuilding_t; void teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escape_char, @@ -508,6 +512,7 @@ void teco_machine_main_clear(teco_machine_main_t *ctx); G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(teco_machine_main_t, teco_machine_main_clear); +gboolean teco_state_expectstring_initial(teco_machine_main_t *ctx, GError **error); teco_state_t *teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **error); gboolean teco_state_expectstring_refresh(teco_machine_main_t *ctx, GError **error); @@ -533,6 +538,7 @@ gboolean teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco return teco_state_expectstring_input(ctx, chr, error); \ } \ TECO_DEFINE_STATE(NAME, \ + .initial_cb = (teco_state_initial_cb_t)teco_state_expectstring_initial, \ .refresh_cb = (teco_state_refresh_cb_t)teco_state_expectstring_refresh, \ .process_edit_cmd_cb = (teco_state_process_edit_cmd_cb_t) \ teco_state_expectstring_process_edit_cmd, \ diff --git a/src/qreg-commands.c b/src/qreg-commands.c index 09b2b90..1bde944 100644 --- a/src/qreg-commands.c +++ b/src/qreg-commands.c @@ -470,6 +470,23 @@ TECO_DEFINE_STATE_EXPECTQREG(teco_state_eucommand, .expectqreg.type = TECO_QREG_OPTIONAL_INIT ); +static gboolean +teco_state_setqregstring_building_initial(teco_machine_main_t *ctx, GError **error) +{ + if (ctx->mode > TECO_MODE_NORMAL) + return TRUE; + + teco_qreg_t *qreg; + teco_machine_qregspec_get_results(ctx->expectqreg, &qreg, NULL); + + /* + * The expected codepage of string building constructs is determined + * by the Q-Register. + */ + teco_undo_guint(ctx->expectstring.machine.codepage) = qreg->vtable->get_codepage(qreg); + return TRUE; +} + static teco_state_t * teco_state_setqregstring_building_done(teco_machine_main_t *ctx, const teco_string_t *str, GError **error) { @@ -487,6 +504,7 @@ teco_state_setqregstring_building_done(teco_machine_main_t *ctx, const teco_stri * characters \fBenabled\fP. 
*/ TECO_DEFINE_STATE_EXPECTSTRING(teco_state_setqregstring_building, + .initial_cb = (teco_state_initial_cb_t)teco_state_setqregstring_building_initial, .expectstring.string_building = TRUE ); diff --git a/src/qreg.c b/src/qreg.c index c3ab1a5..2c2b6ad 100644 --- a/src/qreg.c +++ b/src/qreg.c @@ -204,14 +204,14 @@ teco_qreg_plain_get_integer(teco_qreg_t *qreg, teco_int_t *ret, GError **error) return TRUE; } -static gint +static guint teco_qreg_plain_get_codepage(teco_qreg_t *qreg) { if (teco_qreg_current) teco_doc_update(&teco_qreg_current->string, teco_qreg_view); teco_doc_edit(&qreg->string); - gint ret = teco_view_ssm(teco_qreg_view, SCI_GETCODEPAGE, 0, 0); + guint ret = teco_view_get_codepage(teco_qreg_view); if (teco_qreg_current) teco_doc_edit(&teco_qreg_current->string); @@ -408,7 +408,7 @@ teco_qreg_external_edit(teco_qreg_t *qreg, GError **error) return TRUE; } -static gint +static guint teco_qreg_external_get_codepage(teco_qreg_t *qreg) { /* diff --git a/src/qreg.h b/src/qreg.h index 7a150ea..f87b877 100644 --- a/src/qreg.h +++ b/src/qreg.h @@ -47,7 +47,7 @@ typedef const struct { gboolean (*undo_set_integer)(teco_qreg_t *qreg, GError **error); gboolean (*get_integer)(teco_qreg_t *qreg, teco_int_t *ret, GError **error); - gint (*get_codepage)(teco_qreg_t *qreg); + guint (*get_codepage)(teco_qreg_t *qreg); gboolean (*set_string)(teco_qreg_t *qreg, const gchar *str, gsize len, GError **error); gboolean (*undo_set_string)(teco_qreg_t *qreg, GError **error); gboolean (*append_string)(teco_qreg_t *qreg, const gchar *str, gsize len, GError **error); diff --git a/src/search.c b/src/search.c index 88b0e16..f72616d 100644 --- a/src/search.c +++ b/src/search.c @@ -60,6 +60,8 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error) if (ctx->mode > TECO_MODE_NORMAL) return TRUE; + teco_undo_guint(ctx->expectstring.machine.codepage) = teco_interface_get_codepage(); + if (G_UNLIKELY(!teco_search_qreg_machine)) teco_search_qreg_machine = 
teco_machine_qregspec_new(TECO_QREG_REQUIRED, ctx->qreg_table_locals, ctx->parent.must_undo); @@ -978,11 +980,19 @@ teco_state_search_delete_done(teco_machine_main_t *ctx, const teco_string_t *str */ TECO_DEFINE_STATE_SEARCH(teco_state_search_delete); +static gboolean +teco_state_replace_insert_initial(teco_machine_main_t *ctx, GError **error) +{ + if (ctx->mode == TECO_MODE_NORMAL) + teco_undo_guint(ctx->expectstring.machine.codepage) = teco_interface_get_codepage(); + return TRUE; +} + /* * FIXME: Could be static */ TECO_DEFINE_STATE_INSERT(teco_state_replace_insert, - .initial_cb = NULL + .initial_cb = (teco_state_initial_cb_t)teco_state_replace_insert_initial ); static teco_state_t * diff --git a/src/spawn.c b/src/spawn.c index c1fb426..4317288 100644 --- a/src/spawn.c +++ b/src/spawn.c @@ -164,6 +164,11 @@ teco_state_execute_initial(teco_machine_main_t *ctx, GError **error) if (ctx->mode > TECO_MODE_NORMAL) return TRUE; + /* + * Command-lines and file names are always assumed to be UTF-8. + */ + teco_undo_guint(ctx->expectstring.machine.codepage) = SC_CP_UTF8; + if (!teco_expressions_eval(FALSE, error)) return FALSE; diff --git a/src/view.h b/src/view.h index 882a33c..8f54fdd 100644 --- a/src/view.h +++ b/src/view.h @@ -71,6 +71,13 @@ gboolean teco_view_save_to_file(teco_view_t *ctx, const gchar *filename, GError /** @pure @memberof teco_view_t */ void teco_view_free(teco_view_t *ctx); +static inline guint +teco_view_get_codepage(teco_view_t *ctx) +{ + return teco_view_ssm(ctx, SCI_GETCODEPAGE, 0, 0) + ? : teco_view_ssm(ctx, SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0); +} + gssize teco_view_glyphs2bytes(teco_view_t *ctx, teco_int_t pos); teco_int_t teco_view_bytes2glyphs(teco_view_t *ctx, gsize pos); gssize teco_view_glyphs2bytes_relative(teco_view_t *ctx, gsize pos, teco_int_t n); -- cgit v1.2.3