the ^EUq string building escape now respects the encoding (can insert bytes or codepoints) (refs #5)

* This is trickier than it sounds because there isn't one single place to consult. It depends on the context. If the string argument relates to buffer contents - as in <I>, <S>, <FR> etc. - the buffer's encoding is consulted. If it goes into a register (EU), the register's encoding is consulted. Everything else (O, EN, EC, ES...) expects only Unicode codepoints.
* This is communicated through a new field teco_machine_stringbuilding_t::codepage, which must be set in the states' initial callback.
* Seems overkill just for ^EUq, but it can be used for context-sensitive processing of all the other string building constructs as well.
* ^V and ^W cannot be supported for Unicode characters for the time being without a Unicode-aware parser.
author: Robin Haberkorn <robin.haberkorn@googlemail.com> 2024-09-04 12:49:29 +0200
committer: Robin Haberkorn <robin.haberkorn@googlemail.com> 2024-09-09 18:22:21 +0200
commit: b31b88717172e22b49c0493185f603b8f84989ec (patch)
tree: 43850d7d04e721987b89c37c68f24e657b5cb9c6 /src/parser.c
parent: b85edaa0021c06d63fee6d8904fc822815e8b933 (diff)
download: sciteco-b31b88717172e22b49c0493185f603b8f84989ec.tar.gz
1 files changed, 31 insertions, 6 deletions
diff --git a/src/parser.c b/src/parser.c
index 8d3cc92..29519b0 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -410,6 +410,7 @@ teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gcha
 		/* parse-only mode */
 		return &teco_state_stringbuilding_start;
 
+	/* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */
 	switch (ctx->mode) {
 	case TECO_STRINGBUILDING_MODE_UPPER:
 		chr = g_ascii_toupper(chr);
@@ -442,6 +443,7 @@ teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar
 			teco_undo_guint(ctx->mode);
 		ctx->mode = TECO_STRINGBUILDING_MODE_LOWER;
 	} else {
+		/* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */
 		teco_string_append_c(ctx->result, g_ascii_tolower(chr));
 	}
 
@@ -465,6 +467,7 @@ teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar
 			teco_undo_guint(ctx->mode);
 		ctx->mode = TECO_STRINGBUILDING_MODE_UPPER;
 	} else {
+		/* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */
 		teco_string_append_c(ctx->result, g_ascii_toupper(chr));
 	}
 
@@ -576,15 +579,28 @@ teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar
 	teco_int_t value;
 	if (!qreg->vtable->get_integer(qreg, &value, error))
 		return NULL;
-	if (value < 0 || value > 0xFF) {
-		g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len);
-		g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED,
-		            "Q-Register \"%s\" does not contain a valid character", name_printable);
-		return NULL;
+
+	if (ctx->codepage == SC_CP_UTF8) {
+		if (value < 0 || !g_unichar_validate(value))
+			goto error_codepoint;
+		/* 4 bytes should be enough, but we better follow the documentation */
+		gchar buf[6];
+		gsize len = g_unichar_to_utf8(value, buf);
+		teco_string_append(ctx->result, buf, len);
+	} else {
+		if (value < 0 || value > 0xFF)
+			goto error_codepoint;
+		teco_string_append_c(ctx->result, (gchar)value);
 	}
 
-	teco_string_append_c(ctx->result, (gchar)value);
 	return &teco_state_stringbuilding_start;
+
+error_codepoint: {
+	g_autofree gchar *name_printable = teco_string_echo(qreg->head.name.data, qreg->head.name.len);
+	g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT,
+	            "Q-Register \"%s\" does not contain a valid codepoint", name_printable);
+	return NULL;
+}
 }
 
 TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_u);
@@ -708,6 +724,7 @@ teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escap
 	teco_machine_init(&ctx->parent, &teco_state_stringbuilding_start, must_undo);
 	ctx->escape_char = escape_char;
 	ctx->qreg_table_locals = locals;
+	ctx->codepage = SC_CP_UTF8;
 }
 
 void
@@ -746,6 +763,14 @@ teco_machine_stringbuilding_clear(teco_machine_stringbuilding_t *ctx)
 		teco_machine_qregspec_free(ctx->machine_qregspec);
 }
 
+gboolean
+teco_state_expectstring_initial(teco_machine_main_t *ctx, GError **error)
+{
+	if (ctx->mode == TECO_MODE_NORMAL)
+		teco_undo_guint(ctx->expectstring.machine.codepage) = SC_CP_UTF8;
+	return TRUE;
+}
+
 teco_state_t *
 teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **error)
 {
author	Robin Haberkorn <robin.haberkorn@googlemail.com>	2024-09-04 12:49:29 +0200
committer	Robin Haberkorn <robin.haberkorn@googlemail.com>	2024-09-09 18:22:21 +0200
commit	b31b88717172e22b49c0493185f603b8f84989ec (patch)
tree	43850d7d04e721987b89c37c68f24e657b5cb9c6 /src/parser.c
parent	b85edaa0021c06d63fee6d8904fc822815e8b933 (diff)
download	sciteco-b31b88717172e22b49c0493185f603b8f84989ec.tar.gz