aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--doc/sciteco.7.template20
-rw-r--r--src/cmdline.c79
-rw-r--r--src/cmdline.h12
-rw-r--r--src/core-commands.c25
-rw-r--r--src/core-commands.h2
-rw-r--r--src/error.h4
-rw-r--r--src/expressions.c3
-rw-r--r--src/expressions.h2
-rw-r--r--src/goto-commands.c6
-rw-r--r--src/help.c2
-rw-r--r--src/interface-curses/interface.c24
-rw-r--r--src/interface-gtk/interface.c10
-rw-r--r--src/parser.c157
-rw-r--r--src/parser.h30
-rw-r--r--src/qreg-commands.c9
-rw-r--r--src/qreg-commands.h6
-rw-r--r--src/qreg.c40
-rw-r--r--src/qreg.h2
-rw-r--r--src/rb3str.c8
-rw-r--r--src/rb3str.h2
-rw-r--r--src/sciteco.h2
-rw-r--r--src/search.c8
-rw-r--r--src/spawn.c4
-rw-r--r--src/string-utils.c36
-rw-r--r--src/string-utils.h19
-rw-r--r--src/symbols.c2
-rw-r--r--src/undo.c2
-rw-r--r--src/undo.h4
-rw-r--r--tests/testsuite.at7
29 files changed, 325 insertions, 202 deletions
diff --git a/doc/sciteco.7.template b/doc/sciteco.7.template
index a5b7f4a..d0574d7 100644
--- a/doc/sciteco.7.template
+++ b/doc/sciteco.7.template
@@ -87,10 +87,6 @@ regular commands for command-line editing.
.
When the user presses a key or key-combination it is first translated
to an UTF-8 string.
-All immediate editing commands and regular \*(ST commands however operate on
-a language based solely on
-.B ASCII
-codes, which is a subset of Unicode.
The rules for translating keys are as follows:
.RS
.IP 1. 4
@@ -138,6 +134,18 @@ This feature is called function key macros and explained in the
next subsection.
.RE
.
+.LP
+All immediate editing commands and regular \*(ST commands however operate on
+a language based solely on
+.B ASCII
+codes, which is a subset of Unicode.
+\# This is because we cannot assume the presence of any particular non-ANSI
+\# symbol on a user's keyboard.
+Since the \*(ST parser is Unicode-aware, this does not exclude
+using Unicode glyphs wherever a single character is expected,
+i.e. \fB^^\fIx\fR and \fBU\fIq\fR work with arbitrary Unicode glyphs.
+All \*(ST macros must be in valid UTF-8.
+.
.SS Function Key Macros
.
.SCITECO_TOPIC "function key"
@@ -1082,8 +1090,8 @@ Consequently when querying the code at a character position
or inserting characters by code, the code may be an Unicode
codepoint instead of byte-sized integer.
.LP
-Currently, \*(ST supports UTF-8 and single-byte ANSI encodings,
-that can also be used for editing raw binary files.
+Currently, \*(ST supports buffers in UTF-8 and single-byte
+ANSI encodings, that can also be used for editing raw binary files.
\# You can configure other single-byte code pages with EE,
\# but there isn't yet any way to insert characters.
UTF-8 is the default codepage for new buffers and Q-Registers
diff --git a/src/cmdline.c b/src/cmdline.c
index 47ef86f..be7a5b1 100644
--- a/src/cmdline.c
+++ b/src/cmdline.c
@@ -194,7 +194,7 @@ teco_cmdline_rubin(GError **error)
}
gboolean
-teco_cmdline_keypress_c(gchar key, GError **error)
+teco_cmdline_keypress_wc(gunichar key, GError **error)
{
teco_machine_t *machine = &teco_cmdline.machine.parent;
g_autoptr(GError) tmp_error = NULL;
@@ -283,6 +283,30 @@ teco_cmdline_keypress_c(gchar key, GError **error)
return TRUE;
}
+/*
+ * Feed a UTF-8 encoded string into the command line, one key press
+ * per codepoint.
+ * Usually it will be called only with single keys (strings containing
+ * single codepoints), but especially teco_cmdline_fnmacro() can emulate
+ * many key presses at once.
+ *
+ * Returns FALSE with TECO_ERROR_CODEPOINT on the first sequence that
+ * g_utf8_get_char_validated() rejects, or as soon as
+ * teco_cmdline_keypress_wc() fails for a codepoint.
+ *
+ * FIXME: If one character causes an error, we should rub out the
+ * entire string.
+ */
+gboolean
+teco_cmdline_keypress(const gchar *str, gsize len, GError **error)
+{
+	/*
+	 * The index has the same type as len. A guint index would be
+	 * narrower than gsize on 64-bit platforms.
+	 */
+	gsize i = 0;
+
+	while (i < len) {
+		const gchar *cur = str + i;
+		gunichar chr = g_utf8_get_char_validated(cur, len - i);
+
+		/* negative when reinterpreted as signed: (gunichar)-1 or (gunichar)-2 */
+		if ((gint32)chr < 0) {
+			g_set_error_literal(error, TECO_ERROR, TECO_ERROR_CODEPOINT,
+			                    "Invalid UTF-8 sequence");
+			return FALSE;
+		}
+		if (!teco_cmdline_keypress_wc(chr, error))
+			return FALSE;
+
+		/* only advance after the sequence at cur has been validated */
+		i += g_utf8_next_char(cur) - cur;
+	}
+
+	return TRUE;
+}
+
gboolean
teco_cmdline_fnmacro(const gchar *name, GError **error)
{
@@ -361,7 +385,7 @@ teco_cmdline_cleanup(void)
*/
gboolean
-teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
{
switch (key) {
case '\n': /* insert EOL sequence */
@@ -431,23 +455,30 @@ teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gch
}
teco_interface_popup_clear();
- return teco_cmdline_insert(&key, sizeof(key), error);
+
+ gchar buf[6];
+ gsize len = g_unichar_to_utf8(key, buf);
+ return teco_cmdline_insert(buf, len, error);
}
gboolean
-teco_state_caseinsensitive_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_caseinsensitive_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
{
+ /*
+ * Auto case folding is for syntactic characters,
+ * so this could be done by working only with a-z and A-Z.
+ * However, it's also not speed critical.
+ */
if (teco_ed & TECO_ED_AUTOCASEFOLD)
- /* will not modify non-letter keys */
- key = g_ascii_islower(key) ? g_ascii_toupper(key)
- : g_ascii_tolower(key);
+ key = g_unichar_islower(key) ? g_unichar_toupper(key)
+ : g_unichar_tolower(key);
return teco_state_process_edit_cmd(ctx, parent_ctx, key, error);
}
gboolean
teco_state_stringbuilding_start_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx,
- gchar key, GError **error)
+ gunichar key, GError **error)
{
teco_state_t *current = ctx->parent.current;
@@ -597,7 +628,7 @@ teco_state_stringbuilding_start_process_edit_cmd(teco_machine_stringbuilding_t *
gboolean
teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx,
- gchar chr, GError **error)
+ gunichar chr, GError **error)
{
g_assert(ctx->machine_qregspec != NULL);
/* We downcast since teco_machine_qregspec_t is private in qreg.c */
@@ -606,7 +637,7 @@ teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuilding_t *c
}
gboolean
-teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
{
teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -614,7 +645,7 @@ teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_
}
gboolean
-teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
{
teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -650,7 +681,7 @@ teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *par
}
gboolean
-teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
{
teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -720,8 +751,8 @@ teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t
gboolean unambiguous = teco_file_auto_complete(ctx->expectstring.string.data, G_FILE_TEST_EXISTS, &new_chars);
teco_machine_stringbuilding_escape(stringbuilding_ctx, new_chars.data, new_chars.len, &new_chars_escaped);
if (unambiguous && ctx->expectstring.nesting == 1)
- teco_string_append_c(&new_chars_escaped,
- ctx->expectstring.machine.escape_char == '{' ? '}' : ctx->expectstring.machine.escape_char);
+ teco_string_append_wc(&new_chars_escaped,
+ ctx->expectstring.machine.escape_char == '{' ? '}' : ctx->expectstring.machine.escape_char);
return teco_cmdline_insert(new_chars_escaped.data, new_chars_escaped.len, error);
}
@@ -731,7 +762,7 @@ teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t
}
gboolean
-teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
{
teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -773,7 +804,7 @@ teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *
}
gboolean
-teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
{
g_assert(ctx->expectqreg != NULL);
/*
@@ -785,7 +816,7 @@ teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t
}
gboolean
-teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
{
switch (key) {
case '\t': { /* autocomplete Q-Register name */
@@ -820,7 +851,7 @@ teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_
}
gboolean
-teco_state_qregspec_string_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_qregspec_string_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
{
teco_machine_stringbuilding_t *stringbuilding_ctx = teco_machine_qregspec_get_stringbuilding(ctx);
teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -860,7 +891,7 @@ teco_state_qregspec_string_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_m
}
gboolean
-teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
{
teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -905,7 +936,7 @@ teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *pa
}
gboolean
-teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
{
teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -950,7 +981,7 @@ teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_mac
}
gboolean
-teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
{
teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -997,7 +1028,7 @@ teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *paren
}
gboolean
-teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error)
+teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error)
{
teco_machine_stringbuilding_t *stringbuilding_ctx = &ctx->expectstring.machine;
teco_state_t *stringbuilding_current = stringbuilding_ctx->parent.current;
@@ -1028,8 +1059,8 @@ teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *paren
gboolean unambiguous = teco_help_auto_complete(ctx->expectstring.string.data, &new_chars);
teco_machine_stringbuilding_escape(stringbuilding_ctx, new_chars.data, new_chars.len, &new_chars_escaped);
if (unambiguous && ctx->expectstring.nesting == 1)
- teco_string_append_c(&new_chars_escaped,
- ctx->expectstring.machine.escape_char == '{' ? '}' : ctx->expectstring.machine.escape_char);
+ teco_string_append_wc(&new_chars_escaped,
+ ctx->expectstring.machine.escape_char == '{' ? '}' : ctx->expectstring.machine.escape_char);
return new_chars_escaped.len ? teco_cmdline_insert(new_chars_escaped.data, new_chars_escaped.len, error) : TRUE;
}
diff --git a/src/cmdline.h b/src/cmdline.h
index 7f40b5f..78d101c 100644
--- a/src/cmdline.h
+++ b/src/cmdline.h
@@ -64,16 +64,8 @@ gboolean teco_cmdline_insert(const gchar *data, gsize len, GError **error);
gboolean teco_cmdline_rubin(GError **error);
-gboolean teco_cmdline_keypress_c(gchar key, GError **error);
-
-static inline gboolean
-teco_cmdline_keypress(const gchar *str, gsize len, GError **error)
-{
- for (guint i = 0; i < len; i++)
- if (!teco_cmdline_keypress_c(str[i], error))
- return FALSE;
- return TRUE;
-}
+gboolean teco_cmdline_keypress_wc(gunichar key, GError **error);
+gboolean teco_cmdline_keypress(const gchar *str, gsize len, GError **error);
gboolean teco_cmdline_fnmacro(const gchar *name, GError **error);
diff --git a/src/core-commands.c b/src/core-commands.c
index 3686624..ef763d5 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -45,7 +45,7 @@
#include "goto-commands.h"
#include "core-commands.h"
-static teco_state_t *teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error);
+static teco_state_t *teco_state_control_input(teco_machine_main_t *ctx, gunichar chr, GError **error);
/*
* NOTE: This needs some extra code in teco_state_start_input().
@@ -1049,7 +1049,7 @@ teco_state_start_get(teco_machine_main_t *ctx, GError **error)
}
static teco_state_t *
-teco_state_start_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_start_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
static teco_machine_main_transition_t transitions[] = {
/*
@@ -1388,7 +1388,7 @@ teco_state_fcommand_cond_else(teco_machine_main_t *ctx, GError **error)
}
static teco_state_t *
-teco_state_fcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_fcommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
static teco_machine_main_transition_t transitions[] = {
/*
@@ -1512,7 +1512,7 @@ teco_state_changedir_done(teco_machine_main_t *ctx, const teco_string_t *str, GE
TECO_DEFINE_STATE_EXPECTDIR(teco_state_changedir);
static teco_state_t *
-teco_state_condcommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_condcommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
teco_int_t value = 0;
gboolean result = TRUE;
@@ -1800,7 +1800,7 @@ teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error)
}
static teco_state_t *
-teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_control_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
static teco_machine_main_transition_t transitions[] = {
/*
@@ -1841,10 +1841,10 @@ teco_state_control_input(teco_machine_main_t *ctx, gchar chr, GError **error)
TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_control);
static teco_state_t *
-teco_state_ascii_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_ascii_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
if (ctx->mode == TECO_MODE_NORMAL)
- teco_expressions_push((guchar)chr);
+ teco_expressions_push(chr);
return &teco_state_start;
}
@@ -1877,7 +1877,7 @@ TECO_DEFINE_STATE(teco_state_ascii);
* only be seen when executing the following command.
*/
static teco_state_t *
-teco_state_escape_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_escape_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
/*$ ^[^[ ^[$ $$ terminate return
* [a1,a2,...]$$ -- Terminate command line or return from macro
@@ -2700,7 +2700,7 @@ teco_state_ecommand_exit(teco_machine_main_t *ctx, GError **error)
}
static teco_state_t *
-teco_state_ecommand_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_ecommand_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
static teco_machine_main_transition_t transitions[] = {
/*
@@ -2874,10 +2874,9 @@ teco_state_insert_indent_initial(teco_machine_main_t *ctx, GError **error)
len -= teco_interface_ssm(SCI_GETCOLUMN,
teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0), 0) % len;
- gchar spaces[len];
-
- memset(spaces, ' ', sizeof(spaces));
- teco_interface_ssm(SCI_ADDTEXT, sizeof(spaces), (sptr_t)spaces);
+ gchar space = ' ';
+ while (len-- > 0)
+ teco_interface_ssm(SCI_ADDTEXT, 1, (sptr_t)&space);
}
teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0);
teco_ring_dirtify();
diff --git a/src/core-commands.h b/src/core-commands.h
index 370c7ba..e30770d 100644
--- a/src/core-commands.h
+++ b/src/core-commands.h
@@ -43,7 +43,7 @@ gboolean teco_state_insert_process(teco_machine_main_t *ctx, const teco_string_t
gsize new_chars, GError **error);
/* in cmdline.c */
-gboolean teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error);
+gboolean teco_state_insert_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error);
/**
* @class TECO_DEFINE_STATE_INSERT
diff --git a/src/error.h b/src/error.h
index f60be1a..7543d02 100644
--- a/src/error.h
+++ b/src/error.h
@@ -61,10 +61,10 @@ typedef enum {
} teco_error_t;
+/**
+ * Set a syntax error for an unexpected character.
+ *
+ * @param error Location to store the error.
+ * @param chr The offending codepoint.
+ *   It is reported both as a glyph and as a U+XXXX number.
+ */
static inline void
-teco_error_syntax_set(GError **error, gchar chr)
+teco_error_syntax_set(GError **error, gunichar chr)
{
+	/*
+	 * Encode the codepoint as UTF-8 ourselves instead of using "%C".
+	 * "%C" converts via the current C locale, and that conversion can
+	 * make the entire formatting fail for codepoints the locale
+	 * cannot represent.
+	 */
+	gchar buf[6];
+	gint len = g_unichar_to_utf8(chr, buf);
+
g_set_error(error, TECO_ERROR, TECO_ERROR_SYNTAX,
- "Syntax error \"%c\" (%d)", chr, chr);
+	            "Syntax error \"%.*s\" (U+%04" G_GINT32_MODIFIER "X)", len, buf, chr);
}
static inline void
diff --git a/src/expressions.c b/src/expressions.c
index ef785e0..1ba8706 100644
--- a/src/expressions.c
+++ b/src/expressions.c
@@ -114,10 +114,11 @@ teco_expressions_pop_num_calc(teco_int_t *ret, teco_int_t imply, GError **error)
}
+/**
+ * Append a digit to the number on top of the expression stack.
+ *
+ * If teco_expressions_args() reports an argument, it is popped and
+ * used as the number so far; otherwise the number starts at 0.
+ * The result n*teco_radix plus or minus the digit's value (the sign
+ * follows n) is pushed back.
+ *
+ * @param digit The digit character. Its value is taken as digit - '0'.
+ */
void
-teco_expressions_add_digit(gchar digit)
+teco_expressions_add_digit(gunichar digit)
{
teco_int_t n = teco_expressions_args() > 0 ? teco_expressions_pop_num(0) : 0;
+	/*
+	 * digit is unsigned, so it must be converted to the signed
+	 * teco_int_t before the subtraction. Otherwise
+	 * -1*(digit - '0') would be evaluated as unsigned int and wrap
+	 * to a large positive value, which breaks negative numbers
+	 * whenever teco_int_t is wider than unsigned int.
+	 *
+	 * use g_unichar_digit_value()?
+	 */
-	teco_expressions_push(n*teco_radix + (n < 0 ? -1 : 1)*(digit - '0'));
+	teco_expressions_push(n*teco_radix + (n < 0 ? -1 : 1)*((teco_int_t)digit - '0'));
}
diff --git a/src/expressions.h b/src/expressions.h
index 24c5eff..68d8ddb 100644
--- a/src/expressions.h
+++ b/src/expressions.h
@@ -123,7 +123,7 @@ teco_int_t teco_expressions_peek_num(guint index);
teco_int_t teco_expressions_pop_num(guint index);
gboolean teco_expressions_pop_num_calc(teco_int_t *ret, teco_int_t imply, GError **error);
-void teco_expressions_add_digit(gchar digit);
+void teco_expressions_add_digit(gunichar digit);
void teco_expressions_push_op(teco_operator_t op);
gboolean teco_expressions_push_calc(teco_operator_t op, GError **error);
diff --git a/src/goto-commands.c b/src/goto-commands.c
index 2326f64..bf80c0b 100644
--- a/src/goto-commands.c
+++ b/src/goto-commands.c
@@ -53,7 +53,7 @@ teco_state_label_initial(teco_machine_main_t *ctx, GError **error)
* I'm unsure whether !-signs should be allowed within comments.
*/
static teco_state_t *
-teco_state_label_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_label_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
if (chr == '!') {
/*
@@ -85,7 +85,7 @@ teco_state_label_input(teco_machine_main_t *ctx, gchar chr, GError **error)
if (ctx->parent.must_undo)
undo__teco_string_truncate(&ctx->goto_label, ctx->goto_label.len);
- teco_string_append_c(&ctx->goto_label, chr);
+ teco_string_append_wc(&ctx->goto_label, chr);
return &teco_state_label;
}
@@ -138,7 +138,7 @@ teco_state_goto_done(teco_machine_main_t *ctx, const teco_string_t *str, GError
}
/* in cmdline.c */
-gboolean teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error);
+gboolean teco_state_goto_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error);
/*$ O
* Olabel$ -- Go to label
diff --git a/src/help.c b/src/help.c
index 8364496..9ee7239 100644
--- a/src/help.c
+++ b/src/help.c
@@ -314,7 +314,7 @@ teco_state_help_done(teco_machine_main_t *ctx, const teco_string_t *str, GError
}
/* in cmdline.c */
-gboolean teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error);
+gboolean teco_state_help_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error);
/*$ "?" help
* ?[topic]$ -- Get help for topic
diff --git a/src/interface-curses/interface.c b/src/interface-curses/interface.c
index 443a903..96254a9 100644
--- a/src/interface-curses/interface.c
+++ b/src/interface-curses/interface.c
@@ -1582,6 +1582,9 @@ teco_interface_blocking_getch(void)
void
teco_interface_event_loop_iter(void)
{
+ static gchar keybuf[4];
+ static gint keybuf_i = 0;
+
gint key = g_queue_is_empty(teco_interface.input_queue)
? teco_interface_blocking_getch()
: GPOINTER_TO_INT(g_queue_pop_head(teco_interface.input_queue));
@@ -1610,14 +1613,14 @@ teco_interface_event_loop_iter(void)
* backspace.
* In SciTECO backspace is normalized to ^H.
*/
- if (!teco_cmdline_keypress_c(TECO_CTL_KEY('H'),
- &teco_interface.event_loop_error))
+ if (!teco_cmdline_keypress_wc(TECO_CTL_KEY('H'),
+ &teco_interface.event_loop_error))
return;
break;
case KEY_ENTER:
case '\r':
case '\n':
- if (!teco_cmdline_keypress_c('\n', &teco_interface.event_loop_error))
+ if (!teco_cmdline_keypress_wc('\n', &teco_interface.event_loop_error))
return;
break;
@@ -1658,8 +1661,19 @@ teco_interface_event_loop_iter(void)
* Control keys and keys with printable representation
*/
default:
- if (key <= 0xFF &&
- !teco_cmdline_keypress_c(key, &teco_interface.event_loop_error))
+ if (key > 0xFF)
+ return;
+
+ /*
+ * NOTE: There's also wget_wch(), but it requires
+ * a widechar version of Curses.
+ */
+ keybuf[keybuf_i++] = key;
+ gunichar cp = g_utf8_get_char_validated(keybuf, keybuf_i);
+ if (keybuf_i >= sizeof(keybuf) || cp != (gunichar)-2)
+ keybuf_i = 0;
+ if ((gint32)cp < 0 ||
+ !teco_cmdline_keypress_wc(cp, &teco_interface.event_loop_error))
return;
}
diff --git a/src/interface-gtk/interface.c b/src/interface-gtk/interface.c
index 2ad8335..9c1ce6a 100644
--- a/src/interface-gtk/interface.c
+++ b/src/interface-gtk/interface.c
@@ -927,19 +927,19 @@ teco_interface_handle_key_press(GdkEventKey *event, GError **error)
switch (event->keyval) {
case GDK_KEY_Escape:
- if (!teco_cmdline_keypress_c('\e', error))
+ if (!teco_cmdline_keypress_wc('\e', error))
return FALSE;
break;
case GDK_KEY_BackSpace:
- if (!teco_cmdline_keypress_c(TECO_CTL_KEY('H'), error))
+ if (!teco_cmdline_keypress_wc(TECO_CTL_KEY('H'), error))
return FALSE;
break;
case GDK_KEY_Tab:
- if (!teco_cmdline_keypress_c('\t', error))
+ if (!teco_cmdline_keypress_wc('\t', error))
return FALSE;
break;
case GDK_KEY_Return:
- if (!teco_cmdline_keypress_c('\n', error))
+ if (!teco_cmdline_keypress_wc('\n', error))
return FALSE;
break;
@@ -994,7 +994,7 @@ teco_interface_handle_key_press(GdkEventKey *event, GError **error)
if ((event->state & (GDK_CONTROL_MASK | GDK_MOD1_MASK)) == GDK_CONTROL_MASK) {
gchar c = teco_interface_get_ansi_key(event);
if (c) {
- if (!teco_cmdline_keypress_c(TECO_CTL_KEY(g_ascii_toupper(c)), error))
+ if (!teco_cmdline_keypress_wc(TECO_CTL_KEY(g_ascii_toupper(c)), error))
return FALSE;
break;
}
diff --git a/src/parser.c b/src/parser.c
index ed21740..321803a 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -59,7 +59,7 @@ teco_loop_stack_cleanup(void)
}
gboolean
-teco_machine_input(teco_machine_t *ctx, gchar chr, GError **error)
+teco_machine_input(teco_machine_t *ctx, gunichar chr, GError **error)
{
teco_state_t *next = ctx->current->input_cb(ctx, chr, error);
if (!next)
@@ -86,10 +86,20 @@ teco_state_end_of_macro(teco_machine_t *ctx, GError **error)
}
/**
+ * Execute macro from current PC to stop position.
+ *
* Handles all expected exceptions and preparing them for stack frame insertion.
+ *
+ * @param ctx State machine.
+ * @param macro The macro to execute.
+ * It does not have to be complete.
+ * It must consist only of validated UTF-8 sequences, though.
+ * @param stop_pos Where to stop execution in bytes.
+ * @param error Location to store error.
+ * @return FALSE if an error occurred.
*/
gboolean
-teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gint stop_pos, GError **error)
+teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gsize stop_pos, GError **error)
{
while (ctx->macro_pc < stop_pos) {
#ifdef DEBUG
@@ -110,9 +120,13 @@ teco_machine_main_step(teco_machine_main_t *ctx, const gchar *macro, gint stop_p
if (!teco_memory_check(0, error))
goto error_attach;
- if (!teco_machine_input(&ctx->parent, macro[ctx->macro_pc], error))
+ /* UTF-8 sequences are already validated */
+ gunichar chr = g_utf8_get_char(macro+ctx->macro_pc);
+
+ if (!teco_machine_input(&ctx->parent, chr, error))
goto error_attach;
- ctx->macro_pc++;
+
+ ctx->macro_pc = g_utf8_next_char(macro+ctx->macro_pc) - macro;
}
/*
@@ -145,6 +159,20 @@ teco_execute_macro(const gchar *macro, gsize macro_len,
teco_qreg_table_t *qreg_table_locals, GError **error)
{
/*
+ * Validate UTF-8, but accept null bytes.
+ * NOTE: there is g_utf8_validate_len() in GLib 2.60
+ */
+ const gchar *p = macro;
+ while (!g_utf8_validate(p, macro_len - (p - macro), &p) && !*p)
+ p++;
+ if (p - macro < macro_len) {
+ g_set_error(error, TECO_ERROR, TECO_ERROR_CODEPOINT,
+ "Invalid UTF-8 byte sequence at %" G_GSIZE_FORMAT,
+ p - macro);
+ return FALSE;
+ }
+
+ /*
* This is not auto-cleaned up, so it can be initialized
* on demand.
*/
@@ -309,26 +337,26 @@ teco_machine_main_eval_colon(teco_machine_main_t *ctx)
teco_state_t *
teco_machine_main_transition_input(teco_machine_main_t *ctx,
teco_machine_main_transition_t *transitions,
- guint len, gchar chr, GError **error)
+ guint len, gunichar chr, GError **error)
{
- if (chr < 0 || chr >= len || !transitions[(guint)chr].next) {
+ if (chr >= len || !transitions[chr].next) {
teco_error_syntax_set(error, chr);
return NULL;
}
- if (ctx->mode == TECO_MODE_NORMAL && transitions[(guint)chr].transition_cb) {
+ if (ctx->mode == TECO_MODE_NORMAL && transitions[chr].transition_cb) {
/*
* NOTE: We could also just let transition_cb return a boolean...
*/
GError *tmp_error = NULL;
- transitions[(guint)chr].transition_cb(ctx, &tmp_error);
+ transitions[chr].transition_cb(ctx, &tmp_error);
if (tmp_error) {
g_propagate_error(error, tmp_error);
return NULL;
}
}
- return transitions[(guint)chr].next;
+ return transitions[chr].next;
}
void
@@ -342,11 +370,11 @@ teco_machine_main_clear(teco_machine_main_t *ctx)
* FIXME: All teco_state_stringbuilding_* states could be static?
*/
static teco_state_t *teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx,
- gchar chr, GError **error);
+ gunichar chr, GError **error);
TECO_DECLARE_STATE(teco_state_stringbuilding_ctl);
static teco_state_t *teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx,
- gchar chr, GError **error);
+ gunichar chr, GError **error);
TECO_DECLARE_STATE(teco_state_stringbuilding_escaped);
TECO_DECLARE_STATE(teco_state_stringbuilding_lower);
@@ -360,7 +388,7 @@ TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_quote);
TECO_DECLARE_STATE(teco_state_stringbuilding_ctle_n);
static teco_state_t *
-teco_state_stringbuilding_start_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_start_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
if (chr == '^')
return &teco_state_stringbuilding_ctl;
@@ -372,7 +400,7 @@ teco_state_stringbuilding_start_input(teco_machine_stringbuilding_t *ctx, gchar
/* in cmdline.c */
gboolean teco_state_stringbuilding_start_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx,
- gchar key, GError **error);
+ gunichar key, GError **error);
TECO_DEFINE_STATE(teco_state_stringbuilding_start,
.is_start = TRUE,
@@ -381,7 +409,7 @@ TECO_DEFINE_STATE(teco_state_stringbuilding_start,
);
static teco_state_t *
-teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
chr = teco_ascii_toupper(chr);
@@ -396,40 +424,50 @@ teco_state_stringbuilding_ctl_input(teco_machine_stringbuilding_t *ctx, gchar ch
chr = TECO_CTL_KEY(chr);
}
+ /*
+ * Source code is always in UTF-8, so it does not
+ * make sense to handle ctx->codepage != SC_CP_UTF8
+ * separately.
+ */
if (ctx->result)
- teco_string_append_c(ctx->result, chr);
+ teco_string_append_wc(ctx->result, chr);
return &teco_state_stringbuilding_start;
}
TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_stringbuilding_ctl);
static teco_state_t *
-teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_escaped_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
if (!ctx->result)
/* parse-only mode */
return &teco_state_stringbuilding_start;
- /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */
+ /*
+ * The subtle difference between UTF-8 and single-byte targets
+ * is that we don't try to casefold non-ANSI characters in single-byte mode.
+ */
switch (ctx->mode) {
case TECO_STRINGBUILDING_MODE_UPPER:
- chr = g_ascii_toupper(chr);
+ chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80
+ ? g_unichar_toupper(chr) : chr;
break;
case TECO_STRINGBUILDING_MODE_LOWER:
- chr = g_ascii_tolower(chr);
+ chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80
+ ? g_unichar_tolower(chr) : chr;
break;
default:
break;
}
- teco_string_append_c(ctx->result, chr);
+ teco_string_append_wc(ctx->result, chr);
return &teco_state_stringbuilding_start;
}
TECO_DEFINE_STATE(teco_state_stringbuilding_escaped);
static teco_state_t *
-teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
if (!ctx->result)
/* parse-only mode */
@@ -443,8 +481,9 @@ teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar
teco_undo_guint(ctx->mode);
ctx->mode = TECO_STRINGBUILDING_MODE_LOWER;
} else {
- /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */
- teco_string_append_c(ctx->result, g_ascii_tolower(chr));
+ chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80
+ ? g_unichar_tolower(chr) : chr;
+ teco_string_append_wc(ctx->result, chr);
}
return &teco_state_stringbuilding_start;
@@ -453,7 +492,7 @@ teco_state_stringbuilding_lower_input(teco_machine_stringbuilding_t *ctx, gchar
TECO_DEFINE_STATE(teco_state_stringbuilding_lower);
static teco_state_t *
-teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
if (!ctx->result)
/* parse-only mode */
@@ -467,8 +506,9 @@ teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar
teco_undo_guint(ctx->mode);
ctx->mode = TECO_STRINGBUILDING_MODE_UPPER;
} else {
- /* FIXME: Consult ctx->codepage once we have an Unicode-conforming parser */
- teco_string_append_c(ctx->result, g_ascii_toupper(chr));
+ chr = ctx->codepage == SC_CP_UTF8 || chr < 0x80
+ ? g_unichar_toupper(chr) : chr;
+ teco_string_append_wc(ctx->result, chr);
}
return &teco_state_stringbuilding_start;
@@ -477,7 +517,7 @@ teco_state_stringbuilding_upper_input(teco_machine_stringbuilding_t *ctx, gchar
TECO_DEFINE_STATE(teco_state_stringbuilding_upper);
static teco_state_t *
-teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
teco_state_t *next;
@@ -489,8 +529,9 @@ teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gchar c
case 'N': next = &teco_state_stringbuilding_ctle_n; break;
default:
if (ctx->result) {
- gchar buf[] = {TECO_CTL_KEY('E'), chr};
- teco_string_append(ctx->result, buf, sizeof(buf));
+ gchar buf[1+6] = {TECO_CTL_KEY('E')};
+ gsize len = g_unichar_to_utf8(chr, buf+1);
+ teco_string_append(ctx->result, buf, 1+len);
}
return &teco_state_stringbuilding_start;
}
@@ -508,7 +549,7 @@ TECO_DEFINE_STATE_CASEINSENSITIVE(teco_state_stringbuilding_ctle);
/* in cmdline.c */
gboolean teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuilding_t *ctx, teco_machine_t *parent_ctx,
- gchar chr, GError **error);
+ gunichar chr, GError **error);
/**
* @interface TECO_DEFINE_STATE_STRINGBUILDING_QREG
@@ -523,7 +564,7 @@ gboolean teco_state_stringbuilding_qreg_process_edit_cmd(teco_machine_stringbuil
)
static teco_state_t *
-teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
teco_qreg_t *qreg;
@@ -558,7 +599,7 @@ teco_state_stringbuilding_ctle_num_input(teco_machine_stringbuilding_t *ctx, gch
TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_num);
static teco_state_t *
-teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
teco_qreg_t *qreg;
@@ -583,10 +624,7 @@ teco_state_stringbuilding_ctle_u_input(teco_machine_stringbuilding_t *ctx, gchar
if (ctx->codepage == SC_CP_UTF8) {
if (value < 0 || !g_unichar_validate(value))
goto error_codepoint;
- /* 4 bytes should be enough, but we better follow the documentation */
- gchar buf[6];
- gsize len = g_unichar_to_utf8(value, buf);
- teco_string_append(ctx->result, buf, len);
+ teco_string_append_wc(ctx->result, value);
} else {
if (value < 0 || value > 0xFF)
goto error_codepoint;
@@ -606,7 +644,7 @@ error_codepoint: {
TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_u);
static teco_state_t *
-teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
teco_qreg_t *qreg;
@@ -637,7 +675,7 @@ teco_state_stringbuilding_ctle_q_input(teco_machine_stringbuilding_t *ctx, gchar
TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_q);
static teco_state_t *
-teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
teco_qreg_t *qreg;
teco_qreg_table_t *table;
@@ -680,7 +718,7 @@ teco_state_stringbuilding_ctle_quote_input(teco_machine_stringbuilding_t *ctx, g
TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_quote);
static teco_state_t *
-teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gchar chr, GError **error)
+teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gunichar chr, GError **error)
{
teco_qreg_t *qreg;
teco_qreg_table_t *table;
@@ -717,7 +755,7 @@ teco_state_stringbuilding_ctle_n_input(teco_machine_stringbuilding_t *ctx, gchar
TECO_DEFINE_STATE_STRINGBUILDING_QREG(teco_state_stringbuilding_ctle_n);
void
-teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escape_char,
+teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gunichar escape_char,
teco_qreg_table_t *locals, gboolean must_undo)
{
memset(ctx, 0, sizeof(*ctx));
@@ -738,6 +776,10 @@ teco_machine_stringbuilding_reset(teco_machine_stringbuilding_t *ctx)
ctx->mode = TECO_STRINGBUILDING_MODE_NORMAL;
}
+/*
+ * If we case folded only ANSI characters as in teco_ascii_toupper(),
+ * this could be simplified.
+ */
void
teco_machine_stringbuilding_escape(teco_machine_stringbuilding_t *ctx, const gchar *str, gsize len,
teco_string_t *target)
@@ -745,12 +787,18 @@ teco_machine_stringbuilding_escape(teco_machine_stringbuilding_t *ctx, const gch
target->data = g_malloc(len*2+1);
target->len = 0;
- for (guint i = 0; i < len; i++) {
- if (teco_ascii_toupper(str[i]) == ctx->escape_char ||
- (ctx->escape_char == '[' && str[i] == ']') ||
- (ctx->escape_char == '{' && str[i] == '}'))
+ for (guint i = 0; i < len; ) {
+ gunichar chr = g_utf8_get_char(str+i);
+
+ if (g_unichar_toupper(chr) == ctx->escape_char ||
+ (ctx->escape_char == '[' && chr == ']') ||
+ (ctx->escape_char == '{' && chr == '}'))
target->data[target->len++] = TECO_CTL_KEY('Q');
- target->data[target->len++] = str[i];
+
+ gsize lenc = g_utf8_next_char(str+i) - (str+i);
+ memcpy(target->data+target->len, str+i, lenc);
+ target->len += lenc;
+ i += lenc;
}
target->data[target->len] = '\0';
@@ -772,7 +820,7 @@ teco_state_expectstring_initial(teco_machine_main_t *ctx, GError **error)
}
teco_state_t *
-teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_expectstring_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
teco_state_t *current = ctx->parent.current;
@@ -789,13 +837,18 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro
/*
* FIXME: Exclude setting at least whitespace characters as the
* new string escape character to avoid accidental errors?
+ *
+ * FIXME: Should we perhaps restrict case folding escape characters
+ * to the ANSI range (teco_ascii_toupper())?
+ * This would be faster than case folding each and every character
+ * of a string argument to check against the escape char.
*/
switch (ctx->expectstring.machine.escape_char) {
case '\e':
case '{':
if (ctx->parent.must_undo)
- teco_undo_gchar(ctx->expectstring.machine.escape_char);
- ctx->expectstring.machine.escape_char = teco_ascii_toupper(chr);
+ teco_undo_gunichar(ctx->expectstring.machine.escape_char);
+ ctx->expectstring.machine.escape_char = g_unichar_toupper(chr);
return current;
}
}
@@ -819,7 +872,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro
ctx->expectstring.nesting--;
break;
}
- } else if (teco_ascii_toupper(chr) == ctx->expectstring.machine.escape_char) {
+ } else if (g_unichar_toupper(chr) == ctx->expectstring.machine.escape_char) {
if (ctx->parent.must_undo)
teco_undo_gint(ctx->expectstring.nesting);
ctx->expectstring.nesting--;
@@ -849,7 +902,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro
if (current->expectstring.last) {
if (ctx->parent.must_undo)
- teco_undo_gchar(ctx->expectstring.machine.escape_char);
+ teco_undo_gunichar(ctx->expectstring.machine.escape_char);
ctx->expectstring.machine.escape_char = '\e';
}
ctx->expectstring.nesting = 1;
@@ -880,7 +933,7 @@ teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **erro
if (!teco_machine_stringbuilding_input(&ctx->expectstring.machine, chr, str, error))
return NULL;
} else if (ctx->mode == TECO_MODE_NORMAL) {
- teco_string_append_c(&ctx->expectstring.string, chr);
+ teco_string_append_wc(&ctx->expectstring.string, chr);
}
/*
@@ -924,7 +977,7 @@ teco_state_expectfile_process(teco_machine_main_t *ctx, const teco_string_t *str
g_assert(str->data != NULL);
/*
- * Null-chars must not ocur in filename/path strings and at some point
+ * Null-chars must not occur in filename/path strings and at some point
* teco_string_t has to be converted to a null-terminated C string
* as all the glib filename functions rely on null-terminated strings.
* Doing it here ensures that teco_file_expand_path() can be safely called
diff --git a/src/parser.h b/src/parser.h
index 09ec483..ae2cb9b 100644
--- a/src/parser.h
+++ b/src/parser.h
@@ -101,11 +101,11 @@ typedef const struct {
} teco_state_expectqreg_t;
typedef gboolean (*teco_state_initial_cb_t)(teco_machine_t *ctx, GError **error);
-typedef teco_state_t *(*teco_state_input_cb_t)(teco_machine_t *ctx, gchar chr, GError **error);
+typedef teco_state_t *(*teco_state_input_cb_t)(teco_machine_t *ctx, gunichar chr, GError **error);
typedef gboolean (*teco_state_refresh_cb_t)(teco_machine_t *ctx, GError **error);
typedef gboolean (*teco_state_end_of_macro_cb_t)(teco_machine_t *ctx, GError **error);
typedef gboolean (*teco_state_process_edit_cmd_cb_t)(teco_machine_t *ctx, teco_machine_t *parent_ctx,
- gchar key, GError **error);
+ gunichar key, GError **error);
typedef enum {
TECO_FNMACRO_MASK_START = (1 << 0),
@@ -225,7 +225,7 @@ struct teco_state_t {
gboolean teco_state_end_of_macro(teco_machine_t *ctx, GError **error);
/* in cmdline.c */
-gboolean teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error);
+gboolean teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error);
/**
* @interface TECO_DEFINE_STATE
@@ -254,7 +254,7 @@ gboolean teco_state_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent
extern teco_state_t NAME
/* in cmdline.c */
-gboolean teco_state_caseinsensitive_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gchar chr, GError **error);
+gboolean teco_state_caseinsensitive_process_edit_cmd(teco_machine_t *ctx, teco_machine_t *parent_ctx, gunichar chr, GError **error);
/**
* @interface TECO_DEFINE_STATE_CASEINSENSITIVE
@@ -308,7 +308,7 @@ teco_machine_reset(teco_machine_t *ctx, teco_state_t *initial)
teco_undo_ptr(ctx->current) = initial;
}
-gboolean teco_machine_input(teco_machine_t *ctx, gchar chr, GError **error);
+gboolean teco_machine_input(teco_machine_t *ctx, gunichar chr, GError **error);
typedef enum {
TECO_STRINGBUILDING_MODE_NORMAL = 0,
@@ -336,7 +336,7 @@ typedef struct teco_machine_stringbuilding_t {
* If this is `[` or `{`, it is assumed that `]` and `}` must
* be escaped as well by teco_machine_stringbuilding_escape().
*/
- gchar escape_char;
+ gunichar escape_char;
/**
* Q-Register table for local registers.
@@ -366,7 +366,7 @@ typedef struct teco_machine_stringbuilding_t {
guint codepage;
} teco_machine_stringbuilding_t;
-void teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escape_char,
+void teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gunichar escape_char,
teco_qreg_table_t *locals, gboolean must_undo);
void teco_machine_stringbuilding_reset(teco_machine_stringbuilding_t *ctx);
@@ -381,7 +381,7 @@ void teco_machine_stringbuilding_reset(teco_machine_stringbuilding_t *ctx);
* @return FALSE in case of error.
*/
static inline gboolean
-teco_machine_stringbuilding_input(teco_machine_stringbuilding_t *ctx, gchar chr,
+teco_machine_stringbuilding_input(teco_machine_stringbuilding_t *ctx, gunichar chr,
teco_string_t *result, GError **error)
{
ctx->result = result;
@@ -497,7 +497,7 @@ void teco_machine_main_init(teco_machine_main_t *ctx,
gboolean teco_machine_main_eval_colon(teco_machine_main_t *ctx);
gboolean teco_machine_main_step(teco_machine_main_t *ctx,
- const gchar *macro, gint stop_pos, GError **error);
+ const gchar *macro, gsize stop_pos, GError **error);
gboolean teco_execute_macro(const gchar *macro, gsize macro_len,
teco_qreg_table_t *qreg_table_locals, GError **error);
@@ -516,18 +516,18 @@ typedef const struct {
*/
teco_state_t *teco_machine_main_transition_input(teco_machine_main_t *ctx,
teco_machine_main_transition_t *transitions,
- guint len, gchar chr, GError **error);
+ guint len, gunichar chr, GError **error);
void teco_machine_main_clear(teco_machine_main_t *ctx);
G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(teco_machine_main_t, teco_machine_main_clear);
gboolean teco_state_expectstring_initial(teco_machine_main_t *ctx, GError **error);
-teco_state_t *teco_state_expectstring_input(teco_machine_main_t *ctx, gchar chr, GError **error);
+teco_state_t *teco_state_expectstring_input(teco_machine_main_t *ctx, gunichar chr, GError **error);
gboolean teco_state_expectstring_refresh(teco_machine_main_t *ctx, GError **error);
/* in cmdline.c */
-gboolean teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error);
+gboolean teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error);
/**
* @interface TECO_DEFINE_STATE_EXPECTSTRING
@@ -543,7 +543,7 @@ gboolean teco_state_expectstring_process_edit_cmd(teco_machine_main_t *ctx, teco
*/
#define TECO_DEFINE_STATE_EXPECTSTRING(NAME, ...) \
static teco_state_t * \
- NAME##_input(teco_machine_main_t *ctx, gchar chr, GError **error) \
+ NAME##_input(teco_machine_main_t *ctx, gunichar chr, GError **error) \
{ \
return teco_state_expectstring_input(ctx, chr, error); \
} \
@@ -564,7 +564,7 @@ gboolean teco_state_expectfile_process(teco_machine_main_t *ctx, const teco_stri
gsize new_chars, GError **error);
/* in cmdline.c */
-gboolean teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error);
+gboolean teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error);
/**
* @interface TECO_DEFINE_STATE_EXPECTFILE
@@ -580,7 +580,7 @@ gboolean teco_state_expectfile_process_edit_cmd(teco_machine_main_t *ctx, teco_m
)
/* in cmdline.c */
-gboolean teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error);
+gboolean teco_state_expectdir_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error);
/**
* @interface TECO_DEFINE_STATE_EXPECTDIR
diff --git a/src/qreg-commands.c b/src/qreg-commands.c
index f248ced..8d28e7d 100644
--- a/src/qreg-commands.c
+++ b/src/qreg-commands.c
@@ -50,7 +50,7 @@ teco_state_expectqreg_initial(teco_machine_main_t *ctx, GError **error)
}
teco_state_t *
-teco_state_expectqreg_input(teco_machine_main_t *ctx, gchar chr, GError **error)
+teco_state_expectqreg_input(teco_machine_main_t *ctx, gunichar chr, GError **error)
{
teco_state_t *current = ctx->parent.current;
@@ -680,6 +680,10 @@ teco_state_macro_got_register(teco_machine_main_t *ctx, teco_qreg_t *qreg,
* Note that the string of <q> will be copied upon macro execution,
* so subsequent changes to Q-Register <q> from inside the macro do
* not modify the executed code.
+ *
+ * While \fBM\fP does not check the register's configured encoding
+ * (as reported by \fBEE\fP), its contents must be valid UTF-8
+ * and are checked for validity.
*/
TECO_DEFINE_STATE_EXPECTQREG(teco_state_macro);
@@ -714,6 +718,9 @@ teco_state_macrofile_done(teco_machine_main_t *ctx, const teco_string_t *str, GE
* It is otherwise similar to the \(lqM\(rq command.
*
* If <file> could not be read, the command yields an error.
+ *
+ * Like all \*(ST code, the contents of <file> must be valid UTF-8,
+ * even when operating in the \(lqdefault ANSI\(rq mode as configured by \fBED\fP.
*/
TECO_DEFINE_STATE_EXPECTFILE(teco_state_macrofile);
diff --git a/src/qreg-commands.h b/src/qreg-commands.h
index b190e9f..27a6a5c 100644
--- a/src/qreg-commands.h
+++ b/src/qreg-commands.h
@@ -33,10 +33,10 @@ teco_state_expectqreg_reset(teco_machine_main_t *ctx)
gboolean teco_state_expectqreg_initial(teco_machine_main_t *ctx, GError **error);
-teco_state_t *teco_state_expectqreg_input(teco_machine_main_t *ctx, gchar chr, GError **error);
+teco_state_t *teco_state_expectqreg_input(teco_machine_main_t *ctx, gunichar chr, GError **error);
/* in cmdline.c */
-gboolean teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error);
+gboolean teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error);
/**
* @interface TECO_DEFINE_STATE_EXPECTQREG
@@ -47,7 +47,7 @@ gboolean teco_state_expectqreg_process_edit_cmd(teco_machine_main_t *ctx, teco_m
*/
#define TECO_DEFINE_STATE_EXPECTQREG(NAME, ...) \
static teco_state_t * \
- NAME##_input(teco_machine_main_t *ctx, gchar chr, GError **error) \
+ NAME##_input(teco_machine_main_t *ctx, gunichar chr, GError **error) \
{ \
return teco_state_expectqreg_input(ctx, chr, error); \
} \
diff --git a/src/qreg.c b/src/qreg.c
index fb559af..cac2d12 100644
--- a/src/qreg.c
+++ b/src/qreg.c
@@ -84,10 +84,9 @@ teco_qreg_execute(teco_qreg_t *qreg, teco_qreg_table_t *qreg_table_locals, GErro
g_auto(teco_string_t) macro = {NULL, 0};
/*
- * FIXME: Once we have an Unicode-aware parser,
- * we should probably check the encoding of the register.
- * On the other hand, we will have to validate the
- * UTF-8 codepoints before execution anyway.
+ * SciTECO macros must be in UTF-8, but we don't check the encoding,
+ * so as not to complicate TECO_ED_DEFAULT_ANSI mode.
+ * The UTF-8 byte sequences are checked anyway.
*/
if (!qreg->vtable->get_string(qreg, &macro.data, &macro.len, NULL, error) ||
!teco_execute_macro(macro.data, macro.len, qreg_table_locals, error)) {
@@ -1220,7 +1219,7 @@ TECO_DECLARE_STATE(teco_state_qregspec_secondchar);
TECO_DECLARE_STATE(teco_state_qregspec_string);
static teco_state_t *teco_state_qregspec_start_global_input(teco_machine_qregspec_t *ctx,
- gchar chr, GError **error);
+ gunichar chr, GError **error);
static teco_state_t *
teco_state_qregspec_done(teco_machine_qregspec_t *ctx, GError **error)
@@ -1255,7 +1254,7 @@ teco_state_qregspec_done(teco_machine_qregspec_t *ctx, GError **error)
}
static teco_state_t *
-teco_state_qregspec_start_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error)
+teco_state_qregspec_start_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error)
{
/*
* FIXME: We're using teco_state_qregspec_start as a success condition,
@@ -1272,7 +1271,7 @@ teco_state_qregspec_start_input(teco_machine_qregspec_t *ctx, gchar chr, GError
}
/* in cmdline.c */
-gboolean teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error);
+gboolean teco_state_qregspec_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error);
TECO_DEFINE_STATE(teco_state_qregspec_start,
.is_start = TRUE,
@@ -1280,7 +1279,7 @@ TECO_DEFINE_STATE(teco_state_qregspec_start,
);
static teco_state_t *
-teco_state_qregspec_start_global_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error)
+teco_state_qregspec_start_global_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error)
{
/*
* FIXME: Disallow space characters?
@@ -1299,8 +1298,7 @@ teco_state_qregspec_start_global_input(teco_machine_qregspec_t *ctx, gchar chr,
if (!ctx->parse_only) {
if (ctx->parent.must_undo)
undo__teco_string_truncate(&ctx->name, ctx->name.len);
- /* FIXME: g_unicode_toupper() once we have an Unicode-conforming parser */
- teco_string_append_c(&ctx->name, g_ascii_toupper(chr));
+ teco_string_append_wc(&ctx->name, g_unichar_toupper(chr));
}
return teco_state_qregspec_done(ctx, error);
}
@@ -1316,7 +1314,7 @@ TECO_DEFINE_STATE(teco_state_qregspec_start_global,
);
static teco_state_t *
-teco_state_qregspec_firstchar_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error)
+teco_state_qregspec_firstchar_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error)
{
/*
* FIXME: Disallow space characters?
@@ -1324,8 +1322,7 @@ teco_state_qregspec_firstchar_input(teco_machine_qregspec_t *ctx, gchar chr, GEr
if (!ctx->parse_only) {
if (ctx->parent.must_undo)
undo__teco_string_truncate(&ctx->name, ctx->name.len);
- /* FIXME: g_unicode_toupper() once we have an Unicode-conforming parser */
- teco_string_append_c(&ctx->name, g_ascii_toupper(chr));
+ teco_string_append_wc(&ctx->name, g_unichar_toupper(chr));
}
return &teco_state_qregspec_secondchar;
}
@@ -1335,7 +1332,7 @@ TECO_DEFINE_STATE(teco_state_qregspec_firstchar,
);
static teco_state_t *
-teco_state_qregspec_secondchar_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error)
+teco_state_qregspec_secondchar_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error)
{
/*
* FIXME: Disallow space characters?
@@ -1343,8 +1340,7 @@ teco_state_qregspec_secondchar_input(teco_machine_qregspec_t *ctx, gchar chr, GE
if (!ctx->parse_only) {
if (ctx->parent.must_undo)
undo__teco_string_truncate(&ctx->name, ctx->name.len);
- /* FIXME: g_unicode_toupper() once we have an Unicode-conforming parser */
- teco_string_append_c(&ctx->name, g_ascii_toupper(chr));
+ teco_string_append_wc(&ctx->name, g_unichar_toupper(chr));
}
return teco_state_qregspec_done(ctx, error);
}
@@ -1354,7 +1350,7 @@ TECO_DEFINE_STATE(teco_state_qregspec_secondchar,
);
static teco_state_t *
-teco_state_qregspec_string_input(teco_machine_qregspec_t *ctx, gchar chr, GError **error)
+teco_state_qregspec_string_input(teco_machine_qregspec_t *ctx, gunichar chr, GError **error)
{
/*
* Makes sure that braces within string building constructs do not have to be
@@ -1395,7 +1391,7 @@ teco_state_qregspec_string_input(teco_machine_qregspec_t *ctx, gchar chr, GError
/* in cmdline.c */
gboolean teco_state_qregspec_string_process_edit_cmd(teco_machine_qregspec_t *ctx, teco_machine_t *parent_ctx,
- gchar key, GError **error);
+ gunichar key, GError **error);
TECO_DEFINE_STATE(teco_state_qregspec_string,
.process_edit_cmd_cb = (teco_state_process_edit_cmd_cb_t)teco_state_qregspec_string_process_edit_cmd
@@ -1456,7 +1452,7 @@ teco_machine_qregspec_get_stringbuilding(teco_machine_qregspec_t *ctx)
* @memberof teco_machine_qregspec_t
*/
teco_machine_qregspec_status_t
-teco_machine_qregspec_input(teco_machine_qregspec_t *ctx, gchar chr,
+teco_machine_qregspec_input(teco_machine_qregspec_t *ctx, gunichar chr,
teco_qreg_t **result, teco_qreg_table_t **result_table, GError **error)
{
ctx->parse_only = result == NULL;
@@ -1484,7 +1480,7 @@ teco_machine_qregspec_get_results(teco_machine_qregspec_t *ctx,
gboolean
teco_machine_qregspec_auto_complete(teco_machine_qregspec_t *ctx, teco_string_t *insert)
{
- gsize restrict_len = 0;
+ guint restrict_len = 0;
/*
* NOTE: We could have separate process_edit_cmd_cb() for
@@ -1499,6 +1495,10 @@ teco_machine_qregspec_auto_complete(teco_machine_qregspec_t *ctx, teco_string_t
/* two-letter Q-Reg */
restrict_len = 2;
+ /*
+ * FIXME: This is not quite right as it will propose even
+ * lower case single or two-letter Q-Register names.
+ */
return teco_rb3str_auto_complete(&ctx->result_table->tree, !restrict_len,
ctx->name.data, ctx->name.len, restrict_len, insert) &&
ctx->nesting == 1;
diff --git a/src/qreg.h b/src/qreg.h
index 8c8764e..df4bdb4 100644
--- a/src/qreg.h
+++ b/src/qreg.h
@@ -227,7 +227,7 @@ void teco_machine_qregspec_reset(teco_machine_qregspec_t *ctx);
*/
struct teco_machine_stringbuilding_t *teco_machine_qregspec_get_stringbuilding(teco_machine_qregspec_t *ctx);
-teco_machine_qregspec_status_t teco_machine_qregspec_input(teco_machine_qregspec_t *ctx, gchar chr,
+teco_machine_qregspec_status_t teco_machine_qregspec_input(teco_machine_qregspec_t *ctx, gunichar chr,
teco_qreg_t **result,
teco_qreg_table_t **result_table, GError **error);
diff --git a/src/rb3str.c b/src/rb3str.c
index 72cf444..d51ac5d 100644
--- a/src/rb3str.c
+++ b/src/rb3str.c
@@ -95,7 +95,7 @@ teco_rb3str_nfind(teco_rb3str_tree_t *tree, gboolean case_sensitive, const gchar
* @param case_sensitive Whether to match case-sensitive.
* @param str String to complete (not necessarily null-terminated).
* @param str_len Length of characters in `str`.
- * @param restrict_len Limit completions to this size.
+ * @param restrict_len Limit completions to this size (in characters).
* @param insert String to set with characters that can be autocompleted.
* @return TRUE if the completion was unambiguous, else FALSE.
*
@@ -103,7 +103,7 @@ teco_rb3str_nfind(teco_rb3str_tree_t *tree, gboolean case_sensitive, const gchar
*/
gboolean
teco_rb3str_auto_complete(teco_rb3str_tree_t *tree, gboolean case_sensitive,
- const gchar *str, gsize str_len, gsize restrict_len, teco_string_t *insert)
+ const gchar *str, gsize str_len, guint restrict_len, teco_string_t *insert)
{
memset(insert, 0, sizeof(*insert));
@@ -115,7 +115,7 @@ teco_rb3str_auto_complete(teco_rb3str_tree_t *tree, gboolean case_sensitive,
for (teco_rb3str_head_t *cur = teco_rb3str_nfind(tree, case_sensitive, str, str_len);
cur && cur->key.len >= str_len && diff(&cur->key, str, str_len) == str_len;
cur = teco_rb3str_get_next(cur)) {
- if (restrict_len && cur->key.len != restrict_len)
+ if (restrict_len && g_utf8_strlen(cur->key.data, cur->key.len) != restrict_len)
continue;
if (G_UNLIKELY(!first)) {
@@ -136,7 +136,7 @@ teco_rb3str_auto_complete(teco_rb3str_tree_t *tree, gboolean case_sensitive,
for (teco_rb3str_head_t *cur = first;
cur && cur->key.len >= str_len && diff(&cur->key, str, str_len) == str_len;
cur = teco_rb3str_get_next(cur)) {
- if (restrict_len && cur->key.len != restrict_len)
+ if (restrict_len && g_utf8_strlen(cur->key.data, cur->key.len) != restrict_len)
continue;
teco_interface_popup_add(TECO_POPUP_PLAIN,
diff --git a/src/rb3str.h b/src/rb3str.h
index 74b3a37..adf5f89 100644
--- a/src/rb3str.h
+++ b/src/rb3str.h
@@ -65,5 +65,5 @@ teco_rb3str_head_t *teco_rb3str_nfind(teco_rb3str_tree_t *tree, gboolean case_se
const gchar *str, gsize len);
gboolean teco_rb3str_auto_complete(teco_rb3str_tree_t *tree, gboolean case_sensitive,
- const gchar *str, gsize str_len, gsize restrict_len,
+ const gchar *str, gsize str_len, guint restrict_len,
teco_string_t *insert);
diff --git a/src/sciteco.h b/src/sciteco.h
index 09dea3b..02eed97 100644
--- a/src/sciteco.h
+++ b/src/sciteco.h
@@ -71,7 +71,7 @@ teco_is_failure(teco_bool_t x)
#endif
/** TRUE if C is a control character */
-#define TECO_IS_CTL(C) ((guchar)(C) < ' ')
+#define TECO_IS_CTL(C) ((gunichar)(C) < ' ')
/** ASCII character to echo control character C */
#define TECO_CTL_ECHO(C) ((C) | 0x40)
/**
diff --git a/src/search.c b/src/search.c
index e146def..43a2936 100644
--- a/src/search.c
+++ b/src/search.c
@@ -308,14 +308,6 @@ teco_pattern2regexp(teco_string_t *pattern, gboolean single_expr, GError **error
do {
/*
- * FIXME: Currently we are fed single bytes, so there
- * could be an incomplete UTF-8 sequence at the end of the pattern.
- * This should not be necessary once we have an Unicode-aware parser.
- */
- if (pattern->len > 0 && (gint32)g_utf8_get_char_validated(pattern->data, -1) < 0)
- break;
-
- /*
* First check whether it is a class.
* This will not treat individual characters
* as classes, so we do not convert them to regexp
diff --git a/src/spawn.c b/src/spawn.c
index 044b8de..445acc5 100644
--- a/src/spawn.c
+++ b/src/spawn.c
@@ -417,7 +417,7 @@ cleanup:
}
/* in cmdline.c */
-gboolean teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error);
+gboolean teco_state_execute_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error);
/*$ EC pipe filter
* ECcommand$ -- Execute operating system command and filter buffer contents
@@ -642,7 +642,7 @@ teco_spawn_stdin_watch_cb(GIOChannel *chan, GIOCondition condition, gpointer dat
gssize bytes_written = teco_eol_writer_convert(&teco_spawn_ctx.stdin_writer, buffer,
convert_len, &teco_spawn_ctx.error);
if (bytes_written < 0) {
- /* GError ocurred */
+ /* GError occurred */
g_main_loop_quit(teco_spawn_ctx.mainloop);
return G_SOURCE_REMOVE;
}
diff --git a/src/string-utils.c b/src/string-utils.c
index ac5835b..d9b12e0 100644
--- a/src/string-utils.c
+++ b/src/string-utils.c
@@ -78,7 +78,17 @@ teco_string_get_coord(const gchar *str, guint pos, guint *line, guint *column)
}
}
-/** @memberof teco_string_t */
+/**
+ * Get the length of the prefix common to two strings.
+ * Works with UTF-8 and single-byte encodings.
+ *
+ * @param a Left string.
+ * @param b Right string.
+ * @param b_len Length of right string.
+ * @return Length of the common prefix in bytes.
+ *
+ * @memberof teco_string_t
+ */
gsize
teco_string_diff(const teco_string_t *a, const gchar *b, gsize b_len)
{
@@ -92,14 +102,16 @@ teco_string_diff(const teco_string_t *a, const gchar *b, gsize b_len)
}
/**
- * Get the length of the prefix common to two strings
+ * Get the length of the prefix common to two UTF-8 strings
* without considering case.
*
- * @fixme This is currently only used for symbols and one/two letter
- * Q-Register names, which cannot be UTF-8.
- * If we rewrote this to perform Unicode case folding, we would
- * also have to check for character validity.
- * Once our parser is Unicode-aware, this is not necessary.
+ * The UTF-8 strings must be validated, which should be the case
+ * for help labels and short Q-Register names.
+ *
+ * @param a Left UTF-8 string.
+ * @param b Right UTF-8 string.
+ * @param b_len Length of right UTF-8 string.
+ * @return Length of the common prefix in bytes.
*
* @memberof teco_string_t
*/
@@ -108,9 +120,13 @@ teco_string_casediff(const teco_string_t *a, const gchar *b, gsize b_len)
{
gsize len = 0;
- while (len < a->len && len < b_len &&
- g_ascii_tolower(a->data[len]) == g_ascii_tolower(b[len]))
- len++;
+ while (len < a->len && len < b_len) {
+ gunichar a_chr = g_utf8_get_char(a->data+len);
+ gunichar b_chr = g_utf8_get_char(b+len);
+ if (g_unichar_tolower(a_chr) != g_unichar_tolower(b_chr))
+ break;
+ len = g_utf8_next_char(b+len) - b;
+ }
return len;
}
diff --git a/src/string-utils.h b/src/string-utils.h
index bb9ed37..1b4957f 100644
--- a/src/string-utils.h
+++ b/src/string-utils.h
@@ -26,11 +26,11 @@
/**
* Upper-case SciTECO command character.
*
- * There are implementations in glib (g_ascii_toupper) and libc,
+ * There are implementations in glib (g_ascii_toupper() and g_unichar_toupper()) and libc,
* but this implementation is sufficient for all letters used by SciTECO commands.
*/
-static inline gchar
-teco_ascii_toupper(gchar chr)
+static inline gunichar
+teco_ascii_toupper(gunichar chr)
{
return chr >= 'a' && chr <= 'z' ? chr & ~0x20 : chr;
}
@@ -52,6 +52,7 @@ teco_strv_remove(gchar **strv, guint i)
* and the allocation length is not stored.
* Just like GString, teco_string_t are always null-terminated but at the
* same time 8-bit clean (can contain null-characters).
+ * It may or may not contain UTF-8 byte sequences.
*
* The API is designed such that teco_string_t operations operate on plain
* (null-terminated) C strings, a single character or character array as well as
@@ -74,7 +75,7 @@ typedef struct {
* The pointer is guaranteed to be non-NULL after initialization.
*/
gchar *data;
- /** Length of `data` without the trailing null-byte. */
+ /** Length of `data` in bytes, not counting the trailing null-byte. */
gsize len;
} teco_string_t;
@@ -128,6 +129,16 @@ teco_string_append_c(teco_string_t *str, gchar chr)
teco_string_append(str, &chr, sizeof(chr));
}
+/** @memberof teco_string_t */
+static inline void
+teco_string_append_wc(teco_string_t *target, gunichar chr)
+{
+ /* 4 bytes should be enough, but we had better follow the g_unichar_to_utf8() documentation, which asks for 6 */
+ target->data = g_realloc(target->data, target->len + 6 + 1);
+ target->len += g_unichar_to_utf8(chr, target->data+target->len);
+ target->data[target->len] = '\0';
+}
+
/**
* @fixme Should this also realloc str->data?
*
diff --git a/src/symbols.c b/src/symbols.c
index ba407cc..feead76 100644
--- a/src/symbols.c
+++ b/src/symbols.c
@@ -251,7 +251,7 @@ teco_state_scintilla_symbols_done(teco_machine_main_t *ctx, const teco_string_t
}
/* in cmdline.c */
-gboolean teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gchar key, GError **error);
+gboolean teco_state_scintilla_symbols_process_edit_cmd(teco_machine_main_t *ctx, teco_machine_t *parent_ctx, gunichar key, GError **error);
/*$ ES scintilla message
* -- Send Scintilla message
diff --git a/src/undo.c b/src/undo.c
index dfae63b..dc54c7a 100644
--- a/src/undo.c
+++ b/src/undo.c
@@ -30,7 +30,7 @@
//#define DEBUG
-TECO_DEFINE_UNDO_SCALAR(gchar);
+TECO_DEFINE_UNDO_SCALAR(gunichar);
TECO_DEFINE_UNDO_SCALAR(gint);
TECO_DEFINE_UNDO_SCALAR(guint);
TECO_DEFINE_UNDO_SCALAR(gsize);
diff --git a/src/undo.h b/src/undo.h
index ea1414f..9715c7a 100644
--- a/src/undo.h
+++ b/src/undo.h
@@ -164,8 +164,8 @@ gpointer teco_undo_push_size(teco_undo_action_t action_cb, gsize size)
* significantly improves batch-mode performance.
*/
-TECO_DECLARE_UNDO_SCALAR(gchar);
-#define teco_undo_gchar(VAR) (*teco_undo_object_gchar_push(&(VAR)))
+TECO_DECLARE_UNDO_SCALAR(gunichar);
+#define teco_undo_gunichar(VAR) (*teco_undo_object_gunichar_push(&(VAR)))
TECO_DECLARE_UNDO_SCALAR(gint);
#define teco_undo_gint(VAR) (*teco_undo_object_gint_push(&(VAR)))
diff --git a/tests/testsuite.at b/tests/testsuite.at
index 4749b13..0733d2a 100644
--- a/tests/testsuite.at
+++ b/tests/testsuite.at
@@ -84,8 +84,6 @@ AT_CHECK([$SCITECO -e "0@I//J 0A\"N(0/0)' :@S/^@/\"F(0/0)'"], 0, ignore, ignore)
AT_CHECK([$SCITECO -e "@EQa//0EE 1U*0EE 0:@EUa/f^@^@/ :Qa-4\"N(0/0)' Ga Z-4\"N(0/0)'"], 0, ignore, ignore)
AT_CHECK([$SCITECO -e "0EE 129@I// -A-129\"N(0/0)' HXa @EQa// EE\"N(0/0)'"], 0, ignore, ignore)
AT_CHECK([$SCITECO -8e "129@:^Ua// 0Qa-129\"N(0/0)'"], 0, ignore, ignore)
-# FIXME: This will fail once we have an UTF-8-only parser.
-AT_CHECK([$SCITECO -8e "@:^Ua/^^/ 129:@^Ua// Ma-129\"N(0/0)'"], 0, ignore, ignore)
AT_CHECK([$SCITECO -e "1EE 167Ua @I/^EUa/ .-1\"N(0/0)'"], 0, ignore, ignore)
AT_CLEANUP
@@ -95,6 +93,8 @@ AT_CHECK([$SCITECO -e "8594@^Ua/Здравствуй, мир!/ :Qa-17\"N(0/0)' 0
AT_CHECK([$SCITECO -e "@I/Здравствуй, мир!/ JW .-10\"N(0/0)' ^E-20\"N(0/0)' 204:EE .-10\"N(0/0)'"], 0, ignore, ignore)
AT_CHECK([$SCITECO -e "@I/TEST/ @EW/юникод.txt/"], 0, ignore, ignore)
AT_CHECK([test -f юникод.txt], 0, ignore, ignore)
+AT_CHECK([$SCITECO -e "^^ß-223\"N(0/0) 23Uъ Q[Ъ]-23\"N(0/0)'"], 0, ignore, ignore)
+AT_CHECK([$SCITECO -e "@O/метка/ !метка!"], 0, ignore, ignore)
AT_CLEANUP
AT_SETUP([Automatic EOL normalization])
@@ -207,8 +207,7 @@ AT_CLEANUP
AT_SETUP([Unicode glitches])
# While TECO code must always be UTF-8, strings after string building
# can be in single-byte encodings as well.
-# This might already work after introducing the Unicode-aware parser.
-# If not, it should be fixed.
+# It must be possible to search for single bytes in single-byte encodings.
AT_CHECK([$SCITECO -8e "164Ua Ga@I//J :@S/^EUa/\"F(0/0)'"], 0, ignore, ignore)
AT_XFAIL_IF(true)
AT_CLEANUP