input and displaying of Unicode characters is now possible (refs #5)

* All non-ASCII characters are inserted as Unicode.
  On Curses, this also requires a properly set up locale.
* We still do not need any widechar Curses, as waddch() handles multibyte characters on ncurses.
  We will see whether there is any Curses variant that strictly requires wadd_wch().
  If this turns out to be an exception, we might keep both widechar and non-widechar support.
* By convention, gsize is used exclusively for byte sizes.
  Character offsets or lengths use int or long.
author: Robin Haberkorn <robin.haberkorn@googlemail.com> 2024-08-28 12:59:05 +0200
committer: Robin Haberkorn <robin.haberkorn@googlemail.com> 2024-09-09 18:16:07 +0200
commit: 4c6b6814abfc9c022c6ea8d1e23097c2a774fde5 (patch)
tree: 26ea9ad6d2777c080c1733b55fc7d30180c335f5
parent: fdc185b8faaae44d67f85d2c5a9b9fa48d3e2859 (diff)
download: sciteco-4c6b6814abfc9c022c6ea8d1e23097c2a774fde5.tar.gz
6 files changed, 73 insertions, 27 deletions
diff --git a/src/interface-curses/curses-utils.c b/src/interface-curses/curses-utils.c
index e7c8659..c751afd 100644
--- a/src/interface-curses/curses-utils.c
+++ b/src/interface-curses/curses-utils.c
@@ -29,7 +29,21 @@
 #include "string-utils.h"
 #include "curses-utils.h"
 
-gsize
+/**
+ * Render UTF-8 string with TECO character representations.
+ *
+ * Strings are cut off with `...` at the end if necessary.
+ * The mapping is similar to teco_view_set_representations().
+ *
+ * @param win The Curses window to write to.
+ * @param str The string to format.
+ * @param len The length of the string in bytes.
+ * @param max_width The maximum width to consume in
+ *   the window in characters. If smaller than 0, take the
+ *   entire remaining space in the window.
+ * @return Number of characters actually written.
+ */
+guint
 teco_curses_format_str(WINDOW *win, const gchar *str, gsize len, gint max_width)
 {
 	int old_x, old_y;
@@ -42,6 +56,12 @@ teco_curses_format_str(WINDOW *win, const gchar *str, gsize len, gint max_width)
 
 	while (len > 0) {
 		/*
+		 * NOTE: It shouldn't be possible to meet any string,
+		 * that is not valid UTF-8.
+		 */
+		gsize clen = g_utf8_next_char(str) - str;
+
+		/*
 		 * NOTE: This mapping is similar to
 		 * teco_view_set_representations().
 		 */
@@ -85,12 +105,18 @@ teco_curses_format_str(WINDOW *win, const gchar *str, gsize len, gint max_width)
 				chars_added++;
 				if (chars_added > max_width)
 					goto truncate;
-				waddch(win, *str);
+				/*
+				 * FIXME: This works with UTF-8 on ncurses,
+				 * since it detects multi-byte characters.
+				 * However on other platforms wadd_wch() may be
+				 * necessary, which requires a widechar Curses variant.
+				 */
+				waddnstr(win, str, clen);
 			}
 		}
 
-		str++;
-		len--;
+		str += clen;
+		len -= clen;
 	}
 
 	return getcurx(win) - old_x;
@@ -108,23 +134,43 @@ truncate:
 	return getcurx(win) - old_x;
 }
 
-gsize
-teco_curses_format_filename(WINDOW *win, const gchar *filename,
-                            gint max_width)
+/**
+ * Render UTF-8 filename.
+ *
+ * This cuts off overlong filenames with `...` at the beginning,
+ * possibly skipping any drive letter.
+ * Control characters are escaped, but not highlighted.
+ *
+ * @param win The Curses window to write to.
+ * @param filename Null-terminated filename to render.
+ * @param max_width The maximum width to consume in
+ *   the window in characters. If smaller than 0, take the
+ *   entire remaining space in the window.
+ * @return Number of characters actually written.
+ */
+guint
+teco_curses_format_filename(WINDOW *win, const gchar *filename, gint max_width)
 {
 	int old_x = getcurx(win);
 
 	g_autofree gchar *filename_printable = teco_string_echo(filename, strlen(filename));
-	size_t filename_len = strlen(filename_printable);
+	glong filename_len = g_utf8_strlen(filename_printable, -1);
 
 	if (max_width < 0)
 		max_width = getmaxx(win) - old_x;
 
-	if (filename_len <= (size_t)max_width) {
+	if (filename_len <= max_width) {
+		/*
+		 * FIXME: This works with UTF-8 on ncurses,
+		 * since it detects multi-byte characters.
+		 * However on other platforms wadd_wch() may be
+		 * necessary, which requires a widechar Curses variant.
+		 */
 		waddstr(win, filename_printable);
-	} else {
-		const gchar *keep_post = filename_printable + filename_len -
-		                         max_width + 3;
+	} else if (filename_len >= 3) {
+		const gchar *keep_post;
+		keep_post = g_utf8_offset_to_pointer(filename_printable + strlen(filename_printable),
+		                                     -max_width + 3);
 
 #ifdef G_OS_WIN32
 		const gchar *keep_pre = g_path_skip_root(filename_printable);
diff --git a/src/interface-curses/curses-utils.h b/src/interface-curses/curses-utils.h
index 35d9582..28c6f9b 100644
--- a/src/interface-curses/curses-utils.h
+++ b/src/interface-curses/curses-utils.h
@@ -20,6 +20,6 @@
 
 #include <curses.h>
 
-gsize teco_curses_format_str(WINDOW *win, const gchar *str, gsize len, gint max_width);
+guint teco_curses_format_str(WINDOW *win, const gchar *str, gsize len, gint max_width);
 
-gsize teco_curses_format_filename(WINDOW *win, const gchar *filename, gint max_width);
+guint teco_curses_format_filename(WINDOW *win, const gchar *filename, gint max_width);
diff --git a/src/interface-curses/interface.c b/src/interface-curses/interface.c
index 8de744f..89be588 100644
--- a/src/interface-curses/interface.c
+++ b/src/interface-curses/interface.c
@@ -344,7 +344,7 @@ static struct {
 	WINDOW *msg_window;
 
 	WINDOW *cmdline_window, *cmdline_pad;
-	gsize cmdline_len, cmdline_rubout_len;
+	guint cmdline_len, cmdline_rubout_len;
 
 	GQueue *input_queue;
 
@@ -680,7 +680,7 @@ teco_interface_init_interactive(GError **error)
 #endif
 
 	/* for displaying UTF-8 characters properly */
-	setlocale(LC_CTYPE, "");
+	setlocale(LC_ALL, "");
 
 	teco_interface_init_screen();
 
@@ -1044,7 +1044,8 @@ teco_interface_cmdline_update(const teco_cmdline_t *cmdline)
 	 * We don't know if it is similar to the last one,
 	 * so resizing makes no sense.
 	 * We approximate the size of the new formatted command-line,
-	 * wasting a few bytes for control characters.
+	 * wasting a few bytes for control characters and
+	 * multi-byte Unicode sequences.
 	 */
 	if (teco_interface.cmdline_pad)
 		delwin(teco_interface.cmdline_pad);
@@ -1626,6 +1627,7 @@ teco_interface_event_loop_iter(void)
 
 	/*
 	 * Function key macros
+	 * FIXME: What about keyname()?
 	 */
 #define FN(KEY) \
 	case KEY_##KEY: \
@@ -1660,7 +1662,7 @@ teco_interface_event_loop_iter(void)
 	 * Control keys and keys with printable representation
 	 */
 	default:
-		if (key < 0x80 &&
+		if (key <= 0xFF &&
 		    !teco_cmdline_keypress_c(key, &teco_interface.event_loop_error))
 			return;
 	}
diff --git a/src/interface-gtk/interface.c b/src/interface-gtk/interface.c
index bddd51f..ed027a2 100644
--- a/src/interface-gtk/interface.c
+++ b/src/interface-gtk/interface.c
@@ -876,14 +876,6 @@ teco_interface_cmdline_commit_cb(GtkIMContext *context, gchar *str, gpointer use
 {
 	g_autoptr(GError) error = NULL;
 
-	/*
-	 * FIXME: This is only for consistency as long as we
-	 * do not support Unicode.
-	 */
-	for (char *p = str; *p != '\0'; p = g_utf8_next_char(p))
-		if (g_utf8_get_char(p) >= 0x80)
-			return;
-
 	if (!teco_cmdline_keypress(str, strlen(str), &error) &&
 	    g_error_matches(error, TECO_ERROR, TECO_ERROR_QUIT))
 		gtk_main_quit();
diff --git a/src/sciteco.h b/src/sciteco.h
index 55ffd6c..7f420e8 100644
--- a/src/sciteco.h
+++ b/src/sciteco.h
@@ -69,7 +69,7 @@ teco_is_failure(teco_bool_t x)
 #endif
 
 /** TRUE if C is a control character */
-#define TECO_IS_CTL(C)		((C) < ' ')
+#define TECO_IS_CTL(C)		((guchar)(C) < ' ')
 /** ASCII character to echo control character C */
 #define TECO_CTL_ECHO(C)	((C) | 0x40)
 /**
diff --git a/src/string-utils.h b/src/string-utils.h
index 973b954..efc6fc5 100644
--- a/src/string-utils.h
+++ b/src/string-utils.h
@@ -51,6 +51,12 @@ teco_ascii_toupper(gchar chr)
  * A target teco_string_t::data is always null-terminated and thus safe to pass
  * to functions expecting traditional null-terminated C strings if you can
  * guarantee that it contains no null-character other than the trailing one.
+ *
+ * @warning For consistency with C idioms the underlying character type is
+ * `char`, which might be signed!
+ * Accessing individual characters may yield signed integers and that sign
+ * might be preserved when upcasting to a larger signed integer.
+ * In this case you should always cast to `guchar` first.
  */
 typedef struct {
 	/**
author	Robin Haberkorn <robin.haberkorn@googlemail.com>	2024-08-28 12:59:05 +0200
committer	Robin Haberkorn <robin.haberkorn@googlemail.com>	2024-09-09 18:16:07 +0200
commit	4c6b6814abfc9c022c6ea8d1e23097c2a774fde5 (patch)
tree	26ea9ad6d2777c080c1733b55fc7d30180c335f5
parent	fdc185b8faaae44d67f85d2c5a9b9fa48d3e2859 (diff)
download	sciteco-4c6b6814abfc9c022c6ea8d1e23097c2a774fde5.tar.gz