1 files changed, 196 insertions, 9 deletions
diff --git a/src/view.c b/src/view.c
index 2e6df3f..7cdc987 100644
--- a/src/view.c
+++ b/src/view.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2012-2023 Robin Haberkorn
+ * Copyright (C) 2012-2024 Robin Haberkorn
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -45,6 +45,7 @@
 #include "error.h"
 #include "qreg.h"
 #include "eol.h"
+#include "memory.h"
 #include "view.h"
 
 /** @memberof teco_view_t */
@@ -72,6 +73,27 @@ teco_view_setup(teco_view_t *ctx)
 	 */
 	teco_view_ssm(ctx, SCI_SETMARGINWIDTHN, 1, 0);
 
+	if (teco_ed & TECO_ED_DEFAULT_ANSI) {
+		/*
+		 * Configure a single-byte codepage/charset.
+		 * This requires setting it on all of the possible styles.
+		 * Fortunately, we can do it before SCI_STYLECLEARALL.
+		 * This is important only for display purposes - other than that
+		 * all single-byte encodings are handled the same.
+		 */
+		teco_view_ssm(ctx, SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, SC_CHARSET_ANSI);
+		/* 0 is used for ALL single-byte encodings */
+		teco_view_ssm(ctx, SCI_SETCODEPAGE, 0, 0);
+	} else {
+		/*
+		 * Documents are UTF-8 by default and all UTF-8 documents
+		 * are expected to have a character index.
+		 * This is a property of the document, instead of the view.
+		 */
+		teco_view_ssm(ctx, SCI_ALLOCATELINECHARACTERINDEX,
+		              SC_LINECHARACTERINDEX_UTF32, 0);
+	}
+
 	/*
 	 * Set some basic styles in order to provide
 	 * a consistent look across UIs if no profile
@@ -137,6 +159,28 @@ teco_view_set_representations(teco_view_t *ctx)
 		gchar buf[] = {(gchar)cc, '\0'};
 		teco_view_ssm(ctx, SCI_SETREPRESENTATION, (uptr_t)buf, (sptr_t)reps[cc]);
 	}
+
+	if (teco_ed & TECO_ED_DEFAULT_ANSI) {
+		/*
+		 * Non-ANSI chars should be visible somehow.
+		 * This would best be done always when changing the
+		 * encoding to 0, but it would be kind of expensive.
+		 *
+		 * FIXME: On the other hand, this could cause problems
+		 * when setting SC_CP_UTF8 later on.
+		 */
+		for (guint cc = 0x80; cc <= 0xFF; cc++) {
+			gchar buf[] = {(gchar)cc, '\0'};
+			gchar rep[2+1];
+			/*
+			 * Hexadecimal is poorly supported in SciTECO, but
+			 * multiple decimal numbers one after another look
+			 * confusing, esp. in Curses.
+			 */
+			g_snprintf(rep, sizeof(rep), "%02X", cc);
+			teco_view_ssm(ctx, SCI_SETREPRESENTATION, (uptr_t)buf, (sptr_t)rep);
+		}
+	}
 }
 
 /**
@@ -161,6 +205,9 @@ teco_view_set_representations(teco_view_t *ctx)
 gboolean
 teco_view_load_from_channel(teco_view_t *ctx, GIOChannel *channel, GError **error)
 {
+	g_auto(teco_eol_reader_t) reader;
+	teco_eol_reader_init_gio(&reader, channel);
+
 	teco_view_ssm(ctx, SCI_BEGINUNDOACTION, 0, 0);
 	teco_view_ssm(ctx, SCI_CLEARALL, 0, 0);
 
@@ -173,11 +220,11 @@ teco_view_load_from_channel(teco_view_t *ctx, GIOChannel *channel, GError **erro
 	 */
 	struct stat stat_buf = {.st_size = 0};
 	if (!fstat(g_io_channel_unix_get_fd(channel), &stat_buf) &&
-	    stat_buf.st_size > 0)
+	    stat_buf.st_size > 0) {
+		if (!teco_memory_check(stat_buf.st_size, error))
+			goto error;
 		teco_view_ssm(ctx, SCI_ALLOCATE, stat_buf.st_size, 0);
-
-	g_auto(teco_eol_reader_t) reader;
-	teco_eol_reader_init_gio(&reader, channel);
+	}
 
 	for (;;) {
 		/*
@@ -187,14 +234,24 @@ teco_view_load_from_channel(teco_view_t *ctx, GIOChannel *channel, GError **erro
 		teco_string_t str;
 
 		GIOStatus rc = teco_eol_reader_convert(&reader, &str.data, &str.len, error);
-		if (rc == G_IO_STATUS_ERROR) {
-			teco_view_ssm(ctx, SCI_ENDUNDOACTION, 0, 0);
-			return FALSE;
-		}
+		if (rc == G_IO_STATUS_ERROR)
+			goto error;
 		if (rc == G_IO_STATUS_EOF)
 			break;
 
 		teco_view_ssm(ctx, SCI_APPENDTEXT, str.len, (sptr_t)str.data);
+
+		/*
+		 * Even if we checked initially, knowing the file size,
+		 * Scintilla could allocate much more bytes.
+		 */
+		if (!teco_memory_check(0, error))
+			goto error;
+
+		if (G_UNLIKELY(teco_interface_is_interrupted())) {
+			teco_error_interrupted_set(error);
+			goto error;
+		}
 	}
 
 	/*
@@ -216,6 +273,10 @@ teco_view_load_from_channel(teco_view_t *ctx, GIOChannel *channel, GError **erro
 
 	teco_view_ssm(ctx, SCI_ENDUNDOACTION, 0, 0);
 	return TRUE;
+
+error:
+	teco_view_ssm(ctx, SCI_ENDUNDOACTION, 0, 0);
+	return FALSE;
 }
 
 /**
@@ -449,3 +510,129 @@ teco_view_save_to_file(teco_view_t *ctx, const gchar *filename, GError **error)
 
 	return TRUE;
 }
+
+/**
+ * Map a glyph (character) index onto the byte offset used by Scintilla.
+ *
+ * If the view has a UTF-32 line character index, it is used to find
+ * the line containing the glyph, so only the rest has to be counted.
+ * Without such an index, the document is taken to be single-byte encoded.
+ * Since out-of-range indexes yield -1, this doubles as a validator.
+ *
+ * @param ctx The view to operate on.
+ * @param pos Position in glyphs/characters.
+ * @return Position in bytes or -1 if pos is out of bounds.
+ */
+gssize
+teco_view_glyphs2bytes(teco_view_t *ctx, teco_int_t pos)
+{
+	if (pos <= 0)
+		return pos < 0 ? -1 : 0;
+
+	sptr_t index_type = teco_view_ssm(ctx, SCI_GETLINECHARACTERINDEX, 0, 0);
+	if (!(index_type & SC_LINECHARACTERINDEX_UTF32))
+		/* no character index: every glyph is assumed to be one byte */
+		return pos > teco_view_ssm(ctx, SCI_GETLENGTH, 0, 0) ? -1 : pos;
+
+	sptr_t line_no = teco_view_ssm(ctx, SCI_LINEFROMINDEXPOSITION, pos,
+	                               SC_LINECHARACTERINDEX_UTF32);
+	sptr_t line_start = teco_view_ssm(ctx, SCI_POSITIONFROMLINE, line_no, 0);
+	pos -= teco_view_ssm(ctx, SCI_INDEXPOSITIONFROMLINE, line_no,
+	                     SC_LINECHARACTERINDEX_UTF32);
+	sptr_t ret = teco_view_ssm(ctx, SCI_POSITIONRELATIVE, line_start, pos);
+	/* pos was > 0 on entry, so a result of 0 can only mean "out of bounds" */
+	return ret ? ret : -1;
+}
+
+/**
+ * Translate a byte offset into a glyph/character index (no bounds checks).
+ */
+teco_int_t
+teco_view_bytes2glyphs(teco_view_t *ctx, gsize pos)
+{
+	if (pos == 0)
+		return 0;
+
+	sptr_t index_type = teco_view_ssm(ctx, SCI_GETLINECHARACTERINDEX, 0, 0);
+	if (!(index_type & SC_LINECHARACTERINDEX_UTF32))
+		return pos; /* assume single-byte encoding: bytes are glyphs */
+
+	/* glyphs before the line (from the index) plus those counted within it */
+	sptr_t line_no = teco_view_ssm(ctx, SCI_LINEFROMPOSITION, pos, 0);
+	sptr_t line_start = teco_view_ssm(ctx, SCI_POSITIONFROMLINE, line_no, 0);
+	sptr_t glyphs_before = teco_view_ssm(ctx, SCI_INDEXPOSITIONFROMLINE, line_no,
+	                                     SC_LINECHARACTERINDEX_UTF32);
+	return glyphs_before + teco_view_ssm(ctx, SCI_COUNTCHARACTERS, line_start, pos);
+}
+
+#define TECO_RELATIVE_LIMIT 1024
+
+/**
+ * Convert a glyph index relative to a byte position to a byte position.
+ *
+ * Can be used to implement commands with relative character ranges.
+ * Deltas up to TECO_RELATIVE_LIMIT are always resolved by counting characters,
+ * which is fast even where the character index is too slow (very long lines).
+ *
+ * @param ctx The view to operate on.
+ * @param pos Byte position to start.
+ * @param n Glyphs/characters to the left (negative) or right (positive) of pos.
+ * @return Position in bytes or -1 if the resulting position is out of bounds.
+ */
+gssize
+teco_view_glyphs2bytes_relative(teco_view_t *ctx, gsize pos, teco_int_t n)
+{
+	if (!n)
+		return pos;
+	/* not ABS(n): negating the smallest teco_int_t would overflow */
+	if (n < -TECO_RELATIVE_LIMIT || n > TECO_RELATIVE_LIMIT) {
+		/* surely out of bounds; also keeps the sum below from overflowing */
+		if (n > teco_view_ssm(ctx, SCI_GETLENGTH, 0, 0))
+			return -1;
+		return teco_view_glyphs2bytes(ctx, teco_view_bytes2glyphs(ctx, pos) + n);
+	}
+
+	sptr_t res = teco_view_ssm(ctx, SCI_POSITIONRELATIVE, pos, n);
+	/* SCI_POSITIONRELATIVE may return 0 even if the offset is valid */
+	return res ? : n > 0 ? -1 : teco_view_bytes2glyphs(ctx, pos)+n >= 0 ? 0 : -1;
+}
+
+/**
+ * Fetch the codepoint stored at a given byte offset.
+ *
+ * @param ctx The view to operate on.
+ * @param pos The glyph's byte position
+ * @param len The length of the document in bytes
+ * @return The requested codepoint.
+ *   In UTF-8 encoded documents, this might be -1 (incomplete sequence)
+ *   or -2 (invalid byte sequence).
+ */
+teco_int_t
+teco_view_get_character(teco_view_t *ctx, gsize pos, gsize len)
+{
+	if (teco_view_ssm(ctx, SCI_GETCODEPAGE, 0, 0) != SC_CP_UTF8) {
+		/*
+		 * The asiatic multi-byte encodings are not supported,
+		 * so everything but UTF-8 is a single-byte codepage.
+		 * NOTE: Internally, the character is cast to signed char
+		 * and may therefore become negative, hence the guchar cast.
+		 */
+		return (guchar)teco_view_ssm(ctx, SCI_GETCHARAT, pos, 0);
+	}
+
+	/*
+	 * Request up to 4 bytes, clipped to the end of the document, at once.
+	 * Probably faster than SCI_GETRANGEPOINTER+SCI_GETGAPPOSITION
+	 * or repeatedly calling SCI_GETCHARAT.
+	 */
+	gchar utf8[4+1];
+	gsize end = MIN(len, pos+sizeof(utf8)-1);
+	struct Sci_TextRangeFull range = {.chrg = {pos, end}, .lpstrText = utf8};
+	teco_view_ssm(ctx, SCI_GETTEXTRANGEFULL, 0, (sptr_t)&range);
+
+	/*
+	 * The gint32 cast makes sure the -1/-2 error values are preserved.
+	 * The sign bit in UCS-4/UTF-32 is unused, so this will even
+	 * suffice if TECO_INTEGER == 32.
+	 */
+	return (gint32)g_utf8_get_char_validated(utf8, -1);
+}