sped up opening very large UTF-8 files by temporarily disabling the line-character index

* Checks for character consistency (of UTF-8 byte sequences) were slowing things down significantly in Scintilla.
* It got even worse if the file actually contained non-ANSI codepoints, as reading in chunks of 1024 bytes would sometimes mean that incomplete byte sequences were read. Some large 160 MB test files wouldn't load even after minutes. They now load in seconds.
* This does NOT yet solve the slowdowns when operating on very long lines.
author: Robin Haberkorn <robin.haberkorn@googlemail.com> 2024-12-17 12:48:41 +0300
committer: Robin Haberkorn <robin.haberkorn@googlemail.com> 2024-12-17 12:48:41 +0300
commit: 715d7e151eea9126ead42834cf9f9a05a7226597 (patch)
tree: 1c68f8ec53f972ba9bfc33e141255df4912f3e62 /src/view.c
parent: 9cd8d404d246774f53b6f5d318ccbd4bd82a922d (diff)
download: sciteco-715d7e151eea9126ead42834cf9f9a05a7226597.tar.gz
1 files changed, 33 insertions, 11 deletions
diff --git a/src/view.c b/src/view.c
index 836ffdc..1f09ce6 100644
--- a/src/view.c
+++ b/src/view.c
@@ -206,9 +206,24 @@ teco_view_set_representations(teco_view_t *ctx)
 gboolean
 teco_view_load_from_channel(teco_view_t *ctx, GIOChannel *channel, GError **error)
 {
+	gboolean ret = TRUE;
+
 	g_auto(teco_eol_reader_t) reader;
 	teco_eol_reader_init_gio(&reader, channel);
 
+	/*
+	 * Temporarily disable the line character index.
+	 * This tremendously speeds up reading UTF-8 documents.
+	 * The reason is, that UTF-8 consistency checks are rather
+	 * costly. Also, when reading in chunks of 1024 bytes,
+	 * we can very well add incomplete UTF-8 sequences,
+	 * resulting in unnecessary recalculations of the line index.
+	 */
+	guint cp = teco_view_get_codepage(ctx);
+	if (cp == SC_CP_UTF8)
+		teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+		                   SC_LINECHARACTERINDEX_UTF32, 0);
+
 	teco_view_ssm(ctx, SCI_BEGINUNDOACTION, 0, 0);
 	teco_view_ssm(ctx, SCI_CLEARALL, 0, 0);
 
@@ -222,8 +237,9 @@ teco_view_load_from_channel(teco_view_t *ctx, GIOChannel *channel, GError **erro
 	struct stat stat_buf = {.st_size = 0};
 	if (!fstat(g_io_channel_unix_get_fd(channel), &stat_buf) &&
 	    stat_buf.st_size > 0) {
-		if (!teco_memory_check(stat_buf.st_size, error))
-			goto error;
+		ret = teco_memory_check(stat_buf.st_size, error);
+		if (!ret)
+			goto cleanup;
 		teco_view_ssm(ctx, SCI_ALLOCATE, stat_buf.st_size, 0);
 	}
 
@@ -235,8 +251,10 @@ teco_view_load_from_channel(teco_view_t *ctx, GIOChannel *channel, GError **erro
 		teco_string_t str;
 
 		GIOStatus rc = teco_eol_reader_convert(&reader, &str.data, &str.len, error);
-		if (rc == G_IO_STATUS_ERROR)
-			goto error;
+		if (rc == G_IO_STATUS_ERROR) {
+			ret = FALSE;
+			goto cleanup;
+		}
 		if (rc == G_IO_STATUS_EOF)
 			break;
 
@@ -246,12 +264,14 @@ teco_view_load_from_channel(teco_view_t *ctx, GIOChannel *channel, GError **erro
 		 * Even if we checked initially, knowing the file size,
 		 * Scintilla could allocate much more bytes.
 		 */
-		if (!teco_memory_check(0, error))
-			goto error;
+		ret = teco_memory_check(0, error);
+		if (!ret)
+			goto cleanup;
 
 		if (G_UNLIKELY(teco_interface_is_interrupted())) {
 			teco_error_interrupted_set(error);
-			goto error;
+			ret = FALSE;
+			goto cleanup;
 		}
 	}
 
@@ -272,12 +292,14 @@ teco_view_load_from_channel(teco_view_t *ctx, GIOChannel *channel, GError **erro
 		teco_interface_msg(TECO_MSG_WARNING,
 		                   "Inconsistent EOL styles normalized");
 
+cleanup:
 	teco_view_ssm(ctx, SCI_ENDUNDOACTION, 0, 0);
-	return TRUE;
 
-error:
-	teco_view_ssm(ctx, SCI_ENDUNDOACTION, 0, 0);
-	return FALSE;
+	if (cp == SC_CP_UTF8)
+		teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
+		                   SC_LINECHARACTERINDEX_UTF32, 0);
+
+	return ret;
 }
 
 /**
author	Robin Haberkorn <robin.haberkorn@googlemail.com>	2024-12-17 12:48:41 +0300
committer	Robin Haberkorn <robin.haberkorn@googlemail.com>	2024-12-17 12:48:41 +0300
commit	715d7e151eea9126ead42834cf9f9a05a7226597 (patch)
tree	1c68f8ec53f972ba9bfc33e141255df4912f3e62 /src/view.c
parent	9cd8d404d246774f53b6f5d318ccbd4bd82a922d (diff)
download	sciteco-715d7e151eea9126ead42834cf9f9a05a7226597.tar.gz