aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRobin Haberkorn <robin.haberkorn@googlemail.com>2024-08-29 01:56:50 +0200
committerRobin Haberkorn <robin.haberkorn@googlemail.com>2024-09-09 18:22:21 +0200
commit7c592561af3bbbad2eaf865247811ba2bd590c2e (patch)
tree75b27cf40fb9ba8d646eb00e05be5f91d116b493
parentc71ed30cf0c554d288edfe87842082cc9ec393a7 (diff)
downloadsciteco-7c592561af3bbbad2eaf865247811ba2bd590c2e.tar.gz
Glyph to byte offset mapping is now using the line character index (refs #5)
* This works reasonably well unless lines are exceedingly long (as on a line we always count characters). The following test case is still slow (on Unicode buffers): 10000<@I/XX/> <%a-1:J;> While the following is now also fast: 10000<@I/X^J/> <%a-1:J;> * Commands with relative character offsets (C, R, A, D) have a special optimization where they always count characters beginning at dot, as long as the argument is not exceedingly large. This means they are fast even on exceedingly long lines. * The remaining commands (search, EC/EG, Xq) now accept glyph indexes.
-rw-r--r--src/core-commands.c53
-rw-r--r--src/interface.c84
-rw-r--r--src/interface.h9
-rw-r--r--src/qreg-commands.c9
-rw-r--r--src/search.c19
-rw-r--r--src/spawn.c18
-rw-r--r--src/view.c6
7 files changed, 130 insertions, 68 deletions
diff --git a/src/core-commands.c b/src/core-commands.c
index 1c315fe..7e6dbfa 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -130,7 +130,7 @@ teco_state_start_dot(teco_machine_main_t *ctx, GError **error)
if (!teco_expressions_eval(FALSE, error))
return;
sptr_t pos = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
- teco_expressions_push(teco_interface_ssm(SCI_COUNTCHARACTERS, 0, pos));
+ teco_expressions_push(teco_bytes2glyphs(pos));
}
/*$ Z size
@@ -147,7 +147,7 @@ teco_state_start_zed(teco_machine_main_t *ctx, GError **error)
if (!teco_expressions_eval(FALSE, error))
return;
sptr_t pos = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
- teco_expressions_push(teco_interface_ssm(SCI_COUNTCHARACTERS, 0, pos));
+ teco_expressions_push(teco_bytes2glyphs(pos));
}
/*$ H
@@ -165,7 +165,7 @@ teco_state_start_range(teco_machine_main_t *ctx, GError **error)
return;
teco_expressions_push(0);
sptr_t pos = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
- teco_expressions_push(teco_interface_ssm(SCI_COUNTCHARACTERS, 0, pos));
+ teco_expressions_push(teco_bytes2glyphs(pos));
}
/*$ \[rs]
@@ -514,9 +514,8 @@ teco_state_start_jump(teco_machine_main_t *ctx, GError **error)
if (!teco_expressions_pop_num_calc(&v, 0, error))
return;
- sptr_t pos = teco_interface_ssm(SCI_POSITIONRELATIVE, 0, v);
- /* see teco_validate_pos(): this is saving SCI_POSITIONRELATIVE calls */
- if (!v || (v > 0 && pos > 0)) {
+ gssize pos = teco_glyphs2bytes(v);
+ if (pos >= 0) {
if (teco_current_doc_must_undo())
undo__teco_interface_ssm(SCI_GOTOPOS,
teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0), 0);
@@ -536,16 +535,9 @@ static teco_bool_t
teco_move_chars(teco_int_t n)
{
sptr_t pos = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
- sptr_t next_pos = teco_interface_ssm(SCI_POSITIONRELATIVE, pos, n);
-
- if (n <= 0) {
- /* SCI_POSITIONRELATIVE may return 0 even if the offset is valid */
- sptr_t dot = teco_interface_ssm(SCI_COUNTCHARACTERS, 0, pos);
- if (dot+n < 0)
- return TECO_FAILURE;
- } else if (!next_pos) {
+ gssize next_pos = teco_glyphs2bytes_relative(pos, n);
+ if (next_pos < 0)
return TECO_FAILURE;
- }
teco_interface_ssm(SCI_GOTOPOS, next_pos, 0);
if (teco_current_doc_must_undo())
@@ -891,7 +883,7 @@ static gboolean
teco_state_start_kill(teco_machine_main_t *ctx, const gchar *cmd, gboolean by_lines, GError **error)
{
teco_bool_t rc;
- sptr_t from, len; /* in bytes */
+ gssize from, len; /* in bytes */
if (!teco_expressions_eval(FALSE, error))
return FALSE;
@@ -909,15 +901,9 @@ teco_state_start_kill(teco_machine_main_t *ctx, const gchar *cmd, gboolean by_li
teco_int_t len_glyphs;
if (!teco_expressions_pop_num_calc(&len_glyphs, teco_num_sign, error))
return FALSE;
- sptr_t to = teco_interface_ssm(SCI_POSITIONRELATIVE, from, len_glyphs);
+ gssize to = teco_glyphs2bytes_relative(from, len_glyphs);
+ rc = teco_bool(to >= 0);
len = to-from;
- if (len_glyphs <= 0) {
- /* SCI_POSITIONRELATIVE may return 0 even if the offset is valid */
- sptr_t from_glyphs = teco_interface_ssm(SCI_COUNTCHARACTERS, 0, from);
- rc = teco_bool(from_glyphs+len_glyphs >= 0);
- } else {
- rc = teco_bool(to > 0);
- }
}
if (len < 0) {
len *= -1;
@@ -925,13 +911,11 @@ teco_state_start_kill(teco_machine_main_t *ctx, const gchar *cmd, gboolean by_li
}
} else {
teco_int_t to_glyphs = teco_expressions_pop_num(0);
- sptr_t to = teco_interface_ssm(SCI_POSITIONRELATIVE, 0, to_glyphs);
+ gssize to = teco_glyphs2bytes(to_glyphs);
teco_int_t from_glyphs = teco_expressions_pop_num(0);
- from = teco_interface_ssm(SCI_POSITIONRELATIVE, 0, from_glyphs);
+ from = teco_glyphs2bytes(from_glyphs);
len = to - from;
- /* see teco_validate_pos(): here we are just saving SCI_POSITIONRELATIVE calls */
- rc = teco_bool(len >= 0 && (!from_glyphs || (from_glyphs > 0 && from > 0)) &&
- (!to_glyphs || (to_glyphs > 0 && to > 0)));
+ rc = teco_bool(len >= 0 && from >= 0 && to >= 0);
}
if (teco_machine_main_eval_colon(ctx)) {
@@ -1050,17 +1034,10 @@ teco_state_start_get(teco_machine_main_t *ctx, GError **error)
return;
sptr_t pos = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
- sptr_t get_pos = teco_interface_ssm(SCI_POSITIONRELATIVE, pos, v);
+ gssize get_pos = teco_glyphs2bytes_relative(pos, v);
sptr_t len = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
- if (v <= 0) {
- /* SCI_POSITIONRELATIVE may return 0 even if the offset is valid */
- sptr_t dot = teco_interface_ssm(SCI_COUNTCHARACTERS, 0, pos);
- if (dot+v < 0) {
- teco_error_range_set(error, "A");
- return;
- }
- } else if (!get_pos || get_pos == len) {
+ if (get_pos < 0 || get_pos == len) {
teco_error_range_set(error, "A");
return;
}
diff --git a/src/interface.c b/src/interface.c
index 2e2d64e..e21cbb4 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -118,3 +118,87 @@ teco_interface_process_notify(SCNotification *notify)
g_printf("SCINTILLA NOTIFY: code=%d\n", notify->nmhdr.code);
#endif
}
+
+/**
+ * Convert a glyph index to a byte offset as used by Scintilla.
+ *
+ * This is optimized with the "line character index",
+ * which must always be enabled in UTF-8 documents.
+ * If the current view does not report SC_LINECHARACTERINDEX_UTF32,
+ * the document is assumed to be single-byte encoded, so the glyph
+ * index is used as the byte offset and only checked against the
+ * document length.
+ * Otherwise the line containing the glyph is looked up via the index
+ * and only the remaining glyphs are resolved relative to the
+ * beginning of that line.
+ *
+ * It is also used to validate glyph indexes.
+ *
+ * @param pos Position in glyphs/characters.
+ * @return Position in bytes or -1 if pos is out of bounds.
+ */
+gssize
+teco_glyphs2bytes(teco_int_t pos)
+{
+ if (pos < 0)
+ return -1; /* invalid position */
+ if (!pos)
+ return 0; /* answered without querying Scintilla */
+
+ if (!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0) &
+ SC_LINECHARACTERINDEX_UTF32))
+ /* no UTF-32 character index: assume single-byte encoding */
+ return pos <= teco_interface_ssm(SCI_GETLENGTH, 0, 0) ? pos : -1;
+
+ sptr_t line = teco_interface_ssm(SCI_LINEFROMINDEXPOSITION, pos,
+ SC_LINECHARACTERINDEX_UTF32);
+ sptr_t line_bytes = teco_interface_ssm(SCI_POSITIONFROMLINE, line, 0); /* byte offset of the line start */
+ pos -= teco_interface_ssm(SCI_INDEXPOSITIONFROMLINE, line,
+ SC_LINECHARACTERINDEX_UTF32); /* pos is now relative to the line start */
+ return teco_interface_ssm(SCI_POSITIONRELATIVE, line_bytes, pos) ? : -1; /* a zero result is reported as out of bounds (-1) */
+}
+
+/**
+ * Convert byte offset to glyph/character index without bounds checking.
+ *
+ * If the current view reports SC_LINECHARACTERINDEX_UTF32, the result
+ * is the glyph index of the beginning of the line containing pos plus
+ * the number of characters counted between that line start and pos.
+ * Otherwise the document is assumed to be single-byte encoded
+ * and pos is returned unchanged.
+ *
+ * @param pos Position in bytes. It is not validated.
+ * @return Position in glyphs/characters.
+ */
+teco_int_t
+teco_bytes2glyphs(gsize pos)
+{
+ if (!pos)
+ return 0; /* answered without querying Scintilla */
+
+ if (!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0) &
+ SC_LINECHARACTERINDEX_UTF32))
+ /* no UTF-32 character index: assume single-byte encoding */
+ return pos;
+
+ sptr_t line = teco_interface_ssm(SCI_LINEFROMPOSITION, pos, 0);
+ sptr_t line_bytes = teco_interface_ssm(SCI_POSITIONFROMLINE, line, 0); /* byte offset of the line start */
+ return teco_interface_ssm(SCI_INDEXPOSITIONFROMLINE, line,
+ SC_LINECHARACTERINDEX_UTF32) +
+ teco_interface_ssm(SCI_COUNTCHARACTERS, line_bytes, pos); /* only characters on the line of pos are counted */
+}
+
+#define TECO_RELATIVE_LIMIT 1024 /* largest absolute delta still resolved by counting from pos */
+
+/**
+ * Convert a glyph index relative to a byte position to
+ * a byte position.
+ *
+ * Can be used to implement commands with relative character
+ * ranges.
+ * As an optimization, this always counts characters for deltas
+ * whose absolute value does not exceed TECO_RELATIVE_LIMIT,
+ * so it will be fast even where the character-index based
+ * lookup is too slow (as on exceedingly long lines).
+ * Larger deltas are resolved via teco_bytes2glyphs() and
+ * teco_glyphs2bytes().
+ *
+ * NOTE(review): ABS(n) and the addition of n to the glyph index
+ * are not guarded against signed overflow - confirm that callers
+ * cannot pass extreme values of teco_int_t.
+ *
+ * @param pos Byte position to start.
+ * @param n Number of glyphs/characters to the left (negative) or
+ * right (positive) of pos.
+ * @return Position in bytes or -1 if the resulting position is out of bounds.
+ */
+gssize
+teco_glyphs2bytes_relative(gsize pos, teco_int_t n)
+{
+ if (!n)
+ return pos; /* pos is returned as is, without validation */
+ if (ABS(n) > TECO_RELATIVE_LIMIT)
+ return teco_glyphs2bytes(teco_bytes2glyphs(pos) + n); /* large delta: go through absolute glyph indexes */
+
+ sptr_t res = teco_interface_ssm(SCI_POSITIONRELATIVE, pos, n);
+ /* SCI_POSITIONRELATIVE may return 0 even if the offset is valid.
+  * A non-zero result is returned as is.
+  * A zero result for n > 0 is reported as out of bounds.
+  * A zero result for n < 0 is the valid byte offset 0 only if the
+  * glyph index of pos plus n is not negative. */
+ return res ? : n > 0 ? -1 : teco_bytes2glyphs(pos)+n >= 0 ? 0 : -1;
+}
diff --git a/src/interface.h b/src/interface.h
index b6c015b..6a391aa 100644
--- a/src/interface.h
+++ b/src/interface.h
@@ -160,12 +160,9 @@ void teco_interface_cleanup(void);
* since sciteco.h should not depend on interface.h.
*/
-// FIXME: Should probably return the byte offset
-static inline gboolean
-teco_validate_pos(teco_int_t n)
-{
- return !n || (n > 0 && teco_interface_ssm(SCI_POSITIONRELATIVE, 0, n) > 0);
-}
+gssize teco_glyphs2bytes(teco_int_t pos);
+teco_int_t teco_bytes2glyphs(gsize pos);
+gssize teco_glyphs2bytes_relative(gsize pos, teco_int_t n);
static inline gboolean
teco_validate_line(teco_int_t n)
diff --git a/src/qreg-commands.c b/src/qreg-commands.c
index 1131c96..34f3164 100644
--- a/src/qreg-commands.c
+++ b/src/qreg-commands.c
@@ -678,7 +678,7 @@ teco_state_copytoqreg_got_register(teco_machine_main_t *ctx, teco_qreg_t *qreg,
if (ctx->mode > TECO_MODE_NORMAL)
return &teco_state_start;
- teco_int_t from, len;
+ gssize from, len; /* in bytes */
if (!teco_expressions_eval(FALSE, error))
return NULL;
@@ -702,12 +702,11 @@ teco_state_copytoqreg_got_register(teco_machine_main_t *ctx, teco_qreg_t *qreg,
len *= -1;
}
} else {
- teco_int_t to = teco_expressions_pop_num(0);
- from = teco_expressions_pop_num(0);
-
+ gssize to = teco_glyphs2bytes(teco_expressions_pop_num(0));
+ from = teco_glyphs2bytes(teco_expressions_pop_num(0));
len = to - from;
- if (len < 0 || !teco_validate_pos(from) || !teco_validate_pos(to)) {
+ if (len < 0 || from < 0 || to < 0) {
teco_error_range_set(error, "X");
return NULL;
}
diff --git a/src/search.c b/src/search.c
index 46407d0..88b0e16 100644
--- a/src/search.c
+++ b/src/search.c
@@ -38,11 +38,8 @@
#include "search.h"
typedef struct {
- /*
- * FIXME: Should perhaps all be teco_int_t?
- */
- gint dot;
- gint from, to;
+ gssize dot;
+ gssize from, to;
gint count;
teco_buffer_t *from_buffer, *to_buffer;
@@ -79,16 +76,16 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error)
return FALSE;
if (v1 <= v2) {
teco_search_parameters.count = 1;
- teco_search_parameters.from = (gint)v1;
- teco_search_parameters.to = (gint)v2;
+ teco_search_parameters.from = teco_glyphs2bytes(v1);
+ teco_search_parameters.to = teco_glyphs2bytes(v2);
} else {
teco_search_parameters.count = -1;
- teco_search_parameters.from = (gint)v2;
- teco_search_parameters.to = (gint)v1;
+ teco_search_parameters.from = teco_glyphs2bytes(v2);
+ teco_search_parameters.to = teco_glyphs2bytes(v1);
}
- if (!teco_validate_pos(teco_search_parameters.from) ||
- !teco_validate_pos(teco_search_parameters.to)) {
+ if (teco_search_parameters.from < 0 ||
+ teco_search_parameters.to < 0) {
/*
* FIXME: In derived classes, the command name will
* no longer be correct.
diff --git a/src/spawn.c b/src/spawn.c
index 671f493..c1fb426 100644
--- a/src/spawn.c
+++ b/src/spawn.c
@@ -76,8 +76,8 @@ static struct {
GSource *stdin_src, *stdout_src;
gboolean interrupted;
- teco_int_t from, to;
- teco_int_t start;
+ gssize from, to;
+ gsize start;
gboolean text_added;
teco_eol_writer_t stdin_writer;
@@ -202,15 +202,17 @@ teco_state_execute_initial(teco_machine_main_t *ctx, GError **error)
break;
}
- default:
+ default: {
/* pipe and replace character range */
- if (!teco_expressions_pop_num_calc(&teco_spawn_ctx.to, 0, error) ||
- !teco_expressions_pop_num_calc(&teco_spawn_ctx.from, 0, error))
+ teco_int_t from, to;
+ if (!teco_expressions_pop_num_calc(&to, 0, error) ||
+ !teco_expressions_pop_num_calc(&from, 0, error))
return FALSE;
+ teco_spawn_ctx.from = teco_glyphs2bytes(from);
+ teco_spawn_ctx.to = teco_glyphs2bytes(to);
rc = teco_bool(teco_spawn_ctx.from <= teco_spawn_ctx.to &&
- teco_validate_pos(teco_spawn_ctx.from) &&
- teco_validate_pos(teco_spawn_ctx.to));
- break;
+ teco_spawn_ctx.from >= 0 && teco_spawn_ctx.to >= 0);
+ }
}
if (teco_is_failure(rc)) {
diff --git a/src/view.c b/src/view.c
index fffe88c..6fecdc0 100644
--- a/src/view.c
+++ b/src/view.c
@@ -112,6 +112,12 @@ teco_view_setup(teco_view_t *ctx)
teco_view_ssm(ctx, SCI_STYLESETBACK, STYLE_CALLTIP, 0xFFFFFF);
/*
+ * Documents are UTF-8 by default and all UTF-8 documents
+ * are expected to have a character index.
+ */
+ teco_view_ssm(ctx, SCI_ALLOCATELINECHARACTERINDEX, SC_LINECHARACTERINDEX_UTF32, 0);
+
+ /*
* Since we have patched out Scintilla's original SetRepresentations(),
* it no longer resets them on SCI_SETDOCPOINTER.
* Therefore it is sufficient for all kinds of views to initialize