diff options
author | Robin Haberkorn <robin.haberkorn@googlemail.com> | 2024-08-29 01:56:50 +0200 |
---|---|---|
committer | Robin Haberkorn <robin.haberkorn@googlemail.com> | 2024-09-09 18:22:21 +0200 |
commit | 7c592561af3bbbad2eaf865247811ba2bd590c2e (patch) | |
tree | 75b27cf40fb9ba8d646eb00e05be5f91d116b493 | |
parent | c71ed30cf0c554d288edfe87842082cc9ec393a7 (diff) | |
download | sciteco-7c592561af3bbbad2eaf865247811ba2bd590c2e.tar.gz |
Glyph to byte offset mapping is now using the line character index (refs #5)
* This works reasonably well unless lines are exceedingly long
(since within a single line we still always count characters).
The following test case is still slow (on Unicode buffers):
10000<@I/XX/> <%a-1:J;>
While the following is now also fast:
10000<@I/X^J/> <%a-1:J;>
* Commands with relative character offsets (C, R, A, D) have
a special optimization where they always count characters beginning
at dot, as long as the argument is not exceedingly large.
This means they are fast even on exceedingly long lines.
* The remaining commands (search, EC/EG, Xq) now accept glyph indexes.
-rw-r--r-- | src/core-commands.c | 53 | ||||
-rw-r--r-- | src/interface.c | 84 | ||||
-rw-r--r-- | src/interface.h | 9 | ||||
-rw-r--r-- | src/qreg-commands.c | 9 | ||||
-rw-r--r-- | src/search.c | 19 | ||||
-rw-r--r-- | src/spawn.c | 18 | ||||
-rw-r--r-- | src/view.c | 6 |
7 files changed, 130 insertions, 68 deletions
diff --git a/src/core-commands.c b/src/core-commands.c index 1c315fe..7e6dbfa 100644 --- a/src/core-commands.c +++ b/src/core-commands.c @@ -130,7 +130,7 @@ teco_state_start_dot(teco_machine_main_t *ctx, GError **error) if (!teco_expressions_eval(FALSE, error)) return; sptr_t pos = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0); - teco_expressions_push(teco_interface_ssm(SCI_COUNTCHARACTERS, 0, pos)); + teco_expressions_push(teco_bytes2glyphs(pos)); } /*$ Z size @@ -147,7 +147,7 @@ teco_state_start_zed(teco_machine_main_t *ctx, GError **error) if (!teco_expressions_eval(FALSE, error)) return; sptr_t pos = teco_interface_ssm(SCI_GETLENGTH, 0, 0); - teco_expressions_push(teco_interface_ssm(SCI_COUNTCHARACTERS, 0, pos)); + teco_expressions_push(teco_bytes2glyphs(pos)); } /*$ H @@ -165,7 +165,7 @@ teco_state_start_range(teco_machine_main_t *ctx, GError **error) return; teco_expressions_push(0); sptr_t pos = teco_interface_ssm(SCI_GETLENGTH, 0, 0); - teco_expressions_push(teco_interface_ssm(SCI_COUNTCHARACTERS, 0, pos)); + teco_expressions_push(teco_bytes2glyphs(pos)); } /*$ \[rs] @@ -514,9 +514,8 @@ teco_state_start_jump(teco_machine_main_t *ctx, GError **error) if (!teco_expressions_pop_num_calc(&v, 0, error)) return; - sptr_t pos = teco_interface_ssm(SCI_POSITIONRELATIVE, 0, v); - /* see teco_validate_pos(): this is saving SCI_POSITIONRELATIVE calls */ - if (!v || (v > 0 && pos > 0)) { + gssize pos = teco_glyphs2bytes(v); + if (pos >= 0) { if (teco_current_doc_must_undo()) undo__teco_interface_ssm(SCI_GOTOPOS, teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0), 0); @@ -536,16 +535,9 @@ static teco_bool_t teco_move_chars(teco_int_t n) { sptr_t pos = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0); - sptr_t next_pos = teco_interface_ssm(SCI_POSITIONRELATIVE, pos, n); - - if (n <= 0) { - /* SCI_POSITIONRELATIVE may return 0 even if the offset is valid */ - sptr_t dot = teco_interface_ssm(SCI_COUNTCHARACTERS, 0, pos); - if (dot+n < 0) - return TECO_FAILURE; - } else if (!next_pos) { + 
gssize next_pos = teco_glyphs2bytes_relative(pos, n); + if (next_pos < 0) return TECO_FAILURE; - } teco_interface_ssm(SCI_GOTOPOS, next_pos, 0); if (teco_current_doc_must_undo()) @@ -891,7 +883,7 @@ static gboolean teco_state_start_kill(teco_machine_main_t *ctx, const gchar *cmd, gboolean by_lines, GError **error) { teco_bool_t rc; - sptr_t from, len; /* in bytes */ + gssize from, len; /* in bytes */ if (!teco_expressions_eval(FALSE, error)) return FALSE; @@ -909,15 +901,9 @@ teco_state_start_kill(teco_machine_main_t *ctx, const gchar *cmd, gboolean by_li teco_int_t len_glyphs; if (!teco_expressions_pop_num_calc(&len_glyphs, teco_num_sign, error)) return FALSE; - sptr_t to = teco_interface_ssm(SCI_POSITIONRELATIVE, from, len_glyphs); + gssize to = teco_glyphs2bytes_relative(from, len_glyphs); + rc = teco_bool(to >= 0); len = to-from; - if (len_glyphs <= 0) { - /* SCI_POSITIONRELATIVE may return 0 even if the offset is valid */ - sptr_t from_glyphs = teco_interface_ssm(SCI_COUNTCHARACTERS, 0, from); - rc = teco_bool(from_glyphs+len_glyphs >= 0); - } else { - rc = teco_bool(to > 0); - } } if (len < 0) { len *= -1; @@ -925,13 +911,11 @@ teco_state_start_kill(teco_machine_main_t *ctx, const gchar *cmd, gboolean by_li } } else { teco_int_t to_glyphs = teco_expressions_pop_num(0); - sptr_t to = teco_interface_ssm(SCI_POSITIONRELATIVE, 0, to_glyphs); + gssize to = teco_glyphs2bytes(to_glyphs); teco_int_t from_glyphs = teco_expressions_pop_num(0); - from = teco_interface_ssm(SCI_POSITIONRELATIVE, 0, from_glyphs); + from = teco_glyphs2bytes(from_glyphs); len = to - from; - /* see teco_validate_pos(): here we are just saving SCI_POSITIONRELATIVE calls */ - rc = teco_bool(len >= 0 && (!from_glyphs || (from_glyphs > 0 && from > 0)) && - (!to_glyphs || (to_glyphs > 0 && to > 0))); + rc = teco_bool(len >= 0 && from >= 0 && to >= 0); } if (teco_machine_main_eval_colon(ctx)) { @@ -1050,17 +1034,10 @@ teco_state_start_get(teco_machine_main_t *ctx, GError **error) return; sptr_t pos 
= teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0); - sptr_t get_pos = teco_interface_ssm(SCI_POSITIONRELATIVE, pos, v); + gssize get_pos = teco_glyphs2bytes_relative(pos, v); sptr_t len = teco_interface_ssm(SCI_GETLENGTH, 0, 0); - if (v <= 0) { - /* SCI_POSITIONRELATIVE may return 0 even if the offset is valid */ - sptr_t dot = teco_interface_ssm(SCI_COUNTCHARACTERS, 0, pos); - if (dot+v < 0) { - teco_error_range_set(error, "A"); - return; - } - } else if (!get_pos || get_pos == len) { + if (get_pos < 0 || get_pos == len) { teco_error_range_set(error, "A"); return; } diff --git a/src/interface.c b/src/interface.c index 2e2d64e..e21cbb4 100644 --- a/src/interface.c +++ b/src/interface.c @@ -118,3 +118,87 @@ teco_interface_process_notify(SCNotification *notify) g_printf("SCINTILLA NOTIFY: code=%d\n", notify->nmhdr.code); #endif } + +/** + * Convert a glyph index to a byte offset as used by Scintilla. + * + * This is optimized with the "line character index", + * which must always be enabled in UTF-8 documents. + * + * It is also used to validate glyph indexes. + * + * @param pos Position in glyphs/characters. + * @return Position in bytes or -1 if pos is out of bounds. + */ +gssize +teco_glyphs2bytes(teco_int_t pos) +{ + if (pos < 0) + return -1; /* invalid position */ + if (!pos) + return 0; + + if (!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0) & + SC_LINECHARACTERINDEX_UTF32)) + /* assume single-byte encoding */ + return pos <= teco_interface_ssm(SCI_GETLENGTH, 0, 0) ? pos : -1; + + sptr_t line = teco_interface_ssm(SCI_LINEFROMINDEXPOSITION, pos, + SC_LINECHARACTERINDEX_UTF32); + sptr_t line_bytes = teco_interface_ssm(SCI_POSITIONFROMLINE, line, 0); + pos -= teco_interface_ssm(SCI_INDEXPOSITIONFROMLINE, line, + SC_LINECHARACTERINDEX_UTF32); + return teco_interface_ssm(SCI_POSITIONRELATIVE, line_bytes, pos) ? : -1; +} + +/** + * Convert byte offset to glyph/character index without bounds checking. 
+ */ +teco_int_t +teco_bytes2glyphs(gsize pos) +{ + if (!pos) + return 0; + + if (!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0) & + SC_LINECHARACTERINDEX_UTF32)) + /* assume single-byte encoding */ + return pos; + + sptr_t line = teco_interface_ssm(SCI_LINEFROMPOSITION, pos, 0); + sptr_t line_bytes = teco_interface_ssm(SCI_POSITIONFROMLINE, line, 0); + return teco_interface_ssm(SCI_INDEXPOSITIONFROMLINE, line, + SC_LINECHARACTERINDEX_UTF32) + + teco_interface_ssm(SCI_COUNTCHARACTERS, line_bytes, pos); +} + +#define TECO_RELATIVE_LIMIT 1024 + +/** + * Convert a glyph index relative to a byte position to + * a byte position. + * + * Can be used to implement commands with relative character + * ranges. + * As an optimization, this always counts characters for deltas + * smaller than TECO_RELATIVE_LIMIT, so it will be fast + * even where the character-index based lookup is too slow + * (as on exceedingly long lines). + * + * @param pos Byte position to start. + * @param n Number of glyphs/characters to the left (negative) or + * right (positive) of pos. + * @return Position in bytes or -1 if the resulting position is out of bounds. + */ +gssize +teco_glyphs2bytes_relative(gsize pos, teco_int_t n) +{ + if (!n) + return pos; + if (ABS(n) > TECO_RELATIVE_LIMIT) + return teco_glyphs2bytes(teco_bytes2glyphs(pos) + n); + + sptr_t res = teco_interface_ssm(SCI_POSITIONRELATIVE, pos, n); + /* SCI_POSITIONRELATIVE may return 0 even if the offset is valid */ + return res ? : n > 0 ? -1 : teco_bytes2glyphs(pos)+n >= 0 ? 0 : -1; +} diff --git a/src/interface.h b/src/interface.h index b6c015b..6a391aa 100644 --- a/src/interface.h +++ b/src/interface.h @@ -160,12 +160,9 @@ void teco_interface_cleanup(void); * since sciteco.h should not depend on interface.h. 
*/ -// FIXME: Should probably return the byte offset -static inline gboolean -teco_validate_pos(teco_int_t n) -{ - return !n || (n > 0 && teco_interface_ssm(SCI_POSITIONRELATIVE, 0, n) > 0); -} +gssize teco_glyphs2bytes(teco_int_t pos); +teco_int_t teco_bytes2glyphs(gsize pos); +gssize teco_glyphs2bytes_relative(gsize pos, teco_int_t n); static inline gboolean teco_validate_line(teco_int_t n) diff --git a/src/qreg-commands.c b/src/qreg-commands.c index 1131c96..34f3164 100644 --- a/src/qreg-commands.c +++ b/src/qreg-commands.c @@ -678,7 +678,7 @@ teco_state_copytoqreg_got_register(teco_machine_main_t *ctx, teco_qreg_t *qreg, if (ctx->mode > TECO_MODE_NORMAL) return &teco_state_start; - teco_int_t from, len; + gssize from, len; /* in bytes */ if (!teco_expressions_eval(FALSE, error)) return NULL; @@ -702,12 +702,11 @@ teco_state_copytoqreg_got_register(teco_machine_main_t *ctx, teco_qreg_t *qreg, len *= -1; } } else { - teco_int_t to = teco_expressions_pop_num(0); - from = teco_expressions_pop_num(0); - + gssize to = teco_glyphs2bytes(teco_expressions_pop_num(0)); + from = teco_glyphs2bytes(teco_expressions_pop_num(0)); len = to - from; - if (len < 0 || !teco_validate_pos(from) || !teco_validate_pos(to)) { + if (len < 0 || from < 0 || to < 0) { teco_error_range_set(error, "X"); return NULL; } diff --git a/src/search.c b/src/search.c index 46407d0..88b0e16 100644 --- a/src/search.c +++ b/src/search.c @@ -38,11 +38,8 @@ #include "search.h" typedef struct { - /* - * FIXME: Should perhaps all be teco_int_t? 
- */ - gint dot; - gint from, to; + gssize dot; + gssize from, to; gint count; teco_buffer_t *from_buffer, *to_buffer; @@ -79,16 +76,16 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error) return FALSE; if (v1 <= v2) { teco_search_parameters.count = 1; - teco_search_parameters.from = (gint)v1; - teco_search_parameters.to = (gint)v2; + teco_search_parameters.from = teco_glyphs2bytes(v1); + teco_search_parameters.to = teco_glyphs2bytes(v2); } else { teco_search_parameters.count = -1; - teco_search_parameters.from = (gint)v2; - teco_search_parameters.to = (gint)v1; + teco_search_parameters.from = teco_glyphs2bytes(v2); + teco_search_parameters.to = teco_glyphs2bytes(v1); } - if (!teco_validate_pos(teco_search_parameters.from) || - !teco_validate_pos(teco_search_parameters.to)) { + if (teco_search_parameters.from < 0 || + teco_search_parameters.to < 0) { /* * FIXME: In derived classes, the command name will * no longer be correct. diff --git a/src/spawn.c b/src/spawn.c index 671f493..c1fb426 100644 --- a/src/spawn.c +++ b/src/spawn.c @@ -76,8 +76,8 @@ static struct { GSource *stdin_src, *stdout_src; gboolean interrupted; - teco_int_t from, to; - teco_int_t start; + gssize from, to; + gsize start; gboolean text_added; teco_eol_writer_t stdin_writer; @@ -202,15 +202,17 @@ teco_state_execute_initial(teco_machine_main_t *ctx, GError **error) break; } - default: + default: { /* pipe and replace character range */ - if (!teco_expressions_pop_num_calc(&teco_spawn_ctx.to, 0, error) || - !teco_expressions_pop_num_calc(&teco_spawn_ctx.from, 0, error)) + teco_int_t from, to; + if (!teco_expressions_pop_num_calc(&to, 0, error) || + !teco_expressions_pop_num_calc(&from, 0, error)) return FALSE; + teco_spawn_ctx.from = teco_glyphs2bytes(from); + teco_spawn_ctx.to = teco_glyphs2bytes(to); rc = teco_bool(teco_spawn_ctx.from <= teco_spawn_ctx.to && - teco_validate_pos(teco_spawn_ctx.from) && - teco_validate_pos(teco_spawn_ctx.to)); - break; + teco_spawn_ctx.from >= 0 
&& teco_spawn_ctx.to >= 0); + } } if (teco_is_failure(rc)) { @@ -112,6 +112,12 @@ teco_view_setup(teco_view_t *ctx) teco_view_ssm(ctx, SCI_STYLESETBACK, STYLE_CALLTIP, 0xFFFFFF); /* + * Documents are UTF-8 by default and all UTF-8 documents + * are expected to have a character index. + */ + teco_view_ssm(ctx, SCI_ALLOCATELINECHARACTERINDEX, SC_LINECHARACTERINDEX_UTF32, 0); + + /* * Since we have patched out Scintilla's original SetRepresentations(), * it no longer resets them on SCI_SETDOCPOINTER. * Therefore it is sufficient for all kinds of views to initialize |