From 7c592561af3bbbad2eaf865247811ba2bd590c2e Mon Sep 17 00:00:00 2001 From: Robin Haberkorn Date: Thu, 29 Aug 2024 01:56:50 +0200 Subject: Glyph to byte offset mapping is now using the line character index (refs #5) * This works reasonably well unless lines are exceedingly long (as on a line we always count characters). The following test case is still slow (on Unicode buffers): 10000<@I/XX/> <%a-1:J;> While the following is now also fast: 10000<@I/X^J/> <%a-1:J;> * Commands with relative character offsets (C, R, A, D) have a special optimization where they always count characters beginning at dot, as long as the argument is now exceedingly large. This means they are fast even on exceedingly long lines. * The remaining commands (search, EC/EG, Xq) now accept glyph indexes. --- src/interface.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'src/interface.c') diff --git a/src/interface.c b/src/interface.c index 2e2d64e..e21cbb4 100644 --- a/src/interface.c +++ b/src/interface.c @@ -118,3 +118,87 @@ teco_interface_process_notify(SCNotification *notify) g_printf("SCINTILLA NOTIFY: code=%d\n", notify->nmhdr.code); #endif } + +/** + * Convert a glyph index to a byte offset as used by Scintilla. + * + * This is optimized with the "line character index", + * which must always be enabled in UTF-8 documents. + * + * It is also used to validate glyph indexes. + * + * @param pos Position in glyphs/characters. + * @return Position in bytes or -1 if pos is out of bounds. + */ +gssize +teco_glyphs2bytes(teco_int_t pos) +{ + if (pos < 0) + return -1; /* invalid position */ + if (!pos) + return 0; + + if (!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0) & + SC_LINECHARACTERINDEX_UTF32)) + /* assume single-byte encoding */ + return pos <= teco_interface_ssm(SCI_GETLENGTH, 0, 0) ? pos : -1; + + sptr_t line = teco_interface_ssm(SCI_LINEFROMINDEXPOSITION, pos, + SC_LINECHARACTERINDEX_UTF32); + sptr_t line_bytes = teco_interface_ssm(SCI_POSITIONFROMLINE, line, 0); + pos -= teco_interface_ssm(SCI_INDEXPOSITIONFROMLINE, line, + SC_LINECHARACTERINDEX_UTF32); + return teco_interface_ssm(SCI_POSITIONRELATIVE, line_bytes, pos) ? : -1; +} + +/** + * Convert byte offset to glyph/character index without bounds checking. + */ +teco_int_t +teco_bytes2glyphs(gsize pos) +{ + if (!pos) + return 0; + + if (!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0) & + SC_LINECHARACTERINDEX_UTF32)) + /* assume single-byte encoding */ + return pos; + + sptr_t line = teco_interface_ssm(SCI_LINEFROMPOSITION, pos, 0); + sptr_t line_bytes = teco_interface_ssm(SCI_POSITIONFROMLINE, line, 0); + return teco_interface_ssm(SCI_INDEXPOSITIONFROMLINE, line, + SC_LINECHARACTERINDEX_UTF32) + + teco_interface_ssm(SCI_COUNTCHARACTERS, line_bytes, pos); +} + +#define TECO_RELATIVE_LIMIT 1024 + +/** + * Convert a glyph index relative to a byte position to + * a byte position. + * + * Can be used to implement commands with relative character + * ranges. + * As an optimization, this always counts characters for deltas + * smaller than TECO_RELATIVE_LIMIT, so it will be fast + * even where the character-index based lookup is too slow + * (as on exceedingly long lines). + * + * @param pos Byte position to start. + * @param n Number of glyphs/characters to the left (negative) or + * right (positive) of pos. + * @return Position in bytes or -1 if the resulting position is out of bounds. + */ +gssize +teco_glyphs2bytes_relative(gsize pos, teco_int_t n) +{ + if (!n) + return pos; + if (ABS(n) > TECO_RELATIVE_LIMIT) + return teco_glyphs2bytes(teco_bytes2glyphs(pos) + n); + + sptr_t res = teco_interface_ssm(SCI_POSITIONRELATIVE, pos, n); + /* SCI_POSITIONRELATIVE may return 0 even if the offset is valid */ + return res ? : n > 0 ? -1 : teco_bytes2glyphs(pos)+n >= 0 ? 0 : -1; +} -- cgit v1.2.3