diff options
Diffstat (limited to 'src/view.c')
-rw-r--r-- | src/view.c | 205 |
1 files changed, 196 insertions, 9 deletions
@@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2023 Robin Haberkorn + * Copyright (C) 2012-2024 Robin Haberkorn * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -45,6 +45,7 @@ #include "error.h" #include "qreg.h" #include "eol.h" +#include "memory.h" #include "view.h" /** @memberof teco_view_t */ @@ -72,6 +73,27 @@ teco_view_setup(teco_view_t *ctx) */ teco_view_ssm(ctx, SCI_SETMARGINWIDTHN, 1, 0); + if (teco_ed & TECO_ED_DEFAULT_ANSI) { + /* + * Configure a single-byte codepage/charset. + * This requires setting it on all of the possible styles. + * Fortunately, we can do it before SCI_STYLECLEARALL. + * This is important only for display purposes - other than that + * all single-byte encodings are handled the same. + */ + teco_view_ssm(ctx, SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, SC_CHARSET_ANSI); + /* 0 is used for ALL single-byte encodings */ + teco_view_ssm(ctx, SCI_SETCODEPAGE, 0, 0); + } else { + /* + * Documents are UTF-8 by default and all UTF-8 documents + * are expected to have a character index. + * This is a property of the document, instead of the view. + */ + teco_view_ssm(ctx, SCI_ALLOCATELINECHARACTERINDEX, + SC_LINECHARACTERINDEX_UTF32, 0); + } + /* * Set some basic styles in order to provide * a consistent look across UIs if no profile @@ -137,6 +159,28 @@ teco_view_set_representations(teco_view_t *ctx) gchar buf[] = {(gchar)cc, '\0'}; teco_view_ssm(ctx, SCI_SETREPRESENTATION, (uptr_t)buf, (sptr_t)reps[cc]); } + + if (teco_ed & TECO_ED_DEFAULT_ANSI) { + /* + * Non-ANSI chars should be visible somehow. + * This would best be done always when changing the + * encoding to 0, but it would be kind of expensive. + * + * FIXME: On the other hand, this could cause problems + * when setting SC_CP_UTF8 later on. 
+ */ + for (guint cc = 0x80; cc <= 0xFF; cc++) { + gchar buf[] = {(gchar)cc, '\0'}; + gchar rep[2+1]; + /* + * Hexadecimal is poorly supported in SciTECO, but + * multiple decimal numbers one after another look + * confusing, esp. in Curses. + */ + g_snprintf(rep, sizeof(rep), "%02X", cc); + teco_view_ssm(ctx, SCI_SETREPRESENTATION, (uptr_t)buf, (sptr_t)rep); + } + } } /** @@ -161,6 +205,9 @@ teco_view_set_representations(teco_view_t *ctx) gboolean teco_view_load_from_channel(teco_view_t *ctx, GIOChannel *channel, GError **error) { + g_auto(teco_eol_reader_t) reader; + teco_eol_reader_init_gio(&reader, channel); + teco_view_ssm(ctx, SCI_BEGINUNDOACTION, 0, 0); teco_view_ssm(ctx, SCI_CLEARALL, 0, 0); @@ -173,11 +220,11 @@ teco_view_load_from_channel(teco_view_t *ctx, GIOChannel *channel, GError **erro */ struct stat stat_buf = {.st_size = 0}; if (!fstat(g_io_channel_unix_get_fd(channel), &stat_buf) && - stat_buf.st_size > 0) + stat_buf.st_size > 0) { + if (!teco_memory_check(stat_buf.st_size, error)) + goto error; teco_view_ssm(ctx, SCI_ALLOCATE, stat_buf.st_size, 0); - - g_auto(teco_eol_reader_t) reader; - teco_eol_reader_init_gio(&reader, channel); + } for (;;) { /* @@ -187,14 +234,24 @@ teco_view_load_from_channel(teco_view_t *ctx, GIOChannel *channel, GError **erro teco_string_t str; GIOStatus rc = teco_eol_reader_convert(&reader, &str.data, &str.len, error); - if (rc == G_IO_STATUS_ERROR) { - teco_view_ssm(ctx, SCI_ENDUNDOACTION, 0, 0); - return FALSE; - } + if (rc == G_IO_STATUS_ERROR) + goto error; if (rc == G_IO_STATUS_EOF) break; teco_view_ssm(ctx, SCI_APPENDTEXT, str.len, (sptr_t)str.data); + + /* + * Even if we checked initially, knowing the file size, + * Scintilla could allocate much more bytes. 
+ */ + if (!teco_memory_check(0, error)) + goto error; + + if (G_UNLIKELY(teco_interface_is_interrupted())) { + teco_error_interrupted_set(error); + goto error; + } } /* @@ -216,6 +273,10 @@ teco_view_load_from_channel(teco_view_t *ctx, GIOChannel *channel, GError **erro teco_view_ssm(ctx, SCI_ENDUNDOACTION, 0, 0); return TRUE; + +error: + teco_view_ssm(ctx, SCI_ENDUNDOACTION, 0, 0); + return FALSE; } /** @@ -449,3 +510,129 @@ teco_view_save_to_file(teco_view_t *ctx, const gchar *filename, GError **error) return TRUE; }
+
+/**
+ * Translate a glyph (character) index into a byte offset
+ * as used by Scintilla.
+ *
+ * The lookup is optimized with the "line character index",
+ * which must always be enabled in UTF-8 documents:
+ * The line containing the glyph is found via the index and only
+ * the remainder within that line is resolved by counting characters.
+ * Documents without a UTF-32 character index are assumed to be
+ * in a single-byte encoding, where glyph index and byte offset coincide.
+ *
+ * Since out-of-range indexes result in -1, this function is
+ * also used to validate glyph indexes.
+ *
+ * @param ctx The view to operate on.
+ * @param pos Position in glyphs/characters.
+ * @return Position in bytes or -1 if pos is out of bounds.
+ */
+gssize
+teco_view_glyphs2bytes(teco_view_t *ctx, teco_int_t pos)
+{
+	if (pos <= 0)
+		/* glyph 0 is always byte 0, negative positions are invalid */
+		return pos ? -1 : 0;
+
+	sptr_t index_flags = teco_view_ssm(ctx, SCI_GETLINECHARACTERINDEX, 0, 0);
+	if (!(index_flags & SC_LINECHARACTERINDEX_UTF32)) {
+		/* no character index: assume single-byte encoding */
+		sptr_t doc_len = teco_view_ssm(ctx, SCI_GETLENGTH, 0, 0);
+		return pos > doc_len ? -1 : pos;
+	}
+
+	/* find the line containing the glyph and where that line starts */
+	sptr_t line = teco_view_ssm(ctx, SCI_LINEFROMINDEXPOSITION, pos,
+	                            SC_LINECHARACTERINDEX_UTF32);
+	sptr_t line_start_bytes = teco_view_ssm(ctx, SCI_POSITIONFROMLINE, line, 0);
+	sptr_t line_start_glyphs = teco_view_ssm(ctx, SCI_INDEXPOSITIONFROMLINE, line,
+	                                         SC_LINECHARACTERINDEX_UTF32);
+
+	/*
+	 * Only the glyphs within the line have to be counted.
+	 * A result of 0 is treated as out of bounds, since pos > 0 here.
+	 */
+	sptr_t byte_pos = teco_view_ssm(ctx, SCI_POSITIONRELATIVE, line_start_bytes,
+	                                pos - line_start_glyphs);
+	return byte_pos ? byte_pos : -1;
+}
+
+/**
+ * Convert byte offset to glyph/character index without bounds checking.
+ *
+ * In documents without a UTF-32 "line character index",
+ * a single-byte encoding is assumed and pos is returned unchanged.
+ *
+ * @param ctx The view to operate on.
+ * @param pos Position in bytes.
+ * @return Position in glyphs/characters.
+ */
+teco_int_t
+teco_view_bytes2glyphs(teco_view_t *ctx, gsize pos)
+{
+	if (!pos)
+		return 0;
+
+	if (!(teco_view_ssm(ctx, SCI_GETLINECHARACTERINDEX, 0, 0) &
+	      SC_LINECHARACTERINDEX_UTF32))
+		/* assume single-byte encoding */
+		return pos;
+
+	/*
+	 * The character index gives the glyph index of the line start,
+	 * so only the characters within the line have to be counted.
+	 */
+	sptr_t line = teco_view_ssm(ctx, SCI_LINEFROMPOSITION, pos, 0);
+	sptr_t line_bytes = teco_view_ssm(ctx, SCI_POSITIONFROMLINE, line, 0);
+	return teco_view_ssm(ctx, SCI_INDEXPOSITIONFROMLINE, line,
+	                     SC_LINECHARACTERINDEX_UTF32) +
+	       teco_view_ssm(ctx, SCI_COUNTCHARACTERS, line_bytes, pos);
+}
+
+#define TECO_RELATIVE_LIMIT 1024
+
+/**
+ * Convert a glyph index relative to a byte position to
+ * a byte position.
+ *
+ * Can be used to implement commands with relative character
+ * ranges.
+ * As an optimization, this always counts characters for deltas
+ * of up to TECO_RELATIVE_LIMIT glyphs, so it will be fast
+ * even where the character-index based lookup is too slow
+ * (as on exceedingly long lines).
+ *
+ * @param ctx The view to operate on.
+ * @param pos Byte position to start.
+ * @param n Number of glyphs/characters to the left (negative) or
+ *   right (positive) of pos.
+ * @return Position in bytes or -1 if the resulting position is out of bounds.
+ */
+gssize
+teco_view_glyphs2bytes_relative(teco_view_t *ctx, gsize pos, teco_int_t n)
+{
+	if (!n)
+		return pos;
+	/*
+	 * Compare against both bounds instead of using ABS(n):
+	 * negating the most negative integer is a signed overflow.
+	 */
+	if (n > TECO_RELATIVE_LIMIT || n < -TECO_RELATIVE_LIMIT)
+		return teco_view_glyphs2bytes(ctx, teco_view_bytes2glyphs(ctx, pos) + n);
+
+	sptr_t res = teco_view_ssm(ctx, SCI_POSITIONRELATIVE, pos, n);
+	/*
+	 * SCI_POSITIONRELATIVE may return 0 even if the offset is valid.
+	 * This can only be the case when moving left to the
+	 * very beginning of the document.
+	 */
+	return res ? : n > 0 ? -1 : teco_view_bytes2glyphs(ctx, pos)+n >= 0 ? 0 : -1;
+}
+
+/**
+ * Get codepoint at given byte offset.
+ *
+ * @param ctx The view to operate on.
+ * @param pos The glyph's byte position
+ * @param len The length of the document in bytes
+ * @return The requested codepoint.
+ *   In UTF-8 encoded documents, this might be -1 (invalid byte sequence)
+ *   or -2 (incomplete sequence).
+ */
+teco_int_t
+teco_view_get_character(teco_view_t *ctx, gsize pos, gsize len)
+{
+	if (teco_view_ssm(ctx, SCI_GETCODEPAGE, 0, 0) == SC_CP_UTF8) {
+		/* room for the longest UTF-8 sequence plus a terminating nul byte */
+		gchar utf8[4+1];
+		/* never request bytes beyond the end of the document */
+		gsize end = MIN(len, pos+sizeof(utf8)-1);
+		struct Sci_TextRangeFull range = {
+			.chrg = {pos, end},
+			.lpstrText = utf8
+		};
+
+		/*
+		 * Fetching the bytes in one go is probably faster than
+		 * SCI_GETRANGEPOINTER+SCI_GETGAPPOSITION
+		 * or repeatedly calling SCI_GETCHARAT.
+		 */
+		teco_view_ssm(ctx, SCI_GETTEXTRANGEFULL, 0, (sptr_t)&range);
+
+		/*
+		 * The cast to a signed 32-bit integer makes sure that
+		 * the -1/-2 error values are preserved.
+		 * The sign bit in UCS-4/UTF-32 is unused, so this will even
+		 * suffice if TECO_INTEGER == 32.
+		 */
+		return (gint32)g_utf8_get_char_validated(utf8, -1);
+	}
+
+	/*
+	 * The Asian multi-byte encodings are not supported,
+	 * so everything else is a single-byte codepage.
+	 * NOTE: Internally, the character is cast to signed char
+	 * and may therefore become negative, hence the cast to guchar.
+	 */
+	return (guchar)teco_view_ssm(ctx, SCI_GETCHARAT, pos, 0);
+}