aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/core-commands.c244
-rw-r--r--src/doc.c22
2 files changed, 231 insertions, 35 deletions
diff --git a/src/core-commands.c b/src/core-commands.c
index 98097bb..edd7e35 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -1771,6 +1771,9 @@ teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error)
if (!teco_expressions_eval(FALSE, error))
return;
+
+ gboolean colon_modified = teco_machine_main_eval_colon(ctx);
+
if (!teco_expressions_args()) {
/*
* This is shorter than .^E or Z^E and avoids unnecessary glyph to
@@ -1778,13 +1781,12 @@ teco_state_control_glyphs2bytes(teco_machine_main_t *ctx, GError **error)
* On the other hand :^E is inconsistent, as it will return a byte
* index, instead of glyph index.
*/
- res = teco_interface_ssm(teco_machine_main_eval_colon(ctx)
- ? SCI_GETLENGTH : SCI_GETCURRENTPOS, 0, 0);
+ res = teco_interface_ssm(colon_modified ? SCI_GETLENGTH : SCI_GETCURRENTPOS, 0, 0);
} else {
teco_int_t pos;
if (!teco_expressions_pop_num_calc(&pos, 0, error))
return;
- if (teco_machine_main_eval_colon(ctx)) {
+ if (colon_modified) {
/* teco_bytes2glyphs() does not check addresses */
res = 0 <= pos && pos <= teco_interface_ssm(SCI_GETLENGTH, 0, 0)
? teco_bytes2glyphs(pos) : -1;
@@ -2387,14 +2389,102 @@ teco_state_ecommand_eol(teco_machine_main_t *ctx, GError **error)
}
}
+static const gchar *
+teco_codepage2str(guint codepage)
+{
+ /*
+ * The multi-byte charsets are excluded, since we don't
+ * support them in SciTECO, even though Scintilla has them.
+ * Contrary to the Scintilla documentation, Gtk supports
+ * most of them.
+ * Those that are supported are tested, so the codepage
+ * mapping should be definitive (although there could be
+ * similar related codepages).
+ */
+ switch (codepage) {
+ case SC_CP_UTF8: return "UTF-8";
+ case SC_CHARSET_ANSI:
+ case SC_CHARSET_DEFAULT: return "ISO-8859-1"; /* LATIN1 */
+ case SC_CHARSET_BALTIC: return "ISO-8859-13"; /* LATIN7 */
+ //case SC_CHARSET_CHINESEBIG5: return "BIG5";
+ case SC_CHARSET_EASTEUROPE: return "ISO-8859-2"; /* LATIN2 */
+ //case SC_CHARSET_GB2312: return "GB2312";
+ case SC_CHARSET_GREEK: return "ISO-8859-7"; // CP1253???
+ //case SC_CHARSET_HANGUL: return "UHC";
+ /* unsure whether this is supported on Gtk */
+ case SC_CHARSET_MAC: return "MAC";
+ /* not supported by Gtk */
+ case SC_CHARSET_OEM: return "CP437";
+ /*
+ * Apparently, this can be CP1251 on the native Windows
+ * port of Scintilla.
+ */
+ case SC_CHARSET_RUSSIAN: return "KOI8-R";
+ case SC_CHARSET_OEM866: return "CP866";
+ case SC_CHARSET_CYRILLIC: return "CP1251";
+ //case SC_CHARSET_SHIFTJIS: return "SHIFT-JIS";
+ //case SC_CHARSET_SYMBOL:
+ case SC_CHARSET_TURKISH: return "ISO-8859-9"; /* LATIN5 */
+ //case SC_CHARSET_JOHAB: return "JOHAB";
+ case SC_CHARSET_HEBREW: return "ISO-8859-8"; // CP1255?
+ /*
+ * FIXME: Some arabic codepage is supported by Gtk,
+ * but I am not sure which.
+ */
+ case SC_CHARSET_ARABIC: return "ISO-8859-6"; // CP720, CP1256???
+ /* apparently not supported by Gtk */
+ case SC_CHARSET_VIETNAMESE: return "CP1258";
+ case SC_CHARSET_THAI: return "ISO-8859-11";
+ case SC_CHARSET_8859_15: return "ISO-8859-15"; /* LATIN9 */
+ }
+
+ return NULL;
+}
+
/*$ EE encoding codepage charset
* codepageEE -- Edit current document's encoding (codepage/charset)
* EE -> codepage
+ * codepage:EE
+ * :EE -> codepage
*
* When called with an argument, it sets the current codepage,
* otherwise returns it.
+ * The following codepages are supported:
+ * - 0: ANSI (raw bytes)
+ * - 1: ISO-8859-1 (latin1)
+ * - 77: Macintosh Latin encoding
+ * - 161: ISO-8859-7
+ * - 162: ISO-8859-9 (latin5)
+ * - 163: CP1258
+ * - 177: ISO-8859-8
+ * - 178: ISO-8859-6
+ * - 186: ISO-8859-13 (latin7)
+ * - 204: KOI8-R
+ * - 222: ISO-8859-11
+ * - 238: ISO-8859-2 (latin2)
+ * - 255: CP437
+ * - 866: CP866
+ * - 1000: ISO-8859-15 (latin9)
+ * - 1251: CP1251
+ * - 65001: UTF-8
+ *
+ * Displaying characters in the single-byte (non-UTF-8) codepages might
+ * be supported only with the Gtk UI.
+ * At least 77, 178, 163 and 255 are not displayed correctly on Gtk.
* 65001 (UTF-8) is the default for new buffers.
- * 0 (ANSI) should be used when working with raw bytes.
+ * 0 (ANSI) should be used when working with raw bytes,
+ * but is currently displayed like ISO-8859-1 (latin1).
+ *
+ * \fBEE\fP does not change the buffer contents itself by default, only
+ * how it is displayed and how \*(ST interacts with it.
+ * This allows fixing up the codepage if it is not in the default UTF-8
+ * or if codepage guessing failed.
+ *
+ * When colon-modified the \fB:EE\fP command will also additionally convert
+ * the current buffer contents into the new code page, preserving the
+ * current position (dot).
+ * This will fail if the conversion would be lossy.
+ * Conversions from and to UTF-8 \fIshould\fP always be successful.
*/
static void
teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error)
@@ -2402,53 +2492,153 @@ teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error)
if (!teco_expressions_eval(FALSE, error))
return;
- sptr_t old_cp = teco_interface_ssm(SCI_GETCODEPAGE, 0, 0);
+ gboolean colon_modified = teco_machine_main_eval_colon(ctx);
+
+ sptr_t old_cp = teco_interface_ssm(SCI_GETCODEPAGE, 0, 0)
+ ? : teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0);
if (!teco_expressions_args()) {
- teco_expressions_push(old_cp ? : teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0));
+ /* get current code page */
+ teco_expressions_push(old_cp);
return;
}
/*
* Set code page
*/
- if (teco_current_doc_must_undo()) {
- if (old_cp == SC_CP_UTF8) {
+ teco_int_t new_cp;
+ if (!teco_expressions_pop_num_calc(&new_cp, 0, error))
+ return;
+
+ if (old_cp == SC_CP_UTF8 && new_cp == SC_CP_UTF8)
+ return;
+
+ if (teco_current_doc_must_undo() && teco_undo_enabled) {
+ if (old_cp == SC_CP_UTF8) { /* new_cp != SC_CP_UTF8 */
undo__teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
SC_LINECHARACTERINDEX_UTF32, 0);
undo__teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
} else {
- undo__teco_interface_ssm(SCI_SETCODEPAGE, old_cp, 0);
- undo__teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT,
- teco_interface_ssm(SCI_STYLEGETCHARACTERSET, STYLE_DEFAULT, 0));
- undo__teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
- SC_LINECHARACTERINDEX_UTF32, 0);
+ undo__teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
+ for (gint style = 0; style <= STYLE_LASTPREDEFINED; style++)
+ undo__teco_interface_ssm(SCI_STYLESETCHARACTERSET, style, old_cp);
+ /*
+ * The index is internally reference-counted and could underflow,
+ * so don't do it more than necessary.
+ */
+ if (new_cp == SC_CP_UTF8)
+ undo__teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+ SC_LINECHARACTERINDEX_UTF32, 0);
}
}
- teco_int_t v;
- if (!teco_expressions_pop_num_calc(&v, 0, error))
- return;
- if (v == SC_CP_UTF8) {
+ teco_int_t dot_glyphs;
+ if (colon_modified) {
+ sptr_t dot_bytes = teco_interface_ssm(SCI_GETCURRENTPOS, 0, 0);
+ dot_glyphs = teco_bytes2glyphs(dot_bytes);
+
+ /*
+ * Convert buffer to new codepage.
+ *
+ * FIXME: Could be optimized slightly by converting first
+ * before the gap, inserting the converted text and then
+ * converting after the gap.
+ */
+ const gchar *to_codepage = teco_codepage2str(new_cp);
+ const gchar *from_codepage = teco_codepage2str(old_cp);
+ if (!to_codepage || !from_codepage) {
+ g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED,
+ "Unknown or unsupported codepage/charset");
+ return;
+ }
+
+ const gchar *buf = (const gchar *)teco_interface_ssm(SCI_GETCHARACTERPOINTER, 0, 0);
+ gsize len = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
+ g_autofree gchar *converted;
+ gsize converted_len;
+
+ /*
+ * This fails if there is no direct translation.
+ * If we'd use g_convert_with_fallback(), it would be tricky to choose
+ * fallback characters that will always work.
+ */
+ converted = g_convert(buf, len, to_codepage, from_codepage,
+ NULL, &converted_len, error);
+ if (!converted)
+ return;
+
+ teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0);
+ teco_interface_ssm(SCI_CLEARALL, 0, 0);
+ teco_interface_ssm(SCI_APPENDTEXT, converted_len, (sptr_t)converted);
+ teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0);
+ teco_ring_dirtify();
+
+ if (teco_current_doc_must_undo()) {
+ undo__teco_interface_ssm(SCI_GOTOPOS, dot_bytes, 0);
+ undo__teco_interface_ssm(SCI_UNDO, 0, 0);
+ }
+ }
+
+ if (new_cp == SC_CP_UTF8) {
teco_interface_ssm(SCI_SETCODEPAGE, SC_CP_UTF8, 0);
/*
* UTF-8 documents strictly require the line character index.
* See teco_glyphs2bytes() and teco_bytes2glyphs().
*/
+ g_assert(!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0)
+ & SC_LINECHARACTERINDEX_UTF32));
teco_interface_ssm(SCI_ALLOCATELINECHARACTERINDEX,
SC_LINECHARACTERINDEX_UTF32, 0);
- return;
+ } else {
+ /*
+ * The index is NOT released automatically when setting the codepage.
+ * But it is internally reference-counted and could underflow,
+ * so don't do it more than necessary.
+ */
+ if (old_cp == SC_CP_UTF8) {
+ teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
+ SC_LINECHARACTERINDEX_UTF32, 0);
+ g_assert(!(teco_interface_ssm(SCI_GETLINECHARACTERINDEX, 0, 0)
+ & SC_LINECHARACTERINDEX_UTF32));
+ }
+
+ /*
+ * Configure a single-byte codepage/charset.
+ * This requires setting it on all of the possible styles.
+ * Unfortunately there can theoretically even be 255 (STYLE_MAX) styles.
+ * This is important only for display purposes - other than that
+ * all single-byte encodings are handled the same.
+ *
+ * FIXME: Should we avoid this if new_cp == 0?
+ * It will be used for raw byte handling mostly.
+ * Perhaps we should even set char representations appropriately
+ * for all non-ANSI codepoints in the 0 codepage.
+ * But this would also be costly...
+ */
+ if (teco_current_doc_must_undo()) {
+ /*
+ * There is a chance the user will see this buffer even if we
+ * are currently in batch mode.
+ */
+ for (gint style = 0; style <= STYLE_LASTPREDEFINED; style++)
+ teco_interface_ssm(SCI_STYLESETCHARACTERSET, style, new_cp);
+ } else {
+ /* we must still set it, so that <EE> retrieval works */
+ teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, new_cp);
+ }
+ /* 0 is used for ALL single-byte encodings */
+ teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
}
- teco_interface_ssm(SCI_RELEASELINECHARACTERINDEX,
- SC_LINECHARACTERINDEX_UTF32, 0);
- teco_interface_ssm(SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, v);
- /* 0 is used for ALL single-byte encodings */
- teco_interface_ssm(SCI_SETCODEPAGE, 0, 0);
- /*
- * FIXME: Should we attempt any code page conversion via
- * g_iconv()?
- */
+ if (colon_modified)
+ /*
+ * Only now, it will be safe to recalculate dot in the new encoding.
+ * If the new codepage is UTF-8, the line character index will be
+ * ready only now.
+ * FIXME: Apparently the line character index is still not ready
+ * after switching to UTF-8!
+ */
+ teco_interface_ssm(SCI_GOTOPOS, teco_glyphs2bytes(dot_glyphs), 0);
}
/*$ EX exit
diff --git a/src/doc.c b/src/doc.c
index 4e41e8a..12413af 100644
--- a/src/doc.c
+++ b/src/doc.c
@@ -48,22 +48,28 @@ teco_doc_edit(teco_doc_t *ctx)
teco_view_ssm(teco_qreg_view, SCI_SETSEL, ctx->anchor, (sptr_t)ctx->dot);
/*
- * NOTE: Thanks to a custom Scintilla patch, se representations
+ * NOTE: Thanks to a custom Scintilla patch, representations
* do not get reset after SCI_SETDOCPOINTER, so they have to be
* initialized only once.
*/
//teco_view_set_representations(teco_qreg_view);
/*
- * Documents are UTF-8 by default and all UTF-8 documents
- * are expected to have a character index.
+ * All UTF-8 documents are expected to have a character index.
+ * This allocates nothing if the document is not UTF-8.
+ * But it is reference counted, so it must not be allocated
+ * more than once.
*
- * FIXME: This apparently gets reset with every SCI_SETDOCPOINTER.
- * Does that mean the index needs to be recalculated repeatedly as well?
- * What if the document/register is made non-UTF-8 afterwards?
+ * FIXME: This apparently gets reset with every SCI_SETDOCPOINTER
+ * (although I don't know why and where).
+ * Recalculating it could be inefficient.
+ * The index is reference-counted. Perhaps we could just allocate
+ * one more time, so it doesn't get freed when changing documents.
*/
- teco_view_ssm(teco_qreg_view, SCI_ALLOCATELINECHARACTERINDEX,
- SC_LINECHARACTERINDEX_UTF32, 0);
+ if (!(teco_view_ssm(teco_qreg_view,
+ SCI_GETLINECHARACTERINDEX, 0, 0) & SC_LINECHARACTERINDEX_UTF32))
+ teco_view_ssm(teco_qreg_view, SCI_ALLOCATELINECHARACTERINDEX,
+ SC_LINECHARACTERINDEX_UTF32, 0);
}
/** @memberof teco_doc_t */