added raw ANSI mode to facilitate 8-bit clean editing (refs #5)

* When enabled with bit 2 in the ED flags (0,4ED), all new registers and buffers will get the raw ANSI encoding (as if 0EE had been called on them). You can still change the encoding manually, e.g. by calling 65001EE afterwards.
* The ANSI mode also sets up character representations for all bytes >= 0x80. This currently depends only on the ED flag; it is not done when setting 0EE.
* Setting 16,4ED for 8-bit clean editing in a macro can be tricky, since the default unnamed buffer and at least a number of environment registers will still be UTF-8. We therefore added the command line option `--8bit` (short `-8`), which configures the ED flags very early on. As another advantage, you can mung the profile in 8-bit mode as well when using SciTECO as a sort of interactive hex editor.
* Disable UTF-8 checks in 8-bit clean mode (sample.teco_ini).
author: Robin Haberkorn <robin.haberkorn@googlemail.com> 2024-09-09 16:54:26 +0200
committer: Robin Haberkorn <robin.haberkorn@googlemail.com> 2024-09-09 18:22:21 +0200
commit: 4f231871a0208ec9bcc2679fce25d3b9795d1597 (patch)
tree: bdc9055166fe236f009c6640acf53b6706310c27 /src
parent: 41ab5cf0289dab60ac1ddc97cf9680ee2468ea6c (diff)
download: sciteco-4f231871a0208ec9bcc2679fce25d3b9795d1597.tar.gz
13 files changed, 143 insertions, 104 deletions
diff --git a/src/cmdline.c b/src/cmdline.c
index d6fcd37..47ef86f 100644
--- a/src/cmdline.c
+++ b/src/cmdline.c
@@ -1052,7 +1052,8 @@ teco_state_save_cmdline_got_register(teco_machine_main_t *ctx, teco_qreg_t *qreg
 		return &teco_state_start;
 
 	if (!qreg->vtable->undo_set_string(qreg, error) ||
-	    !qreg->vtable->set_string(qreg, teco_last_cmdline.data, teco_last_cmdline.len, SC_CP_UTF8, error))
+	    !qreg->vtable->set_string(qreg, teco_last_cmdline.data, teco_last_cmdline.len,
+	                              teco_default_codepage(), error))
 		return NULL;
 
 	return &teco_state_start;
diff --git a/src/core-commands.c b/src/core-commands.c
index 638279d..176bb17 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -2038,6 +2038,11 @@ teco_state_ecommand_close(teco_machine_main_t *ctx, GError **error)
  * Without any argument ED returns the current flags.
  *
  * Currently, the following flags are used by \*(ST:
+ *   - 4: If enabled, prefer raw single-byte ANSI encoding
+ *     for all new buffers and registers.
+ *     This does not change the encoding of any existing
+ *     buffers and any initialized default register when set via
+ *     \fBED\fP, so you might want to launch \*(ST with \fB--8bit\fP.
  *   - 8: Enable/disable automatic folding of case-insensitive
  *     command characters during interactive key translation.
  *     The case of letter keys is inverted, so one or two
@@ -2610,9 +2615,6 @@ teco_state_ecommand_encoding(teco_machine_main_t *ctx, GError **error)
 		 *
 		 * FIXME: Should we avoid this if new_cp == 0?
 		 * It will be used for raw byte handling mostly.
-		 * Perhaps we should even set char representations appropriately
-		 * for all non-ANSI codepoints in the 0 codepage.
-		 * But this would also be costly...
 		 */
 		if (teco_current_doc_must_undo()) {
 			/*
diff --git a/src/doc.c b/src/doc.c
index 516dadb..a1ebe2c 100644
--- a/src/doc.c
+++ b/src/doc.c
@@ -60,10 +60,19 @@ teco_doc_get_scintilla(teco_doc_t *ctx)
 	return ctx->doc;
 }
 
-/** @memberof teco_doc_t */
+/**
+ * Edit the given document in the Q-Register view.
+ *
+ * @param ctx The document to edit.
+ * @param default_cp The codepage to configure if the document is new.
+ *
+ * @memberof teco_doc_t
+ */
 void
-teco_doc_edit(teco_doc_t *ctx)
+teco_doc_edit(teco_doc_t *ctx, guint default_cp)
 {
+	gboolean new_doc = ctx->doc == NULL;
+
 	teco_view_ssm(teco_qreg_view, SCI_SETDOCPOINTER, 0,
 	              (sptr_t)teco_doc_get_scintilla(ctx));
 	teco_view_ssm(teco_qreg_view, SCI_SETFIRSTVISIBLELINE, ctx->first_line, 0);
@@ -77,22 +86,33 @@ teco_doc_edit(teco_doc_t *ctx)
 	 */
 	//teco_view_set_representations(teco_qreg_view);
 
-	/*
-	 * All UTF-8 documents are expected to have a character index.
-	 * This allocates nothing if the document is not UTF-8.
-	 * But it is reference counted, so it must not be allocated
-	 * more than once.
-	 *
-	 * FIXME: This apparently gets reset with every SCI_SETDOCPOINTER
-	 * (although I don't know why and where).
-	 * Recalculating it could be inefficient.
-	 * The index is reference-counted. Perhaps we could just allocate
-	 * one more time, so it doesn't get freed when changing documents.
-	 */
-	if (!(teco_view_ssm(teco_qreg_view, SCI_GETLINECHARACTERINDEX, 0, 0)
-						& SC_LINECHARACTERINDEX_UTF32))
+	if (new_doc && default_cp != SC_CP_UTF8) {
+		/*
+		 * There is a chance the user will see this buffer even if we
+		 * are currently in batch mode.
+		 */
+		for (gint style = 0; style <= STYLE_LASTPREDEFINED; style++)
+			teco_view_ssm(teco_qreg_view, SCI_STYLESETCHARACTERSET,
+			              style, default_cp);
+		/* 0 is used for ALL single-byte encodings */
+		teco_view_ssm(teco_qreg_view, SCI_SETCODEPAGE, 0, 0);
+	} else if (!(teco_view_ssm(teco_qreg_view, SCI_GETLINECHARACTERINDEX, 0, 0)
+							& SC_LINECHARACTERINDEX_UTF32)) {
+		/*
+		 * All UTF-8 documents are expected to have a character index.
+		 * This allocates nothing if the document is not UTF-8.
+		 * But it is reference counted, so it must not be allocated
+		 * more than once.
+		 *
+		 * FIXME: This apparently gets reset with every SCI_SETDOCPOINTER
+		 * (although I don't know why and where).
+		 * Recalculating it could be inefficient.
+		 * The index is reference-counted. Perhaps we could just allocate
+		 * one more time, so it doesn't get freed when changing documents.
+		 */
 		teco_view_ssm(teco_qreg_view, SCI_ALLOCATELINECHARACTERINDEX,
 		              SC_LINECHARACTERINDEX_UTF32, 0);
+	}
 }
 
 /** @memberof teco_doc_t */
@@ -122,41 +142,12 @@ teco_doc_set_string(teco_doc_t *ctx, const gchar *str, gsize len, guint codepage
 	ctx->doc = NULL;
 
 	teco_doc_reset(ctx);
-	teco_doc_edit(ctx);
+	teco_doc_edit(ctx, codepage);
 
 	teco_view_ssm(teco_qreg_view, SCI_APPENDTEXT, len, (sptr_t)(str ? : ""));
 
-	if (codepage != SC_CP_UTF8) {
-		/*
-		 * We have a new UTF-8 document and
-		 * teco_doc_edit() currently always initializes an index.
-		 */
-		teco_view_ssm(teco_qreg_view, SCI_RELEASELINECHARACTERINDEX,
-		              SC_LINECHARACTERINDEX_UTF32, 0);
-		g_assert(!(teco_view_ssm(teco_qreg_view, SCI_GETLINECHARACTERINDEX, 0, 0)
-							& SC_LINECHARACTERINDEX_UTF32));
-
-		/*
-		 * Configure a single-byte codepage/charset.
-		 * This requires setting it on all of the possible styles.
-		 * Unfortunately there can theoretically even be 255 (STYLE_MAX) styles.
-		 * This is important only for display purposes - other than that
-		 * all single-byte encodings are handled the same.
-		 *
-		 * FIXME: Should we avoid this if codepage == 0?
-		 * It will be used for raw byte handling mostly.
-		 * Perhaps we should even set char representations appropriately
-		 * for all non-ANSI codepoints in the 0 codepage.
-		 * But this would also be costly...
-		 */
-		for (gint style = 0; style <= STYLE_LASTPREDEFINED; style++)
-			teco_view_ssm(teco_qreg_view, SCI_STYLESETCHARACTERSET, style, codepage);
-		/* 0 is used for ALL single-byte encodings */
-		teco_view_ssm(teco_qreg_view, SCI_SETCODEPAGE, 0, 0);
-	}
-
 	if (teco_qreg_current)
-		teco_doc_edit(&teco_qreg_current->string);
+		teco_doc_edit(&teco_qreg_current->string, 0);
 }
 
 /** @memberof teco_doc_t */
@@ -201,14 +192,14 @@ teco_doc_get_string(teco_doc_t *ctx, gchar **str, gsize *outlen, guint *codepage
 		if (outlen)
 			*outlen = 0;
 		if (codepage)
-			*codepage = SC_CP_UTF8;
+			*codepage = teco_default_codepage();
 		return;
 	}
 
 	if (teco_qreg_current)
 		teco_doc_update(&teco_qreg_current->string, teco_qreg_view);
 
-	teco_doc_edit(ctx);
+	teco_doc_edit(ctx, teco_default_codepage());
 
 	gsize len = teco_view_ssm(teco_qreg_view, SCI_GETLENGTH, 0, 0);
 	if (str) {
@@ -221,7 +212,7 @@ teco_doc_get_string(teco_doc_t *ctx, gchar **str, gsize *outlen, guint *codepage
 		*codepage = teco_view_get_codepage(teco_qreg_view);
 
 	if (teco_qreg_current)
-		teco_doc_edit(&teco_qreg_current->string);
+		teco_doc_edit(&teco_qreg_current->string, 0);
 }
 
 /** @memberof teco_doc_t */
diff --git a/src/doc.h b/src/doc.h
index b7a4f99..1218c35 100644
--- a/src/doc.h
+++ b/src/doc.h
@@ -62,7 +62,7 @@ teco_doc_init(teco_doc_t *ctx)
 	memset(ctx, 0, sizeof(*ctx));
 }
 
-void teco_doc_edit(teco_doc_t *ctx);
+void teco_doc_edit(teco_doc_t *ctx, guint default_cp);
 void teco_doc_undo_edit(teco_doc_t *ctx);
 
 void teco_doc_set_string(teco_doc_t *ctx, const gchar *str, gsize len, guint codepage);
diff --git a/src/glob.c b/src/glob.c
index 2c955ee..9aa499d 100644
--- a/src/glob.c
+++ b/src/glob.c
@@ -319,7 +319,7 @@ teco_state_glob_pattern_done(teco_machine_main_t *ctx, const teco_string_t *str,
 		g_assert(glob_reg != NULL);
 		if (!glob_reg->vtable->undo_set_string(glob_reg, error) ||
 		    !glob_reg->vtable->set_string(glob_reg, filename, strlen(filename),
-		                                  SC_CP_UTF8, error))
+		                                  teco_default_codepage(), error))
 			return NULL;
 	}
 
diff --git a/src/main.c b/src/main.c
index c38b1a3..abf8d2f 100644
--- a/src/main.c
+++ b/src/main.c
@@ -105,6 +105,7 @@ teco_get_default_config_path(const gchar *program)
 static gchar *teco_eval_macro = NULL;
 static gboolean teco_mung_file = FALSE;
 static gboolean teco_mung_profile = TRUE;
+static gboolean teco_8bit_clean = FALSE;
 
 static gchar *
 teco_process_options(gint *argc, gchar ***argv)
@@ -120,6 +121,8 @@ teco_process_options(gint *argc, gchar ***argv)
 		 "Do not mung "
 		 "$SCITECOCONFIG" G_DIR_SEPARATOR_S INI_FILE " "
 		 "even if it exists"},
+		{"8bit", '8', 0, G_OPTION_ARG_NONE, &teco_8bit_clean,
+		 "Use ANSI encoding by default and disable automatic EOL conversion"},
 		{NULL}
 	};
 
@@ -320,6 +323,10 @@ main(int argc, char **argv)
 	 * to the macro or munged file.
 	 */
 
+	if (teco_8bit_clean)
+		/* equivalent to 16,4ED but executed earlier */
+		teco_ed = (teco_ed & ~TECO_ED_AUTOEOL) | TECO_ED_DEFAULT_ANSI;
+
 	/*
 	 * Theoretically, QReg tables should only be initialized
 	 * after the interface, since they contain Scintilla documents.
diff --git a/src/parser.c b/src/parser.c
index aef6223..ed21740 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -724,7 +724,7 @@ teco_machine_stringbuilding_init(teco_machine_stringbuilding_t *ctx, gchar escap
 	teco_machine_init(&ctx->parent, &teco_state_stringbuilding_start, must_undo);
 	ctx->escape_char = escape_char;
 	ctx->qreg_table_locals = locals;
-	ctx->codepage = SC_CP_UTF8;
+	ctx->codepage = teco_default_codepage();
 }
 
 void
@@ -767,7 +767,7 @@ gboolean
 teco_state_expectstring_initial(teco_machine_main_t *ctx, GError **error)
 {
 	if (ctx->mode == TECO_MODE_NORMAL)
-		teco_undo_guint(ctx->expectstring.machine.codepage) = SC_CP_UTF8;
+		teco_undo_guint(ctx->expectstring.machine.codepage) = teco_default_codepage();
 	return TRUE;
 }
 
diff --git a/src/qreg-commands.c b/src/qreg-commands.c
index e8be384..0e07944 100644
--- a/src/qreg-commands.c
+++ b/src/qreg-commands.c
@@ -372,7 +372,7 @@ teco_state_setqregstring_nobuilding_done(teco_machine_main_t *ctx,
 	gint args = teco_expressions_args();
 
 	if (args > 0) {
-		guint codepage = SC_CP_UTF8;
+		guint codepage = teco_default_codepage();
 		if (colon_modified && !qreg->vtable->get_string(qreg, NULL, NULL, &codepage, error))
 			return NULL;
 
@@ -415,7 +415,7 @@ teco_state_setqregstring_nobuilding_done(teco_machine_main_t *ctx,
 			/* set register */
 			if (!qreg->vtable->undo_set_string(qreg, error) ||
 			    !qreg->vtable->set_string(qreg, buffer, len,
-			                              SC_CP_UTF8, error))
+			                              codepage, error))
 				return NULL;
 		}
 	}
@@ -429,7 +429,7 @@ teco_state_setqregstring_nobuilding_done(teco_machine_main_t *ctx,
 		/* set register */
 		if (!qreg->vtable->undo_set_string(qreg, error) ||
 		    !qreg->vtable->set_string(qreg, str->data, str->len,
-		                              SC_CP_UTF8, error))
+		                              teco_default_codepage(), error))
 			return NULL;
 	}
 
diff --git a/src/qreg.c b/src/qreg.c
index 08bc8fc..fb559af 100644
--- a/src/qreg.c
+++ b/src/qreg.c
@@ -127,11 +127,11 @@ teco_qreg_set_eol_mode(teco_qreg_t *qreg, gint mode)
 	if (teco_qreg_current)
 		teco_doc_update(&teco_qreg_current->string, teco_qreg_view);
 
-	teco_doc_edit(&qreg->string);
+	teco_doc_edit(&qreg->string, teco_default_codepage());
 	teco_view_ssm(teco_qreg_view, SCI_SETEOLMODE, mode, 0);
 
 	if (teco_qreg_current)
-		teco_doc_edit(&teco_qreg_current->string);
+		teco_doc_edit(&teco_qreg_current->string, 0);
 }
 
 /** @memberof teco_qreg_t */
@@ -144,7 +144,7 @@ teco_qreg_load(teco_qreg_t *qreg, const gchar *filename, GError **error)
 	if (teco_qreg_current)
 		teco_doc_update(&teco_qreg_current->string, teco_qreg_view);
 
-	teco_doc_edit(&qreg->string);
+	teco_doc_edit(&qreg->string, teco_default_codepage());
 	teco_doc_reset(&qreg->string);
 
 	/*
@@ -162,7 +162,7 @@ teco_qreg_load(teco_qreg_t *qreg, const gchar *filename, GError **error)
 		return FALSE;
 
 	if (teco_qreg_current)
-		teco_doc_edit(&teco_qreg_current->string);
+		teco_doc_edit(&teco_qreg_current->string, 0);
 
 	return TRUE;
 }
@@ -174,18 +174,14 @@ teco_qreg_save(teco_qreg_t *qreg, const gchar *filename, GError **error)
 	if (teco_qreg_current)
 		teco_doc_update(&teco_qreg_current->string, teco_qreg_view);
 
-	teco_doc_edit(&qreg->string);
+	teco_doc_edit(&qreg->string, teco_default_codepage());
 
-	if (!teco_view_save(teco_qreg_view, filename, error)) {
-		if (teco_qreg_current)
-			teco_doc_edit(&teco_qreg_current->string);
-		return FALSE;
-	}
+	gboolean ret = teco_view_save(teco_qreg_view, filename, error);
 
 	if (teco_qreg_current)
-		teco_doc_edit(&teco_qreg_current->string);
+		teco_doc_edit(&teco_qreg_current->string, 0);
 
-	return TRUE;
+	return ret;
 }
 
 static gboolean
@@ -239,14 +235,14 @@ teco_qreg_plain_append_string(teco_qreg_t *qreg, const gchar *str, gsize len, GE
 	if (teco_qreg_current)
 		teco_doc_update(&teco_qreg_current->string, teco_qreg_view);
 
-	teco_doc_edit(&qreg->string);
+	teco_doc_edit(&qreg->string, teco_default_codepage());
 
 	teco_view_ssm(teco_qreg_view, SCI_BEGINUNDOACTION, 0, 0);
 	teco_view_ssm(teco_qreg_view, SCI_APPENDTEXT, len, (sptr_t)str);
 	teco_view_ssm(teco_qreg_view, SCI_ENDUNDOACTION, 0, 0);
 
 	if (teco_qreg_current)
-		teco_doc_edit(&teco_qreg_current->string);
+		teco_doc_edit(&teco_qreg_current->string, 0);
 	return TRUE;
 }
 
@@ -262,27 +258,24 @@ static gboolean
 teco_qreg_plain_get_character(teco_qreg_t *qreg, teco_int_t position,
                               teco_int_t *chr, GError **error)
 {
-	gboolean ret = TRUE;
-
 	if (teco_qreg_current)
 		teco_doc_update(&teco_qreg_current->string, teco_qreg_view);
 
-	teco_doc_edit(&qreg->string);
+	teco_doc_edit(&qreg->string, teco_default_codepage());
 
 	sptr_t len = teco_view_ssm(teco_qreg_view, SCI_GETLENGTH, 0, 0);
 	gssize off = teco_view_glyphs2bytes(teco_qreg_view, position);
 
-	if (off < 0 || off == len) {
+	gboolean ret = off >= 0 && off != len;
+	if (!ret)
 		g_set_error(error, TECO_ERROR, TECO_ERROR_RANGE,
 		            "Position %" TECO_INT_FORMAT " out of range", position);
-		ret = FALSE;
 		/* make sure we still restore the current Q-Register */
-	} else {
+	else
 		*chr = teco_view_get_character(teco_qreg_view, off, len);
-	}
 
 	if (teco_qreg_current)
-		teco_doc_edit(&teco_qreg_current->string);
+		teco_doc_edit(&teco_qreg_current->string, 0);
 
 	return ret;
 }
@@ -293,13 +286,13 @@ teco_qreg_plain_get_length(teco_qreg_t *qreg, GError **error)
 	if (teco_qreg_current)
 		teco_doc_update(&teco_qreg_current->string, teco_qreg_view);
 
-	teco_doc_edit(&qreg->string);
+	teco_doc_edit(&qreg->string, teco_default_codepage());
 
 	sptr_t len = teco_view_ssm(teco_qreg_view, SCI_GETLENGTH, 0, 0);
 	teco_int_t ret = teco_view_bytes2glyphs(teco_qreg_view, len);
 
 	if (teco_qreg_current)
-		teco_doc_edit(&teco_qreg_current->string);
+		teco_doc_edit(&teco_qreg_current->string, 0);
 
 	return ret;
 }
@@ -326,7 +319,7 @@ teco_qreg_plain_edit(teco_qreg_t *qreg, GError **error)
 	if (teco_qreg_current)
 		teco_doc_update(&teco_qreg_current->string, teco_qreg_view);
 
-	teco_doc_edit(&qreg->string);
+	teco_doc_edit(&qreg->string, teco_default_codepage());
 	teco_interface_show_view(teco_qreg_view);
 	teco_interface_info_update(qreg);
 
@@ -549,7 +542,7 @@ teco_qreg_bufferinfo_get_string(teco_qreg_t *qreg, gchar **str, gsize *len,
 	 */
 	*len = teco_ring_current->filename ? strlen(teco_ring_current->filename) : 0;
 	if (codepage)
-		 *codepage = SC_CP_UTF8;
+		 *codepage = teco_default_codepage();
 	return TRUE;
 }
 
@@ -647,7 +640,7 @@ teco_qreg_workingdir_get_string(teco_qreg_t *qreg, gchar **str, gsize *len,
 	else
 		g_free(dir);
 	if (codepage)
-		*codepage = SC_CP_UTF8;
+		*codepage = teco_default_codepage();
 
 	return TRUE;
 }
@@ -798,7 +791,7 @@ teco_qreg_clipboard_get_string(teco_qreg_t *qreg, gchar **str, gsize *len,
 		teco_string_clear(&str_converted);
 	*len = str_converted.len;
 	if (codepage)
-		*codepage = SC_CP_UTF8;
+		*codepage = teco_default_codepage();
 
 	return TRUE;
 }
@@ -910,7 +903,7 @@ teco_qreg_table_set_environ(teco_qreg_table_t *table, GError **error)
 		}
 
 		if (!qreg->vtable->set_string(qreg, value, strlen(value),
-		                              SC_CP_UTF8, error))
+		                              teco_default_codepage(), error))
 			return FALSE;
 	}
 
diff --git a/src/sciteco.h b/src/sciteco.h
index 7f420e8..09dea3b 100644
--- a/src/sciteco.h
+++ b/src/sciteco.h
@@ -21,6 +21,8 @@
 
 #include <glib.h>
 
+#include <Scintilla.h>
+
 #if TECO_INTEGER == 32
 typedef gint32 teco_int_t;
 #define TECO_INT_FORMAT G_GINT32_FORMAT
@@ -83,6 +85,7 @@ teco_is_failure(teco_bool_t x)
  * This is not a bitfield, since it is set from SciTECO.
  */
 enum {
+	TECO_ED_DEFAULT_ANSI	= (1 << 2),
 	TECO_ED_AUTOCASEFOLD	= (1 << 3),
 	TECO_ED_AUTOEOL		= (1 << 4),
 	TECO_ED_HOOKS		= (1 << 5),
@@ -94,6 +97,12 @@ enum {
 /* in main.c */
 extern teco_int_t teco_ed;
 
+static inline guint
+teco_default_codepage(void)
+{
+	return teco_ed & TECO_ED_DEFAULT_ANSI ? SC_CHARSET_ANSI : SC_CP_UTF8;
+}
+
 /* in main.c */
 extern volatile sig_atomic_t teco_interrupted;
 
diff --git a/src/search.c b/src/search.c
index cf26c7f..c1dd542 100644
--- a/src/search.c
+++ b/src/search.c
@@ -678,7 +678,7 @@ teco_state_search_done(teco_machine_main_t *ctx, const teco_string_t *str, GErro
 
 		if (!search_reg->vtable->undo_set_string(search_reg, error) ||
 		    !search_reg->vtable->set_string(search_reg, str->data, str->len,
-		                                    SC_CP_UTF8, error))
+		                                    teco_default_codepage(), error))
 			return NULL;
 
 		teco_interface_ssm(SCI_SETANCHOR, anchor, 0);
@@ -1078,7 +1078,7 @@ teco_state_replace_default_insert_done_overwrite(teco_machine_main_t *ctx, const
 	if (str->len > 0) {
 		if (!replace_reg->vtable->undo_set_string(replace_reg, error) ||
 		    !replace_reg->vtable->set_string(replace_reg, str->data, str->len,
-		                                     SC_CP_UTF8, error))
+		                                     teco_default_codepage(), error))
 			return NULL;
 	} else {
 		g_auto(teco_string_t) replace_str = {NULL, 0};
@@ -1111,7 +1111,7 @@ teco_state_replace_default_ignore_done(teco_machine_main_t *ctx, const teco_stri
 
 	if (!replace_reg->vtable->undo_set_string(replace_reg, error) ||
 	    !replace_reg->vtable->set_string(replace_reg, str->data, str->len,
-	                                     SC_CP_UTF8, error))
+	                                     teco_default_codepage(), error))
 		return NULL;
 
 	return &teco_state_start;
diff --git a/src/spawn.c b/src/spawn.c
index c6dd779..6d3a441 100644
--- a/src/spawn.c
+++ b/src/spawn.c
@@ -165,9 +165,10 @@ teco_state_execute_initial(teco_machine_main_t *ctx, GError **error)
 		return TRUE;
 
 	/*
-	 * Command-lines and file names are always assumed to be UTF-8.
+	 * Command-lines and file names are always assumed to be UTF-8,
+	 * unless we set TECO_ED_DEFAULT_ANSI.
 	 */
-	teco_undo_guint(ctx->expectstring.machine.codepage) = SC_CP_UTF8;
+	teco_undo_guint(ctx->expectstring.machine.codepage) = teco_default_codepage();
 
 	if (!teco_expressions_eval(FALSE, error))
 		return FALSE;
@@ -702,7 +703,7 @@ teco_spawn_stdout_watch_cb(GIOChannel *chan, GIOCondition condition, gpointer da
 			} else {
 				if (!qreg->vtable->undo_set_string(qreg, &teco_spawn_ctx.error) ||
 				    !qreg->vtable->set_string(qreg, buffer.data, buffer.len,
-				                              SC_CP_UTF8, &teco_spawn_ctx.error))
+				                              teco_default_codepage(), &teco_spawn_ctx.error))
 					goto error;
 			}
 		} else {
diff --git a/src/view.c b/src/view.c
index 291c06b..0d1d168 100644
--- a/src/view.c
+++ b/src/view.c
@@ -72,6 +72,27 @@ teco_view_setup(teco_view_t *ctx)
 	 */
 	teco_view_ssm(ctx, SCI_SETMARGINWIDTHN, 1, 0);
 
+	if (teco_ed & TECO_ED_DEFAULT_ANSI) {
+		/*
+		 * Configure a single-byte codepage/charset.
+		 * This requires setting it on all of the possible styles.
+		 * Fortunately, we can do it before SCI_STYLECLEARALL.
+		 * This is important only for display purposes - other than that
+		 * all single-byte encodings are handled the same.
+		 */
+		teco_view_ssm(ctx, SCI_STYLESETCHARACTERSET, STYLE_DEFAULT, SC_CHARSET_ANSI);
+		/* 0 is used for ALL single-byte encodings */
+		teco_view_ssm(ctx, SCI_SETCODEPAGE, 0, 0);
+	} else {
+		/*
+		 * Documents are UTF-8 by default and all UTF-8 documents
+		 * are expected to have a character index.
+		 * This is a property of the document, instead of the view.
+		 */
+		teco_view_ssm(ctx, SCI_ALLOCATELINECHARACTERINDEX,
+		              SC_LINECHARACTERINDEX_UTF32, 0);
+	}
+
 	/*
 	 * Set some basic styles in order to provide
 	 * a consistent look across UIs if no profile
@@ -118,14 +139,6 @@ teco_view_setup(teco_view_t *ctx)
 	 * the representations only once.
 	 */
 	teco_view_set_representations(ctx);
-
-	/*
-	 * Documents are UTF-8 by default and all UTF-8 documents
-	 * are expected to have a character index.
-	 * This is a property of the document, instead of the view.
-	 */
-	teco_view_ssm(ctx, SCI_ALLOCATELINECHARACTERINDEX,
-	              SC_LINECHARACTERINDEX_UTF32, 0);
 }
 
 TECO_DEFINE_UNDO_CALL(teco_view_ssm, teco_view_t *, unsigned int, uptr_t, sptr_t);
@@ -145,6 +158,28 @@ teco_view_set_representations(teco_view_t *ctx)
 		gchar buf[] = {(gchar)cc, '\0'};
 		teco_view_ssm(ctx, SCI_SETREPRESENTATION, (uptr_t)buf, (sptr_t)reps[cc]);
 	}
+
+	if (teco_ed & TECO_ED_DEFAULT_ANSI) {
+		/*
+		 * Non-ANSI chars should be visible somehow.
+		 * This would best be done always when changing the
+		 * encoding to 0, but it would be kind of expensive.
+		 *
+		 * FIXME: On the other hand, this could cause problems
+		 * when setting SC_CP_UTF8 later on.
+		 */
+		for (guint cc = 0x80; cc <= 0xFF; cc++) {
+			gchar buf[] = {(gchar)cc, '\0'};
+			gchar rep[2+1];
+			/*
+			 * Hexadecimal is poorly supported in SciTECO, but
+			 * multiple decimal numbers one after another look
+			 * confusing, esp. in Curses.
+			 */
+			g_snprintf(rep, sizeof(rep), "%02X", cc);
+			teco_view_ssm(ctx, SCI_SETREPRESENTATION, (uptr_t)buf, (sptr_t)rep);
+		}
+	}
 }
 
 /**
author	Robin Haberkorn <robin.haberkorn@googlemail.com>	2024-09-09 16:54:26 +0200
committer	Robin Haberkorn <robin.haberkorn@googlemail.com>	2024-09-09 18:22:21 +0200
commit	4f231871a0208ec9bcc2679fce25d3b9795d1597 (patch)
tree	bdc9055166fe236f009c6640acf53b6706310c27 /src
parent	41ab5cf0289dab60ac1ddc97cf9680ee2468ea6c (diff)
download	sciteco-4f231871a0208ec9bcc2679fce25d3b9795d1597.tar.gz