6 files changed, 54 insertions, 48 deletions
diff --git a/src/core-commands.c b/src/core-commands.c
index 951e001..98097bb 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -1045,32 +1045,7 @@ teco_state_start_get(teco_machine_main_t *ctx, GError **error)
 		return;
 	}
 
-	teco_int_t ret;
-
-	if (teco_interface_ssm(SCI_GETCODEPAGE, 0, 0) == SC_CP_UTF8) {
-		gchar buf[4+1];
-		struct Sci_TextRangeFull range = {
-			.chrg = {get_pos, MIN(len, get_pos+sizeof(buf)-1)},
-			.lpstrText = buf
-		};
-		/*
-		 * Probably faster than SCI_GETRANGEPOINTER+SCI_GETGAPPOSITION
-		 * or repeatedly calling SCI_GETCHARAT.
-		 */
-		teco_interface_ssm(SCI_GETTEXTRANGEFULL, 0, (sptr_t)&range);
-		/*
-		 * Make sure that the -1/-2 error values are preserved.
-		 * The sign bit in UCS-4/UTF-32 is unused, so this will even
-		 * suffice if TECO_INTEGER == 32.
-		 */
-		ret = (gint32)g_utf8_get_char_validated(buf, -1);
-	} else {
-		// FIXME: Everything else is a single-byte encoding?
-		/* internally, the character is casted to signed char */
-		ret = (guchar)teco_interface_ssm(SCI_GETCHARAT, get_pos, 0);
-	}
-
-	teco_expressions_push(ret);
+	teco_expressions_push(teco_interface_get_character(get_pos, len));
 }
 
 static teco_state_t *
diff --git a/src/interface.h b/src/interface.h
index c975525..cbe10bd 100644
--- a/src/interface.h
+++ b/src/interface.h
@@ -172,6 +172,12 @@ teco_glyphs2bytes_relative(gsize pos, teco_int_t n)
 	return teco_view_glyphs2bytes_relative(teco_interface_current_view, pos, n);
 }
 
+static inline teco_int_t
+teco_interface_get_character(gsize pos, gsize len)
+{	/* same as teco_view_get_character(), applied to teco_interface_current_view */
+	return teco_view_get_character(teco_interface_current_view, pos, len);
+}
+
 /*
  * The following functions are here for lack of a better place.
  * They could also be in sciteco.h, but only if declared as non-inline
diff --git a/src/qreg-commands.c b/src/qreg-commands.c
index d7bfafe..09b2b90 100644
--- a/src/qreg-commands.c
+++ b/src/qreg-commands.c
@@ -310,8 +310,10 @@ teco_state_queryqreg_got_register(teco_machine_main_t *ctx, teco_qreg_t *qreg,
  * Positions are handled like buffer positions \(em they
  * begin at 0 up to the length of the string minus 1.
  * An error is thrown for invalid positions.
- * If <q> is Unicode-encoded, -1 or -2 could be returned for
- * invalid byte sequences.
+ * If <q> is encoded as UTF-8 and the text ends
+ * in the middle of a sequence at the requested position,
+ * -2 is returned.
+ * All other invalid Unicode sequences are returned as -1.
  * Both non-colon-modified forms of Q require register <q>
  * to be defined and fail otherwise.
  *
diff --git a/src/qreg.c b/src/qreg.c
index 4432cbf..c3ab1a5 100644
--- a/src/qreg.c
+++ b/src/qreg.c
@@ -284,27 +284,8 @@ teco_qreg_plain_get_character(teco_qreg_t *qreg, teco_int_t position,
 		            "Position %" TECO_INT_FORMAT " out of range", position);
 		ret = FALSE;
 		/* make sure we still restore the current Q-Register */
-	} else if (teco_view_ssm(teco_qreg_view, SCI_GETCODEPAGE, 0, 0) == SC_CP_UTF8) {
-		gchar buf[4+1];
-		struct Sci_TextRangeFull range = {
-			.chrg = {off, MIN(len, off+sizeof(buf)-1)},
-			.lpstrText = buf
-		};
-		/*
-		 * Probably faster than SCI_GETRANGEPOINTER+SCI_GETGAPPOSITION
-		 * or repeatedly calling SCI_GETCHARAT.
-		 */
-		teco_view_ssm(teco_qreg_view, SCI_GETTEXTRANGEFULL, 0, (sptr_t)&range);
-		/*
-		 * Make sure that the -1/-2 error values are preserved.
-		 * The sign bit in UCS-4/UTF-32 is unused, so this will even
-		 * suffice if TECO_INTEGER == 32.
-		 */
-		*chr = (gint32)g_utf8_get_char_validated(buf, -1);
 	} else {
-		// FIXME: Everything else is a single-byte encoding?
-		/* internally, the character is casted to signed char */
-		*chr = (guchar)teco_view_ssm(teco_qreg_view, SCI_GETCHARAT, off, 0);
+		*chr = teco_view_get_character(teco_qreg_view, off, len);
 	}
 
 	if (teco_qreg_current)
diff --git a/src/view.c b/src/view.c
index 4f959a3..291c06b 100644
--- a/src/view.c
+++ b/src/view.c
@@ -543,3 +543,43 @@ teco_view_glyphs2bytes_relative(teco_view_t *ctx, gsize pos, teco_int_t n)
 	/* SCI_POSITIONRELATIVE may return 0 even if the offset is valid */
 	return res ? : n > 0 ? -1 : teco_view_bytes2glyphs(ctx, pos)+n >= 0 ? 0 : -1;
 }
+
+/**
+ * Get the codepoint at a given byte offset of a view's document.
+ *
+ * @param ctx The view to operate on.
+ * @param pos Byte offset of the character's first byte.
+ * @param len Length of the document in bytes (bounds the bytes read).
+ * @return The requested codepoint. In UTF-8 encoded documents, this
+ *   is the result of g_utf8_get_char_validated() and might be -2
+ *   (text ends within a sequence) or -1 (any other invalid sequence).
+ */
+teco_int_t
+teco_view_get_character(teco_view_t *ctx, gsize pos, gsize len)
+{
+	if (teco_view_ssm(ctx, SCI_GETCODEPAGE, 0, 0) != SC_CP_UTF8)
+		/*
+		 * We don't support the Asian multi-byte encodings,
+		 * so everything else is a single-byte codepage.
+		 * NOTE: Internally, the character is cast to signed char and
+		 * may become negative, hence the cast to guchar.
+		 */
+		return (guchar)teco_view_ssm(ctx, SCI_GETCHARAT, pos, 0);
+
+	gchar buf[4+1];
+	struct Sci_TextRangeFull range = {
+		.chrg = {pos, MIN(len, pos+sizeof(buf)-1)},
+		.lpstrText = buf
+	};
+	/*
+	 * Probably faster than SCI_GETRANGEPOINTER+SCI_GETGAPPOSITION
+	 * or repeatedly calling SCI_GETCHARAT.
+	 */
+	teco_view_ssm(ctx, SCI_GETTEXTRANGEFULL, 0, (sptr_t)&range);
+	/*
+	 * The gint32 cast keeps the -1/-2 error values negative.
+	 * The sign bit in UCS-4/UTF-32 is unused, so this will even
+	 * suffice if TECO_INTEGER == 32.
+	 */
+	return (gint32)g_utf8_get_char_validated(buf, -1);
+}
diff --git a/src/view.h b/src/view.h
index a395dcf..882a33c 100644
--- a/src/view.h
+++ b/src/view.h
@@ -74,3 +74,5 @@ void teco_view_free(teco_view_t *ctx);
 gssize teco_view_glyphs2bytes(teco_view_t *ctx, teco_int_t pos);
 teco_int_t teco_view_bytes2glyphs(teco_view_t *ctx, gsize pos);
 gssize teco_view_glyphs2bytes_relative(teco_view_t *ctx, gsize pos, teco_int_t n);
+
+teco_int_t teco_view_get_character(teco_view_t *ctx, gsize pos, gsize len);