about summary refs log tree commit diff homepage
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/core-commands.c 27
-rw-r--r-- src/interface.h 6
-rw-r--r-- src/qreg-commands.c 6
-rw-r--r-- src/qreg.c 21
-rw-r--r-- src/view.c 40
-rw-r--r-- src/view.h 2
6 files changed, 54 insertions, 48 deletions
diff --git a/src/core-commands.c b/src/core-commands.c
index 951e001..98097bb 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -1045,32 +1045,7 @@ teco_state_start_get(teco_machine_main_t *ctx, GError **error)
return;
}
- teco_int_t ret;
-
- if (teco_interface_ssm(SCI_GETCODEPAGE, 0, 0) == SC_CP_UTF8) {
- gchar buf[4+1];
- struct Sci_TextRangeFull range = {
- .chrg = {get_pos, MIN(len, get_pos+sizeof(buf)-1)},
- .lpstrText = buf
- };
- /*
- * Probably faster than SCI_GETRANGEPOINTER+SCI_GETGAPPOSITION
- * or repeatedly calling SCI_GETCHARAT.
- */
- teco_interface_ssm(SCI_GETTEXTRANGEFULL, 0, (sptr_t)&range);
- /*
- * Make sure that the -1/-2 error values are preserved.
- * The sign bit in UCS-4/UTF-32 is unused, so this will even
- * suffice if TECO_INTEGER == 32.
- */
- ret = (gint32)g_utf8_get_char_validated(buf, -1);
- } else {
- // FIXME: Everything else is a single-byte encoding?
- /* internally, the character is casted to signed char */
- ret = (guchar)teco_interface_ssm(SCI_GETCHARAT, get_pos, 0);
- }
-
- teco_expressions_push(ret);
+ teco_expressions_push(teco_interface_get_character(get_pos, len));
}
static teco_state_t *
diff --git a/src/interface.h b/src/interface.h
index c975525..cbe10bd 100644
--- a/src/interface.h
+++ b/src/interface.h
@@ -172,6 +172,12 @@ teco_glyphs2bytes_relative(gsize pos, teco_int_t n)
return teco_view_glyphs2bytes_relative(teco_interface_current_view, pos, n);
}
+static inline teco_int_t
+teco_interface_get_character(gsize pos, gsize len)
+{
+ return teco_view_get_character(teco_interface_current_view, pos, len);
+}
+
/*
* The following functions are here for lack of a better place.
* They could also be in sciteco.h, but only if declared as non-inline
diff --git a/src/qreg-commands.c b/src/qreg-commands.c
index d7bfafe..09b2b90 100644
--- a/src/qreg-commands.c
+++ b/src/qreg-commands.c
@@ -310,8 +310,10 @@ teco_state_queryqreg_got_register(teco_machine_main_t *ctx, teco_qreg_t *qreg,
* Positions are handled like buffer positions \(em they
* begin at 0 up to the length of the string minus 1.
* An error is thrown for invalid positions.
- * If <q> is Unicode-encoded, -1 or -2 could be returned for
- * invalid byte sequences.
+ * If <q> is encoded as UTF-8 and the bytes at the requested
+ * position do not form a valid sequence, -1 is returned.
+ * A sequence that is merely cut short by a null byte or by
+ * the end of the string is returned as -2 instead.
* Both non-colon-modified forms of Q require register <q>
* to be defined and fail otherwise.
*
diff --git a/src/qreg.c b/src/qreg.c
index 4432cbf..c3ab1a5 100644
--- a/src/qreg.c
+++ b/src/qreg.c
@@ -284,27 +284,8 @@ teco_qreg_plain_get_character(teco_qreg_t *qreg, teco_int_t position,
"Position %" TECO_INT_FORMAT " out of range", position);
ret = FALSE;
/* make sure we still restore the current Q-Register */
- } else if (teco_view_ssm(teco_qreg_view, SCI_GETCODEPAGE, 0, 0) == SC_CP_UTF8) {
- gchar buf[4+1];
- struct Sci_TextRangeFull range = {
- .chrg = {off, MIN(len, off+sizeof(buf)-1)},
- .lpstrText = buf
- };
- /*
- * Probably faster than SCI_GETRANGEPOINTER+SCI_GETGAPPOSITION
- * or repeatedly calling SCI_GETCHARAT.
- */
- teco_view_ssm(teco_qreg_view, SCI_GETTEXTRANGEFULL, 0, (sptr_t)&range);
- /*
- * Make sure that the -1/-2 error values are preserved.
- * The sign bit in UCS-4/UTF-32 is unused, so this will even
- * suffice if TECO_INTEGER == 32.
- */
- *chr = (gint32)g_utf8_get_char_validated(buf, -1);
} else {
- // FIXME: Everything else is a single-byte encoding?
- /* internally, the character is casted to signed char */
- *chr = (guchar)teco_view_ssm(teco_qreg_view, SCI_GETCHARAT, off, 0);
+ *chr = teco_view_get_character(teco_qreg_view, off, len);
}
if (teco_qreg_current)
diff --git a/src/view.c b/src/view.c
index 4f959a3..291c06b 100644
--- a/src/view.c
+++ b/src/view.c
@@ -543,3 +543,43 @@ teco_view_glyphs2bytes_relative(teco_view_t *ctx, gsize pos, teco_int_t n)
/* SCI_POSITIONRELATIVE may return 0 even if the offset is valid */
return res ? : n > 0 ? -1 : teco_view_bytes2glyphs(ctx, pos)+n >= 0 ? 0 : -1;
}
+
+/**
+ * Get codepoint at given byte offset.
+ *
+ * @param ctx The view to operate on.
+ * @param pos The glyph's byte position
+ * @param len The length of the document in bytes
+ * @return The requested codepoint.
+ * In UTF-8 encoded documents, this might be -1 (malformed byte sequence)
+ * or -2 (partial sequence cut short by a null byte or the end of the fetched text).
+ */
+teco_int_t
+teco_view_get_character(teco_view_t *ctx, gsize pos, gsize len)
+{
+ if (teco_view_ssm(ctx, SCI_GETCODEPAGE, 0, 0) != SC_CP_UTF8)
+ /*
+ * We don't support the Asian multi-byte encodings,
+ * so everything else is a single-byte codepage.
+ * NOTE: Internally, the character is cast to signed char
+ * and may therefore become negative, hence the cast to guchar.
+ */
+ return (guchar)teco_view_ssm(ctx, SCI_GETCHARAT, pos, 0);
+
+ gchar buf[4+1];
+ struct Sci_TextRangeFull range = {
+ .chrg = {pos, MIN(len, pos+sizeof(buf)-1)},
+ .lpstrText = buf
+ };
+ /*
+ * Probably faster than SCI_GETRANGEPOINTER+SCI_GETGAPPOSITION
+ * or repeatedly calling SCI_GETCHARAT.
+ */
+ teco_view_ssm(ctx, SCI_GETTEXTRANGEFULL, 0, (sptr_t)&range);
+ /*
+ * Make sure that the -1/-2 error values are preserved.
+ * The sign bit in UCS-4/UTF-32 is unused, so this will even
+ * suffice if TECO_INTEGER == 32.
+ */
+ return (gint32)g_utf8_get_char_validated(buf, -1);
+}
diff --git a/src/view.h b/src/view.h
index a395dcf..882a33c 100644
--- a/src/view.h
+++ b/src/view.h
@@ -74,3 +74,5 @@ void teco_view_free(teco_view_t *ctx);
gssize teco_view_glyphs2bytes(teco_view_t *ctx, teco_int_t pos);
teco_int_t teco_view_bytes2glyphs(teco_view_t *ctx, gsize pos);
gssize teco_view_glyphs2bytes_relative(teco_view_t *ctx, gsize pos, teco_int_t n);
+
+teco_int_t teco_view_get_character(teco_view_t *ctx, gsize pos, gsize len);