aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/string-utils.h
diff options
context:
space:
mode:
authorRobin Haberkorn <robin.haberkorn@googlemail.com>2024-09-11 14:30:24 +0200
committerRobin Haberkorn <robin.haberkorn@googlemail.com>2024-09-11 16:14:27 +0200
commit2a050759ab621b87d0782cc8235907a1757b46cc (patch)
treecde0c666146f833fc948a13c22056061f62d2619 /src/string-utils.h
parent68578072bfaf6054a96bb6bcedfccb6e56a508fe (diff)
downloadsciteco-2a050759ab621b87d0782cc8235907a1757b46cc.tar.gz
fixed searches in single-byte encoded documents
* While code is guaranteed to be valid UTF-8, the same cannot be said about the result of string building.
* The search pattern can end up with invalid Unicode bytes even when searching in UTF-8 buffers, e.g. if ^EQq inserts garbage. There are currently no checks for that.
* When searching in a raw buffer, it must be possible to search for arbitrary bytes (^EUq). Since teco_pattern2regexp() always expected clean UTF-8 input, it would sometimes skip over too many bytes and could even crash.
* Instead, teco_pattern2regexp() now takes the <S> target codepage into account.
Diffstat (limited to 'src/string-utils.h')
-rw-r--r--src/string-utils.h13
1 files changed, 13 insertions, 0 deletions
diff --git a/src/string-utils.h b/src/string-utils.h
index 1b4957f..806140e 100644
--- a/src/string-utils.h
+++ b/src/string-utils.h
@@ -197,6 +197,19 @@ teco_string_rindex(const teco_string_t *str, gchar chr)
const gchar *teco_string_last_occurrence(const teco_string_t *str, const gchar *chars);
+/**
+ * Check whether a string is entirely valid UTF-8, tolerating embedded null bytes.
+ *
+ * The string is validated run by run: whenever validation stops at a
+ * null byte, that byte is skipped and validation resumes right behind it.
+ * Stopping at any other byte means the string is not valid UTF-8.
+ *
+ * @param str Length-delimited string to validate.
+ * @return TRUE if validation reached the end of the string, FALSE otherwise.
+ *
+ * @note there is g_utf8_validate_len() in Glib 2.60
+ */
+static inline gboolean
+teco_string_validate_utf8(const teco_string_t *str)
+{
+	const gchar *cur = str->data;
+
+	for (;;) {
+		gsize remaining = str->len - (cur - str->data);
+
+		/*
+		 * Done if the rest validated cleanly or if validation
+		 * stopped at something other than a null byte.
+		 * `cur` is updated to where validation stopped.
+		 */
+		if (g_utf8_validate(cur, remaining, &cur) || *cur != '\0')
+			break;
+
+		/* stopped at a null byte: step over it and keep validating */
+		cur++;
+	}
+
+	return cur - str->data == str->len;
+}
+
/** @memberof teco_string_t */
static inline void
teco_string_clear(teco_string_t *str)