Unicode builds now expect UTF-8 strings

* They are built with `-DREGEX_UTF8` instead of `-DREGEX_WCHAR`. For consistency, the functions are now called reg_ucomp() and reg_uexec(), and the library is now called libhsurex.so instead of libhswrex.so.
* The `chr` type is now always `unsigned char`. As a result, many other uses of the `chr` type had to be changed to `pchr` (which is always large enough to hold a byte or wide character). Generally, we try to keep code changes as small as possible, since we may have to backport changes from the Tcl codebase or contribute patches to the Tcl project.
author: Robin Haberkorn <rhaberkorn@fmsbw.de> 2026-06-21 21:42:12 +0200
committer: Robin Haberkorn <rhaberkorn@fmsbw.de> 2026-06-21 22:05:37 +0200
commit: 13f5fd77bbc528862f295f9e7196f3ff709d185a (patch)
tree: 9f9ce051bc8adf61e5ae2b4e94ccb1331bfdbfa5 /regexec.c
parent: 10b47c9226b6267e5a4be4e79fe79314bf969025 (diff)
download: terex-13f5fd77bbc528862f295f9e7196f3ff709d185a.tar.gz
1 files changed, 50 insertions, 4 deletions
diff --git a/regexec.c b/regexec.c
index 24edb41..2f8a234 100644
--- a/regexec.c
+++ b/regexec.c
@@ -155,6 +155,52 @@ static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
 /* automatically gathered by fwd; do not hand-edit */
 /* =====^!^===== end forwards =====^!^===== */
 
+#ifdef REGEX_UTF8
+
+static inline chr *
+nextchr(chr *s)
+{
+    unsigned char c = (unsigned char)*s;
+    /* Return the pointer just past the UTF-8 sequence at s; the length comes from the lead byte only. */
+    if (c < 0x80)        /* 0xxxxxxx */
+	return s + 1;
+    if ((c & 0xE0) == 0xC0) /* 110xxxxx */
+	return s + 2;
+    if ((c & 0xF0) == 0xE0) /* 1110xxxx */
+	return s + 3;
+    if ((c & 0xF8) == 0xF0) /* 11110xxx */
+	return s + 4;
+
+    /* invalid lead byte, including stray continuation byte: step over that single byte */
+    return s + 1; /* NOTE(review): continuation bytes and the buffer end are never inspected here */
+}
+
+static inline chr *
+prevchr(chr *s)
+{
+    do { /* step back one byte, and keep stepping while on a 10xxxxxx continuation byte */
+	--s;
+    } while (((unsigned char)*s & 0xC0) == 0x80);
+    /* NOTE(review): no lower bound is checked; assumes a non-continuation byte exists before s */
+    return s;
+}
+
+static inline pchr
+getchr(const chr *s, const chr *end)
+{ /* Decode the character starting at s, letting mbtowc() read at most end - s bytes. */
+    wchar_t c = 0; /* remains 0 unless mbtowc() stores a decoded character */
+    mbtowc(&c, (const char *)s, end - s); /* NOTE(review): return value ignored; mbtowc() decodes per the current LC_CTYPE locale - confirm it is UTF-8 */
+    return c;
+}
+
+#else /* !REGEX_UTF8 */
+
+static inline chr *nextchr(chr *s) { return s+1; } /* non-UTF-8 build: advance by exactly one chr */
+static inline chr *prevchr(chr *s) { return s-1; } /* non-UTF-8 build: step back by exactly one chr */
+static inline pchr getchr(const chr *s, const chr *end) { return *s; } /* non-UTF-8 build: the chr at s is the character; end is unused */
+
+#endif
+
 /*
  - exec - match regular expression
  ^ int exec(regex_t *, const chr *, size_t, rm_detail_t *,
@@ -353,7 +399,7 @@ find(
     d = newdfa(v, cnfa, cm, &v->dfa1);
     assert(!(ISERR() && d != NULL));
     NOERR();
-    for (begin = open; begin <= close; begin++) {
+    for (begin = open; begin <= close; begin = nextchr(begin)) {
 	MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
 	if (shorter) {
 	    end = shortest(v, d, begin, begin, v->stop, NULL, &hitend);
@@ -478,7 +524,7 @@ cfindloop(
 	open = cold;
 	cold = NULL;
 	MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
-	for (begin = open; begin <= close; begin++) {
+	for (begin = open; begin <= close; begin = nextchr(begin)) {
 	    MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
 	    estart = begin;
 	    estop = v->stop;
@@ -525,9 +571,9 @@ cfindloop(
 		 */
 
 		if (shorter) {
-		    estart = end + 1;
+		    estart = nextchr(end);
 		} else {
-		    estop = end - 1;
+		    estop = prevchr(end);
 		}
 	    }
 	}
author	Robin Haberkorn <rhaberkorn@fmsbw.de>	2026-06-21 21:42:12 +0200
committer	Robin Haberkorn <rhaberkorn@fmsbw.de>	2026-06-21 22:05:37 +0200
commit	13f5fd77bbc528862f295f9e7196f3ff709d185a (patch)
tree	9f9ce051bc8adf61e5ae2b4e94ccb1331bfdbfa5 /regexec.c
parent	10b47c9226b6267e5a4be4e79fe79314bf969025 (diff)
download	terex-13f5fd77bbc528862f295f9e7196f3ff709d185a.tar.gz