From 13f5fd77bbc528862f295f9e7196f3ff709d185a Mon Sep 17 00:00:00 2001
From: Robin Haberkorn <rhaberkorn@fmsbw.de>
Date: Sun, 21 Jun 2026 21:42:12 +0200
Subject: Unicode builds now expect UTF-8 strings

* They are built with `-DREGEX_UTF8` instead of `-DREGEX_WCHAR`.
  Functions are called reg_ucomp() and reg_uexec() instead for consistency.
  The library is now called libhsurex.so instead of libhswrex.so.
* The `chr` type is now always `unsigned char`.
  As a result many other uses of the `chr` type had to be changed to pchr
  (which is always large enough to hold a byte or wide character).
  Generally we try to keep code changes as small as possible since
  we may have to backport changes from the Tcl codebase or contribute
  patches to the Tcl project.
---
 regc_lex.c | 40 ++++++++++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 12 deletions(-)

(limited to 'regc_lex.c')

diff --git a/regc_lex.c b/regc_lex.c
index 4be02c6..ae71884 100644
--- a/regc_lex.c
+++ b/regc_lex.c
@@ -32,6 +32,7 @@
 /* scanning macros (know about v) */
 #define	ATEOS()		(v->now >= v->stop)
 #define	HAVE(n)		(v->stop - v->now >= (n))
+/* these compare single bytes, so they work only for ASCII characters */
 #define	NEXT1(c)	(!ATEOS() && *v->now == CHR(c))
 #define	NEXT2(a,b)	(HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
 #define	NEXT3(a,b,c) \
@@ -45,6 +46,17 @@
 #define	FAILW(e)	return (ERR(e), 0)	/* ERR does SET(EOS) */
 #define	LASTTYPE(t)	(v->lasttype == (t))
 
+/* return and skip the next (unicode) character; NUL/invalid bytes are consumed as-is */
+#ifdef REGEX_UTF8
+#define SKIPCHR(x)	do { \
+	wchar_t skipwc_ = *v->now; /* fallback if mbtowc() yields 0 (NUL) or -1 (invalid) */ \
+	int skiplen_ = mbtowc(&skipwc_, (const char *)v->now, v->stop - v->now); \
+	(x) = skipwc_; v->now += (skiplen_ > 0) ? skiplen_ : 1; /* always advance */ \
+} while (0)
+#else
+#define SKIPCHR(x)	do x = *v->now++; while (0)
+#endif
+
 /* lexical contexts */
 #define	L_ERE	1	/* mainline ERE/ARE */
 #define	L_BRE	2	/* mainline BRE */
@@ -292,7 +304,7 @@ static int			/* 1 normal, 0 failure */
 next(
     struct vars *v)
 {
-    chr c;
+    pchr c;
 
     /*
      * Errors yield an infinite sequence of failures.
@@ -371,7 +383,7 @@ next(
      * Okay, time to actually get a character.
      */
 
-    c = *v->now++;
+    SKIPCHR(c);
 
     /*
      * Deal with the easy contexts, punt EREs to code below.
@@ -697,11 +709,12 @@ next(
 
     assert(!ATEOS());
     if (!(v->cflags&REG_ADVF)) {/* only AREs have non-trivial escapes */
-	if (iscalnum(*v->now)) {
+	SKIPCHR(c);
+	if (iscalnum(c)) {
 	    NOTE(REG_UBSALNUM);
 	    NOTE(REG_UUNSPEC);
 	}
-	RETV(PLAIN, *v->now++);
+	RETV(PLAIN, c);
     }
     (DISCARD)lexescape(v);
     if (ISERR()) {
@@ -741,7 +754,7 @@ static int			/* not actually used, but convenient for RETV */
 lexescape(
     struct vars *v)
 {
-    chr c;
+    pchr c;
     static chr alert[] = {
 	CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
     };
@@ -753,7 +766,7 @@ lexescape(
     assert(v->cflags&REG_ADVF);
 
     assert(!ATEOS());
-    c = *v->now++;
+    SKIPCHR(c);
     if (!iscalnum(c)) {
 	RETV(PLAIN, c);
     }
@@ -777,7 +790,8 @@ lexescape(
 	if (ATEOS()) {
 	    FAILW(REG_EESCAPE);
 	}
-	RETV(PLAIN, (chr)(*v->now++ & 037));
+	SKIPCHR(c);
+	RETV(PLAIN, c & 037);
 	break;
     case CHR('d'):
 	NOTE(REG_ULOCALE);
@@ -911,6 +925,8 @@ lexescape(
  - lexdigits - slurp up digits and return chr value
  ^ static chr lexdigits(struct vars *, int, int, int);
  */
+/* FIXME: Perhaps directly return unsigned int. */
+/* Why should we be restricted to 0-255? */
 static chr			/* chr value; errors signalled via ERR */
 lexdigits(
     struct vars *v,
@@ -972,7 +988,7 @@ brenext(
     struct vars *v,
     pchr pc)
 {
-    chr c = (chr)pc;
+    pchr c = pc;
 
     switch (c) {
     case CHR('*'):
@@ -1039,7 +1055,7 @@ brenext(
 	FAILW(REG_EESCAPE);
     }
 
-    c = *v->now++;
+    SKIPCHR(c);
     switch (c) {
     case CHR('{'):
 	INTOCON(L_BBND);
@@ -1147,7 +1163,7 @@ ch(void)
  * use that it hardly matters.
  ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr);
  */
-static chr
+static pchr
 chrnamed(
     struct vars *v,
     const chr *startp,		/* start of name */
@@ -1166,12 +1182,12 @@ chrnamed(
     v->err = errsave;
 
     if (e != 0) {
-	return (chr)lastresort;
+	return (pchr)lastresort;
     }
 
     cv = range(v, c, c, 0);
     if (cv->nchrs == 0) {
-	return (chr)lastresort;
+	return (pchr)lastresort;
     }
     return cv->chrs[0];
 }
-- 
cgit v1.2.3