From 13f5fd77bbc528862f295f9e7196f3ff709d185a Mon Sep 17 00:00:00 2001
From: Robin Haberkorn <rhaberkorn@fmsbw.de>
Date: Sun, 21 Jun 2026 21:42:12 +0200
Subject: Unicode builds now expect UTF-8 strings

* They are built with `-DREGEX_UTF8` instead of `-DREGEX_WCHAR`.
  For consistency, the functions are now called reg_ucomp() and reg_uexec().
  The library is now called libhsurex.so instead of libhswrex.so.
* The `chr` type is now always `unsigned char`.
  As a result, many other uses of the `chr` type had to be changed to `pchr`
  (which is always large enough to hold a byte or wide character).
  Generally we try to keep code changes as small as possible since
  we may have to backport changes from the Tcl codebase or contribute
  patches to the Tcl project.
---
 regcomp.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'regcomp.c')

diff --git a/regcomp.c b/regcomp.c
index 8ff77ad..c00e19e 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -86,7 +86,7 @@ static chr newline(NOPARMS);
 #ifdef REG_DEBUG
 static const chr *ch(NOPARMS);
 #endif
-static chr chrnamed(struct vars *, const chr *, const chr *, pchr);
+static pchr chrnamed(struct vars *, const chr *, const chr *, pchr);
 /* === regc_color.c === */
 static void initcm(struct vars *, struct colormap *);
 static void freecm(struct colormap *);
@@ -193,7 +193,7 @@ struct vars {
     int cflags;			/* copy of compile flags */
     int lasttype;		/* type of previous token */
     int nexttype;		/* type of next token */
-    chr nextvalue;		/* value (if any) of next token */
+    pchr nextvalue;		/* value (if any) of next token */
     int lexcon;			/* lexical context type (see lex.c) */
     int nsubexp;		/* subexpression count */
     struct subre **subs;	/* subRE pointer vector */
@@ -229,6 +229,12 @@ struct vars {
 #define	NOTE(b)	(v->re->re_info |= (b))		/* note visible condition */
 #define	EMPTYARC(x, y)	newarc(v->nfa, EMPTY, 0, x, y)
 
+#ifdef REGEX_UTF8	/* expand character c into the byte buffer buf; yields the byte count */
+#define DECODECHR(buf, c) wctomb((char *)(buf), (c))	/* NOTE(review): wctomb() is locale-dependent and returns -1 on failure; callers store the result in a size_t -- confirm this is checked */
+#else			/* store c as a single chr in buf[0] */
+#define DECODECHR(buf, c) ((buf)[0] = (c), 1)	/* length is always 1 */
+#endif
+
 /* token type codes, some also used as NFA arc types */
 #define	EMPTY	'n'		/* no token present */
 #define	EOS	'e'		/* end of string */
@@ -1458,7 +1464,9 @@ brackpart(
     celt startc, endc;
     struct cvec *cv;
     const chr *startp, *endp;
-    chr c[1];
+    chr buf[MB_LEN_MAX];
+    size_t buf_len;
+    pchr c;
 
     /*
      * Parse something, get rid of special cases, take shortcuts.
@@ -1470,7 +1478,7 @@ brackpart(
 	return;
 	break;
     case PLAIN:
-	c[0] = v->nextvalue;
+	c = v->nextvalue;
 	NEXT();
 
 	/*
@@ -1478,10 +1486,11 @@ brackpart(
 	 */
 
 	if (!SEE(RANGE)) {
-	    onechr(v, c[0], lp, rp);
+	    onechr(v, c, lp, rp);
 	    return;
 	}
-	startc = element(v, c, c+1);
+	buf_len = DECODECHR(buf, c);
+	startc = element(v, buf, buf+buf_len);
 	NOERR();
 	break;
     case COLLEL:
@@ -1525,9 +1534,9 @@ brackpart(
 	switch (v->nexttype) {
 	case PLAIN:
 	case RANGE:
-	    c[0] = v->nextvalue;
+	    buf_len = DECODECHR(buf, v->nextvalue);
 	    NEXT();
-	    endc = element(v, c, c+1);
+	    endc = element(v, buf, buf+buf_len);
 	    NOERR();
 	    break;
 	case COLLEL:
@@ -1623,8 +1632,8 @@ dovec(
     struct state *lp,
     struct state *rp)
 {
-    chr ch, from, to;
-    const chr *p;
+    pchr ch, from, to;
+    const pchr *p;
     int i;
 
     for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
-- 
cgit v1.2.3