Unicode builds now expect UTF-8 strings

* They are built with `-DREGEX_UTF8` instead of `-DREGEX_WCHAR`. For consistency, the functions are now called `reg_ucomp()` and `reg_uexec()`, and the library is now called `libhsurex.so` instead of `libhswrex.so`.
* The `chr` type is now always `unsigned char`. As a result, many other uses of the `chr` type had to be changed to `pchr` (which is always large enough to hold a byte or a wide character). Generally, we try to keep code changes as small as possible, since we may have to backport changes from the Tcl codebase or contribute patches to the Tcl project.
author: Robin Haberkorn <rhaberkorn@fmsbw.de> 2026-06-21 21:42:12 +0200
committer: Robin Haberkorn <rhaberkorn@fmsbw.de> 2026-06-21 22:05:37 +0200
commit: 13f5fd77bbc528862f295f9e7196f3ff709d185a (patch)
tree: 9f9ce051bc8adf61e5ae2b4e94ccb1331bfdbfa5 /regc_lex.c
parent: 10b47c9226b6267e5a4be4e79fe79314bf969025 (diff)
download: terex-13f5fd77bbc528862f295f9e7196f3ff709d185a.tar.gz
1 files changed, 28 insertions, 12 deletions
diff --git a/regc_lex.c b/regc_lex.c
index 4be02c6..ae71884 100644
--- a/regc_lex.c
+++ b/regc_lex.c
@@ -32,6 +32,7 @@
 /* scanning macros (know about v) */
 #define	ATEOS()		(v->now >= v->stop)
 #define	HAVE(n)		(v->stop - v->now >= (n))
+/* these compare single code units, so they only work for ASCII characters */
 #define	NEXT1(c)	(!ATEOS() && *v->now == CHR(c))
 #define	NEXT2(a,b)	(HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
 #define	NEXT3(a,b,c) \
@@ -45,6 +46,17 @@
 #define	FAILW(e)	return (ERR(e), 0)	/* ERR does SET(EOS) */
 #define	LASTTYPE(t)	(v->lasttype == (t))
 
+/* return and skip the next (unicode) char; consumes >= 1 byte even if mbtowc() gives 0 (NUL) or -1 */
+#ifdef REGEX_UTF8
+#define SKIPCHR(x)	do { \
+	wchar_t skipchr_wc = *v->now; /* fallback: raw byte if the sequence is undecodable */ \
+	int skipchr_len = mbtowc(&skipchr_wc, (const char *)v->now, v->stop - v->now); \
+	v->now += skipchr_len > 0 ? skipchr_len : 1; (x) = skipchr_wc; \
+} while (0)
+#else
+#define SKIPCHR(x)	do x = *v->now++; while (0)
+#endif
+
 /* lexical contexts */
 #define	L_ERE	1	/* mainline ERE/ARE */
 #define	L_BRE	2	/* mainline BRE */
@@ -292,7 +304,7 @@ static int			/* 1 normal, 0 failure */
 next(
     struct vars *v)
 {
-    chr c;
+    pchr c;
 
     /*
      * Errors yield an infinite sequence of failures.
@@ -371,7 +383,7 @@ next(
      * Okay, time to actually get a character.
      */
 
-    c = *v->now++;
+    SKIPCHR(c);
 
     /*
      * Deal with the easy contexts, punt EREs to code below.
@@ -697,11 +709,12 @@ next(
 
     assert(!ATEOS());
     if (!(v->cflags&REG_ADVF)) {/* only AREs have non-trivial escapes */
-	if (iscalnum(*v->now)) {
+	SKIPCHR(c);
+	if (iscalnum(c)) {
 	    NOTE(REG_UBSALNUM);
 	    NOTE(REG_UUNSPEC);
 	}
-	RETV(PLAIN, *v->now++);
+	RETV(PLAIN, c);
     }
     (DISCARD)lexescape(v);
     if (ISERR()) {
@@ -741,7 +754,7 @@ static int			/* not actually used, but convenient for RETV */
 lexescape(
     struct vars *v)
 {
-    chr c;
+    pchr c;
     static chr alert[] = {
 	CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
     };
@@ -753,7 +766,7 @@ lexescape(
     assert(v->cflags&REG_ADVF);
 
     assert(!ATEOS());
-    c = *v->now++;
+    SKIPCHR(c);
     if (!iscalnum(c)) {
 	RETV(PLAIN, c);
     }
@@ -777,7 +790,8 @@ lexescape(
 	if (ATEOS()) {
 	    FAILW(REG_EESCAPE);
 	}
-	RETV(PLAIN, (chr)(*v->now++ & 037));
+	SKIPCHR(c);
+	RETV(PLAIN, c & 037);
 	break;
     case CHR('d'):
 	NOTE(REG_ULOCALE);
@@ -911,6 +925,8 @@ lexescape(
  - lexdigits - slurp up digits and return chr value
  ^ static chr lexdigits(struct vars *, int, int, int);
  */
+// FIXME: Perhaps directly return unsigned int.
+// Why should we be restricted to 0-255?
 static chr			/* chr value; errors signalled via ERR */
 lexdigits(
     struct vars *v,
@@ -972,7 +988,7 @@ brenext(
     struct vars *v,
     pchr pc)
 {
-    chr c = (chr)pc;
+    pchr c = pc;
 
     switch (c) {
     case CHR('*'):
@@ -1039,7 +1055,7 @@ brenext(
 	FAILW(REG_EESCAPE);
     }
 
-    c = *v->now++;
+    SKIPCHR(c);
     switch (c) {
     case CHR('{'):
 	INTOCON(L_BBND);
@@ -1147,7 +1163,7 @@ ch(void)
  * use that it hardly matters.
  ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr);
  */
-static chr
+static pchr
 chrnamed(
     struct vars *v,
     const chr *startp,		/* start of name */
@@ -1166,12 +1182,12 @@ chrnamed(
     v->err = errsave;
 
     if (e != 0) {
-	return (chr)lastresort;
+	return (pchr)lastresort;
     }
 
     cv = range(v, c, c, 0);
     if (cv->nchrs == 0) {
-	return (chr)lastresort;
+	return (pchr)lastresort;
     }
     return cv->chrs[0];
 }
author	Robin Haberkorn <rhaberkorn@fmsbw.de>	2026-06-21 21:42:12 +0200
committer	Robin Haberkorn <rhaberkorn@fmsbw.de>	2026-06-21 22:05:37 +0200
commit	13f5fd77bbc528862f295f9e7196f3ff709d185a (patch)
tree	9f9ce051bc8adf61e5ae2b4e94ccb1331bfdbfa5 /regc_lex.c
parent	10b47c9226b6267e5a4be4e79fe79314bf969025 (diff)
download	terex-13f5fd77bbc528862f295f9e7196f3ff709d185a.tar.gz