Unicode builds now expect UTF-8 strings

* They are built with `-DREGEX_UTF8` instead of `-DREGEX_WCHAR`. For consistency, the entry points are now called re_ucomp() and re_uexec() (instead of re_wcomp() and re_wexec()). The library is now called libhsurex.so instead of libhswrex.so.
* The `chr` type is now always `unsigned char`. As a result, many other uses of the `chr` type had to be changed to `pchr` (which is always large enough to hold a byte or a wide character). Generally, we try to keep code changes as small as possible, since we may have to backport changes from the Tcl codebase or contribute patches to the Tcl project.
author: Robin Haberkorn <rhaberkorn@fmsbw.de> 2026-06-21 21:42:12 +0200
committer: Robin Haberkorn <rhaberkorn@fmsbw.de> 2026-06-21 22:05:37 +0200
commit: 13f5fd77bbc528862f295f9e7196f3ff709d185a (patch)
tree: 9f9ce051bc8adf61e5ae2b4e94ccb1331bfdbfa5
parent: 10b47c9226b6267e5a4be4e79fe79314bf969025 (diff)
download: terex-13f5fd77bbc528862f295f9e7196f3ff709d185a.tar.gz
15 files changed, 252 insertions, 443 deletions
diff --git a/Makefile b/Makefile
index 384a3dd..6a4bdb9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,16 +1,16 @@
 CC = gcc
 # Either this one
-#CFLAGS = -DREGEX_STANDALONE -fPIC -DREG_DEBUG -g
+CFLAGS = -Wall -DREGEX_STANDALONE -fPIC -DREG_DEBUG -g
 # Or this one
-CFLAGS = -DREGEX_STANDALONE -fPIC -D_NDEBUG -O3
+#CFLAGS = -Wall -DREGEX_STANDALONE -fPIC -D_NDEBUG -O3
 LDFLAGS = -shared
-SRCS = regcomp.c regexec.c regerror.c regfree.c regalone.c
+SRCS = regcomp.c regexec.c regerror.c regfree.c
 OBJS = $(SRCS:.c=.o)
-BINS = libhsrex.so libhswrex.so
+BINS = libhsrex.so libhsurex.so
 all:
 	make libhsrex.so
 	rm -f $(OBJS)
-	make "CFLAGS=$(CFLAGS) -DREGEX_WCHAR" libhswrex.so
+	make "CFLAGS=$(CFLAGS) -DREGEX_UTF8" libhsurex.so
 $(BINS): $(OBJS)
 	$(CC) $(LDFLAGS) -o $@ $(OBJS)
 clean:
diff --git a/README b/README
index 7a823b9..c80c5b1 100644
--- a/README
+++ b/README
@@ -19,7 +19,7 @@ regtest_hsrex.sh and execute again.
 	# Either this one
 	$CC -I. -I$H/inc -L. -lhsrex -o $rgbin $rgsrc
 	# or this one
-	#$CC -I. -I$H/inc -L. -lhswrex -DREGEX_WCHAR -o $rgbin $rgsrc
+	#$CC -I. -I$H/inc -L. -lhsurex -DREGEX_UTF8 -o $rgbin $rgsrc
 
 You would like to test with debuging information. Uncomment the proper line in
 the Makefile and rebuild.
@@ -28,14 +28,14 @@ the Makefile and rebuild.
 	# Or this one
 	CFLAGS = -DREGEX_STANDALONE -fPIC -D_NDEBUG -O3
 
-Two libraries are provided, libhsrex.so and libhswrex.so. The first one is for
+Two libraries are provided, libhsrex.so and libhsurex.so. The first one is for
-ascii character code and the second one for wide characters. Both libraries
+ASCII character codes and the second one for UTF-8 strings. Both libraries
 were tested in Linux and Solaris. Compiling and runing in Window$ should be
 easy.
 
 The following entry point where defined in each library:
-re_comp()	(re_wcomp() for wide char) to compile a RE
-re_exec()	(re_wexec() for wide char) to parse data against a compiled RE.
+re_comp()	(re_ucomp() for UTF-8) to compile a RE
+re_exec()	(re_uexec() for UTF-8) to parse data against a compiled RE.
 regfree()	To dispose the memory of a compiled RE.
 regerror()	Translates error codes to ascii strings.
 
diff --git a/regalone.c b/regalone.c
deleted file mode 100644
index e0a5fcc..0000000
--- a/regalone.c
+++ /dev/null
@@ -1,267 +0,0 @@
-#ifdef REGEX_WCHAR
-
-#include "regcustom.h"
-
-
-
-/*
- *----------------------------------------------------------------------
- *
- * Tcl_DStringInit --
- *
- *	Initializes a dynamic string, discarding any previous contents of the
- *	string (Tcl_DStringFree should have been called already if the dynamic
- *	string was previously in use).
- *
- * Results:
- *	None.
- *
- * Side effects:
- *	The dynamic string is initialized to be empty.
- *
- *----------------------------------------------------------------------
- */
-
-void
-Tcl_DStringInit(
-    Tcl_DString *dsPtr)		/* Pointer to structure for dynamic string. */
-{
-    dsPtr->string = dsPtr->staticSpace;
-    dsPtr->length = 0;
-    dsPtr->spaceAvl = TCL_DSTRING_STATIC_SIZE;
-    dsPtr->staticSpace[0] = '\0';
-}
-
-
-/*
- *----------------------------------------------------------------------
- *
- * Tcl_DStringSetLength --
- *
- *	Change the length of a dynamic string. This can cause the string to
- *	either grow or shrink, depending on the value of length.
- *
- * Results:
- *	None.
- *
- * Side effects:
- *	The length of dsPtr is changed to length and a null byte is stored at
- *	that position in the string. If length is larger than the space
- *	allocated for dsPtr, then a panic occurs.
- *
- *----------------------------------------------------------------------
- */
-
-void
-Tcl_DStringSetLength(
-    Tcl_DString *dsPtr,		/* Structure describing dynamic string. */
-    int length)			/* New length for dynamic string. */
-{
-    int newsize;
-
-    if (length < 0) {
-	length = 0;
-    }
-    if (length >= dsPtr->spaceAvl) {
-	/*
-	 * There are two interesting cases here. In the first case, the user
-	 * may be trying to allocate a large buffer of a specific size. It
-	 * would be wasteful to overallocate that buffer, so we just allocate
-	 * enough for the requested size plus the trailing null byte. In the
-	 * second case, we are growing the buffer incrementally, so we need
-	 * behavior similar to Tcl_DStringAppend. The requested length will
-	 * usually be a small delta above the current spaceAvl, so we'll end
-	 * up doubling the old size. This won't grow the buffer quite as
-	 * quickly, but it should be close enough.
-	 */
-
-	newsize = dsPtr->spaceAvl * 2;
-	if (length < newsize) {
-	    dsPtr->spaceAvl = newsize;
-	} else {
-	    dsPtr->spaceAvl = length + 1;
-	}
-	if (dsPtr->string == dsPtr->staticSpace) {
-	    char *newString = ckalloc((unsigned) dsPtr->spaceAvl);
-
-	    memcpy(newString, dsPtr->string, (size_t) dsPtr->length);
-	    dsPtr->string = newString;
-	} else {
-	    dsPtr->string = (char *) ckrealloc((void *) dsPtr->string,
-		    (size_t) dsPtr->spaceAvl);
-	}
-    }
-    dsPtr->length = length;
-    dsPtr->string[length] = 0;
-}
-
-
-/*
- *----------------------------------------------------------------------
- *
- * Tcl_DStringFree --
- *
- *	Frees up any memory allocated for the dynamic string and reinitializes
- *	the string to an empty state.
- *
- * Results:
- *	None.
- *
- * Side effects:
- *	The previous contents of the dynamic string are lost, and the new
- *	value is an empty string.
- *
- *----------------------------------------------------------------------
- */
-
-void
-Tcl_DStringFree(
-    Tcl_DString *dsPtr)		/* Structure describing dynamic string. */
-{
-    if (dsPtr->string != dsPtr->staticSpace) {
-	ckfree(dsPtr->string);
-    }
-    dsPtr->string = dsPtr->staticSpace;
-    dsPtr->length = 0;
-    dsPtr->spaceAvl = TCL_DSTRING_STATIC_SIZE;
-    dsPtr->staticSpace[0] = '\0';
-}
-
-
-
-/*
- * Unicode characters less than this value are represented by themselves in
- * UTF-8 strings.
- */
-
-#define UNICODE_SELF	0x80
-
-
-/*
- *---------------------------------------------------------------------------
- *
- * Tcl_UniCharToUtf --
- *
- *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
- *	provided buffer. Equivalent to Plan 9 runetochar().
- *
- * Results:
- *	The return values is the number of bytes in the buffer that were
- *	consumed.
- *
- * Side effects:
- *	None.
- *
- *---------------------------------------------------------------------------
- */
-
-INLINE int
-Tcl_UniCharToUtf(
-    int ch,			/* The Tcl_UniChar to be stored in the
-				 * buffer. */
-    char *buf)			/* Buffer in which the UTF-8 representation of
-				 * the Tcl_UniChar is stored. Buffer must be
-				 * large enough to hold the UTF-8 character
-				 * (at most TCL_UTF_MAX bytes). */
-{
-    if ((ch > 0) && (ch < UNICODE_SELF)) {
-	buf[0] = (char) ch;
-	return 1;
-    }
-    if (ch >= 0) {
-	if (ch <= 0x7FF) {
-	    buf[1] = (char) ((ch | 0x80) & 0xBF);
-	    buf[0] = (char) ((ch >> 6) | 0xC0);
-	    return 2;
-	}
-	if (ch <= 0xFFFF) {
-	three:
-	    buf[2] = (char) ((ch | 0x80) & 0xBF);
-	    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
-	    buf[0] = (char) ((ch >> 12) | 0xE0);
-	    return 3;
-	}
-
-#if TCL_UTF_MAX > 3
-	if (ch <= 0x1FFFFF) {
-	    buf[3] = (char) ((ch | 0x80) & 0xBF);
-	    buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
-	    buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
-	    buf[0] = (char) ((ch >> 18) | 0xF0);
-	    return 4;
-	}
-	if (ch <= 0x3FFFFFF) {
-	    buf[4] = (char) ((ch | 0x80) & 0xBF);
-	    buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
-	    buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
-	    buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
-	    buf[0] = (char) ((ch >> 24) | 0xF8);
-	    return 5;
-	}
-	if (ch <= 0x7FFFFFFF) {
-	    buf[5] = (char) ((ch | 0x80) & 0xBF);
-	    buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
-	    buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
-	    buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
-	    buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
-	    buf[0] = (char) ((ch >> 30) | 0xFC);
-	    return 6;
-	}
-#endif
-    }
-
-    ch = 0xFFFD;
-    goto three;
-}
-
-/*
- *---------------------------------------------------------------------------
- *
- * Tcl_UniCharToUtfDString --
- *
- *	Convert the given Unicode string to UTF-8.
- *
- * Results:
- *	The return value is a pointer to the UTF-8 representation of the
- *	Unicode string. Storage for the return value is appended to the end of
- *	dsPtr.
- *
- * Side effects:
- *	None.
- *
- *---------------------------------------------------------------------------
- */
-
-char *
-Tcl_UniCharToUtfDString(
-    const Tcl_UniChar *uniStr,	/* Unicode string to convert to UTF-8. */
-    int uniLength,		/* Length of Unicode string in Tcl_UniChars
-				 * (must be >= 0). */
-    Tcl_DString *dsPtr)		/* UTF-8 representation of string is appended
-				 * to this previously initialized DString. */
-{
-    const Tcl_UniChar *w, *wEnd;
-    char *p, *string;
-    int oldLength;
-
-    /*
-     * UTF-8 string length in bytes will be <= Unicode string length *
-     * TCL_UTF_MAX.
-     */
-
-    oldLength = Tcl_DStringLength(dsPtr);
-    Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX);
-    string = Tcl_DStringValue(dsPtr) + oldLength;
-
-    p = string;
-    wEnd = uniStr + uniLength;
-    for (w = uniStr; w < wEnd; ) {
-	p += Tcl_UniCharToUtf(*w, p);
-	w++;
-    }
-    Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
-
-    return string;
-}
-
-#endif		/* REGEX_WCHAR	*/
diff --git a/regalone.h b/regalone.h
index 940c11d..e05fdb8 100644
--- a/regalone.h
+++ b/regalone.h
@@ -2,22 +2,18 @@
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdint.h>
 #include <string.h>
 
 #ifndef REGEX_STANDALONE
 # define	REGEX_STANDALONE
 #endif
 
-#ifdef REGEX_WCHAR
-#	include <wctype.h>
-#	include <wchar.h>
-	typedef wchar_t chr;
-	typedef chr Tcl_UniChar;
-#else
-#	include <ctype.h>
-	typedef unsigned char chr;
-	typedef wchar_t Tcl_UniChar;
-#endif
+#include <wctype.h>
+#include <ctype.h>
+// FIXME: Should better be a signed char?
+typedef unsigned char chr;
+//typedef wchar_t Tcl_UniChar;
 
 /*
  * In The standalone version we are more concerned with performance,
@@ -34,7 +30,12 @@
 #define ckrealloc(p,n)	realloc(p,n)
 #define ckfree(p)	free(p)
 
-#ifdef REGEX_WCHAR
+// FIXME: Perhaps get rid of these references completely.
+#define Tcl_DStringInit(ds)			do (void)(ds); while (0)
+#define Tcl_UniCharToUtfDString(s,l,ds)		((char *)(s))
+#define Tcl_DStringFree(ds)			do (void)(ds); while (0)
+
+#ifdef REGEX_UTF8
 #	define Tcl_UniCharToLower(c)		towlower(c)
 #	define Tcl_UniCharToUpper(c)		towupper(c)
 #	define Tcl_UniCharToTitle(c)		towupper(c)
@@ -43,9 +44,6 @@
 #	define Tcl_UniCharIsDigit(c)		iswdigit(c)
 #	define Tcl_UniCharIsSpace(c)		iswspace(c)
 #else
-#	define Tcl_DStringInit(ds)
-#	define Tcl_UniCharToUtfDString(s,l,ds)	(s)
-#	define Tcl_DStringFree(ds)
 #	define Tcl_UniCharToLower(c)		tolower(c)
 #	define Tcl_UniCharToUpper(c)		toupper(c)
 #	define Tcl_UniCharToTitle(c)		toupper(c)
@@ -238,13 +236,3 @@ typedef struct Tcl_DString {
 #else
 #   define EXTERN extern TCL_STORAGE_CLASS
 #endif
-
-
-#ifdef REGEX_WCHAR
-EXTERN void		Tcl_DStringFree (Tcl_DString * dsPtr);
-EXTERN void		Tcl_DStringInit (Tcl_DString * dsPtr);
-EXTERN char *		Tcl_UniCharToUtfDString (CONST Tcl_UniChar * uniStr, 
-				int uniLength, Tcl_DString * dsPtr);
-EXTERN void		Tcl_DStringSetLength (Tcl_DString * dsPtr, 
-				int length);
-#endif		/* REGEX_WCHAR	*/
diff --git a/regc_color.c b/regc_color.c
index 7a98dcb..c1d4b21 100644
--- a/regc_color.c
+++ b/regc_color.c
@@ -157,7 +157,7 @@ setcolor(
     pchr c,
     pcolor co)
 {
-    uchr uc = c;
+    pchr uc = c;
     int shift;
     int level;
     int b;
@@ -433,7 +433,7 @@ subrange(
     struct state *lp,
     struct state *rp)
 {
-    uchr uf;
+    pchr uf;
     int i;
 
     assert(from <= to);
@@ -442,8 +442,8 @@ subrange(
      * First, align "from" on a tree-block boundary
      */
 
-    uf = (uchr) from;
-    i = (int) (((uf + BYTTAB - 1) & (uchr) ~BYTMASK) - uf);
+    uf = from;
+    i = (int) (((uf + BYTTAB - 1) & (pchr) ~BYTMASK) - uf);
     for (; from<=to && i>0; i--, from++) {
 	newarc(v->nfa, PLAIN, subcolor(v->cm, from), lp, rp);
     }
@@ -479,7 +479,7 @@ subblock(
     struct state *lp,
     struct state *rp)
 {
-    uchr uc = start;
+    pchr uc = start;
     struct colormap *cm = v->cm;
     int shift;
     int level;
@@ -751,7 +751,7 @@ dumpcolors(
     struct colordesc *cd;
     struct colordesc *end;
     color co;
-    chr c;
+    uchr c;
     char *has;
 
     fprintf(f, "max %ld\n", (long) cm->max);
diff --git a/regc_cvec.c b/regc_cvec.c
index 0247521..b9fba9d 100644
--- a/regc_cvec.c
+++ b/regc_cvec.c
@@ -44,14 +44,14 @@ newcvec(
     int nranges)		/* ... and this many ranges... */
 {
     size_t nc = (size_t)nchrs + (size_t)nranges*2;
-    size_t n = sizeof(struct cvec) + nc*sizeof(chr);
+    size_t n = sizeof(struct cvec) + nc*sizeof(pchr);
     struct cvec *cv = (struct cvec *) MALLOC(n);
 
     if (cv == NULL) {
 	return NULL;
     }
     cv->chrspace = nchrs;
-    cv->chrs = (chr *)(((char *)cv)+sizeof(struct cvec));
+    cv->chrs = (pchr *)(((char *)cv)+sizeof(struct cvec));
     cv->ranges = cv->chrs + nchrs;
     cv->rangespace = nranges;
     return clearcvec(cv);
@@ -81,7 +81,7 @@ addchr(
     struct cvec *cv,		/* character vector */
     pchr c)			/* character to add */
 {
-    cv->chrs[cv->nchrs++] = (chr)c;
+    cv->chrs[cv->nchrs++] = c;
 }
 
 /*
@@ -95,8 +95,8 @@ addrange(
     pchr to)			/* last character of range */
 {
     assert(cv->nranges < cv->rangespace);
-    cv->ranges[cv->nranges*2] = (chr)from;
-    cv->ranges[cv->nranges*2 + 1] = (chr)to;
+    cv->ranges[cv->nranges*2] = from;
+    cv->ranges[cv->nranges*2 + 1] = to;
     cv->nranges++;
 }
 
diff --git a/regc_lex.c b/regc_lex.c
index 4be02c6..ae71884 100644
--- a/regc_lex.c
+++ b/regc_lex.c
@@ -32,6 +32,7 @@
 /* scanning macros (know about v) */
 #define	ATEOS()		(v->now >= v->stop)
 #define	HAVE(n)		(v->stop - v->now >= (n))
+/* these compare single bytes, so they only work for ASCII characters */
 #define	NEXT1(c)	(!ATEOS() && *v->now == CHR(c))
 #define	NEXT2(a,b)	(HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
 #define	NEXT3(a,b,c) \
@@ -45,6 +46,31 @@
 #define	FAILW(e)	return (ERR(e), 0)	/* ERR does SET(EOS) */
 #define	LASTTYPE(t)	(v->lasttype == (t))
 
+/*
+ * SKIPCHR(x) - store the character at v->now in x and advance v->now past it.
+ * Call sites are expected to have ruled out ATEOS() beforehand, since at
+ * least one byte is always read.
+ * In REGEX_UTF8 builds the character is decoded with mbtowc(), i.e. according
+ * to the multibyte encoding of the current locale. mbtowc() returns 0 for a
+ * NUL and -1 for an invalid or truncated sequence; in both cases exactly one
+ * byte is consumed and its raw value is stored in x, so that the scanner
+ * always makes progress and v->now never moves backwards.
+ */
+#ifdef REGEX_UTF8
+#define SKIPCHR(x)	do { \
+	wchar_t skipchr_wc; \
+	int skipchr_len = mbtowc(&skipchr_wc, (const char *)v->now, v->stop - v->now); \
+	if (skipchr_len > 0) { \
+	    (x) = skipchr_wc; \
+	    v->now += skipchr_len; \
+	} else { \
+	    (void)mbtowc(NULL, NULL, 0);	/* reset conversion state after a failure */ \
+	    (x) = *v->now++; \
+	} \
+} while (0)
+#else
+#define SKIPCHR(x)	do (x) = *v->now++; while (0)
+#endif
+
 /* lexical contexts */
 #define	L_ERE	1	/* mainline ERE/ARE */
 #define	L_BRE	2	/* mainline BRE */
@@ -292,7 +304,7 @@ static int			/* 1 normal, 0 failure */
 next(
     struct vars *v)
 {
-    chr c;
+    pchr c;
 
     /*
      * Errors yield an infinite sequence of failures.
@@ -371,7 +383,7 @@ next(
      * Okay, time to actually get a character.
      */
 
-    c = *v->now++;
+    SKIPCHR(c);
 
     /*
      * Deal with the easy contexts, punt EREs to code below.
@@ -697,11 +709,12 @@ next(
 
     assert(!ATEOS());
     if (!(v->cflags&REG_ADVF)) {/* only AREs have non-trivial escapes */
-	if (iscalnum(*v->now)) {
+	SKIPCHR(c);
+	if (iscalnum(c)) {
 	    NOTE(REG_UBSALNUM);
 	    NOTE(REG_UUNSPEC);
 	}
-	RETV(PLAIN, *v->now++);
+	RETV(PLAIN, c);
     }
     (DISCARD)lexescape(v);
     if (ISERR()) {
@@ -741,7 +754,7 @@ static int			/* not actually used, but convenient for RETV */
 lexescape(
     struct vars *v)
 {
-    chr c;
+    pchr c;
     static chr alert[] = {
 	CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
     };
@@ -753,7 +766,7 @@ lexescape(
     assert(v->cflags&REG_ADVF);
 
     assert(!ATEOS());
-    c = *v->now++;
+    SKIPCHR(c);
     if (!iscalnum(c)) {
 	RETV(PLAIN, c);
     }
@@ -777,7 +790,8 @@ lexescape(
 	if (ATEOS()) {
 	    FAILW(REG_EESCAPE);
 	}
-	RETV(PLAIN, (chr)(*v->now++ & 037));
+	SKIPCHR(c);
+	RETV(PLAIN, c & 037);
 	break;
     case CHR('d'):
 	NOTE(REG_ULOCALE);
@@ -911,6 +925,8 @@ lexescape(
  - lexdigits - slurp up digits and return chr value
  ^ static chr lexdigits(struct vars *, int, int, int);
  */
+// FIXME: Perhaps directly return unsigned int.
+// Why should we be restricted to 0-255?
 static chr			/* chr value; errors signalled via ERR */
 lexdigits(
     struct vars *v,
@@ -972,7 +988,7 @@ brenext(
     struct vars *v,
     pchr pc)
 {
-    chr c = (chr)pc;
+    pchr c = pc;
 
     switch (c) {
     case CHR('*'):
@@ -1039,7 +1055,7 @@ brenext(
 	FAILW(REG_EESCAPE);
     }
 
-    c = *v->now++;
+    SKIPCHR(c);
     switch (c) {
     case CHR('{'):
 	INTOCON(L_BBND);
@@ -1147,7 +1163,7 @@ ch(void)
  * use that it hardly matters.
  ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr);
  */
-static chr
+static pchr
 chrnamed(
     struct vars *v,
     const chr *startp,		/* start of name */
@@ -1166,12 +1182,12 @@ chrnamed(
     v->err = errsave;
 
     if (e != 0) {
-	return (chr)lastresort;
+	return (pchr)lastresort;
     }
 
     cv = range(v, c, c, 0);
     if (cv->nchrs == 0) {
-	return (chr)lastresort;
+	return (pchr)lastresort;
     }
     return cv->chrs[0];
 }
diff --git a/regc_locale.c b/regc_locale.c
index a6bc3af..97aa702 100644
--- a/regc_locale.c
+++ b/regc_locale.c
@@ -120,12 +120,16 @@ static const struct cname {
  * Unicode character-class tables.
  */
 
+// FIXME: Perhaps define a new type here, similar to the
+// original chr, so we don't waste space on the tables
+// in ASCII (non-UTF-8) builds.
+// Or perhaps pchr should just be like chr in the original implementation.
 typedef struct crange {
-    chr start;
-    chr end;
+    pchr start;
+    pchr end;
 } crange;
 
-#if defined(REGEX_STANDALONE) && ! defined(REGEX_WCHAR)
+#if defined(REGEX_STANDALONE) && ! defined(REGEX_UTF8)
 
 static const crange alphaRangeTable[] = {
     {0x41, 0x5a}, {0x61, 0x7a}
@@ -133,10 +137,10 @@ static const crange alphaRangeTable[] = {
 
 #define NUM_ALPHA_RANGE (sizeof(alphaRangeTable)/sizeof(crange))
 
-static const chr alphaCharTable[] = {
+static const pchr alphaCharTable[] = {
 };
 
-#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(chr))
+#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(pchr))
 
 static const crange digitRangeTable[] = {
     {0x30, 0x39}
@@ -150,11 +154,11 @@ static const crange punctRangeTable[] = {
 
 #define NUM_PUNCT_RANGE (sizeof(punctRangeTable)/sizeof(crange))
 
-static const chr punctCharTable[] = {
+static const pchr punctCharTable[] = {
     0x3a, 0x3b, 0x3f, 0x40, 0x5f, 0x7b, 0x7d
 };
 
-#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(chr))
+#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(pchr))
 
 static const crange spaceRangeTable[] = {
     {0x09, 0x0d}
@@ -162,11 +166,11 @@ static const crange spaceRangeTable[] = {
 
 #define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange))
 
-static const chr spaceCharTable[] = {
+static const pchr spaceCharTable[] = {
     0x20
 };
 
-#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr))
+#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(pchr))
 
 static const crange lowerRangeTable[] = {
     {0x61, 0x7a}
@@ -174,10 +178,10 @@ static const crange lowerRangeTable[] = {
 
 #define NUM_LOWER_RANGE (sizeof(lowerRangeTable)/sizeof(crange))
 
-static const chr lowerCharTable[] = {
+static const pchr lowerCharTable[] = {
 };
 
-#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(chr))
+#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(pchr))
 
 static const crange upperRangeTable[] = {
     {0x41, 0x5a}
@@ -185,10 +189,10 @@ static const crange upperRangeTable[] = {
 
 #define NUM_UPPER_RANGE (sizeof(upperRangeTable)/sizeof(crange))
 
-static const chr upperCharTable[] = {
+static const pchr upperCharTable[] = {
 };
 
-#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(chr))
+#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(pchr))
 
 static const crange graphRangeTable[] = {
     {0x21, 0x7e}
@@ -196,10 +200,10 @@ static const crange graphRangeTable[] = {
 
 #define NUM_GRAPH_RANGE (sizeof(graphRangeTable)/sizeof(crange))
 
-static const chr graphCharTable[] = {
+static const pchr graphCharTable[] = {
 };
 
-#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(chr))
+#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(pchr))
 
 static const crange printRangeTable[] = {
     {0x20, 0x7E}
@@ -207,10 +211,10 @@ static const crange printRangeTable[] = {
 
 #define NUM_PRINT_RANGE (sizeof(printRangeTable)/sizeof(crange))
 
-static const chr printCharTable[] = {
+static const pchr printCharTable[] = {
 };
 
-#define NUM_PRINT_CHAR (sizeof(printCharTable)/sizeof(chr))
+#define NUM_PRINT_CHAR (sizeof(printCharTable)/sizeof(pchr))
 #else
 
 /*
@@ -269,7 +273,7 @@ static const crange alphaRangeTable[] = {
 
 #define NUM_ALPHA_RANGE (sizeof(alphaRangeTable)/sizeof(crange))
 
-static const chr alphaCharTable[] = {
+static const pchr alphaCharTable[] = {
     0x00aa, 0x00b5, 0x00ba, 0x02d0, 0x02d1, 0x02ee, 0x037a, 0x0386, 0x038c,
     0x04c7, 0x04c8, 0x04cb, 0x04cc, 0x04f8, 0x04f9, 0x0559, 0x06d5, 0x06e5,
     0x06e6, 0x0710, 0x093d, 0x0950, 0x098f, 0x0990, 0x09b2, 0x09dc, 0x09dd,
@@ -285,7 +289,7 @@ static const chr alphaCharTable[] = {
     0x309e, 0xfb1d, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44, 0xfe74, 0xfffe
 };
 
-#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(chr))
+#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(pchr))
 
 /*
  * Unicode: decimal digit characters
@@ -321,7 +325,7 @@ static const crange punctRangeTable[] = {
 
 #define NUM_PUNCT_RANGE (sizeof(punctRangeTable)/sizeof(crange))
 
-static const chr punctCharTable[] = {
+static const pchr punctCharTable[] = {
     0x003a, 0x003b, 0x003f, 0x0040, 0x005f, 0x007b, 0x007d, 0x00a1, 0x00ab,
     0x00ad, 0x00b7, 0x00bb, 0x00bf, 0x037e, 0x0387, 0x0589, 0x058a, 0x05be,
     0x05c0, 0x05c3, 0x05f3, 0x05f4, 0x060c, 0x061b, 0x061f, 0x06d4, 0x0964,
@@ -331,7 +335,7 @@ static const chr punctCharTable[] = {
     0xfe6a, 0xfe6b, 0xff1a, 0xff1b, 0xff1f, 0xff20, 0xff3f, 0xff5b, 0xff5d
 };
 
-#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(chr))
+#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(pchr))
 
 /*
  * Unicode: white space characters.
@@ -343,11 +347,11 @@ static const crange spaceRangeTable[] = {
 
 #define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange))
 
-static const chr spaceCharTable[] = {
+static const pchr spaceCharTable[] = {
     0x0020, 0x00a0, 0x1680, 0x2028, 0x2029, 0x202f, 0x3000
 };
 
-#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr))
+#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(pchr))
 
 /*
  * Unicode: lowercase characters
@@ -366,7 +370,7 @@ static const crange lowerRangeTable[] = {
 
 #define NUM_LOWER_RANGE (sizeof(lowerRangeTable)/sizeof(crange))
 
-static const chr lowerCharTable[] = {
+static const pchr lowerCharTable[] = {
     0x00aa, 0x00b5, 0x00ba, 0x0101, 0x0103, 0x0105, 0x0107, 0x0109, 0x010b,
     0x010d, 0x010f, 0x0111, 0x0113, 0x0115, 0x0117, 0x0119, 0x011b, 0x011d,
     0x011f, 0x0121, 0x0123, 0x0125, 0x0127, 0x0129, 0x012b, 0x012d, 0x012f,
@@ -409,7 +413,7 @@ static const chr lowerCharTable[] = {
     0x210f, 0x2113, 0x212f, 0x2134, 0x2139
 };
 
-#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(chr))
+#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(pchr))
 
 /*
  * Unicode: uppercase characters.
@@ -428,7 +432,7 @@ static const crange upperRangeTable[] = {
 
 #define NUM_UPPER_RANGE (sizeof(upperRangeTable)/sizeof(crange))
 
-static const chr upperCharTable[] = {
+static const pchr upperCharTable[] = {
     0x0100, 0x0102, 0x0104, 0x0106, 0x0108, 0x010a, 0x010c, 0x010e, 0x0110,
     0x0112, 0x0114, 0x0116, 0x0118, 0x011a, 0x011c, 0x011e, 0x0120, 0x0122,
     0x0124, 0x0126, 0x0128, 0x012a, 0x012c, 0x012e, 0x0130, 0x0132, 0x0134,
@@ -471,7 +475,7 @@ static const chr upperCharTable[] = {
     0x2131, 0x2133
 };
 
-#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(chr))
+#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(pchr))
 
 /*
  * Unicode: unicode print characters excluding space.
@@ -599,7 +603,7 @@ static const crange graphRangeTable[] = {
 
 #define NUM_GRAPH_RANGE (sizeof(graphRangeTable)/sizeof(crange))
 
-static const chr graphCharTable[] = {
+static const pchr graphCharTable[] = {
     0x0374, 0x0375, 0x037a, 0x037e, 0x038c, 0x0488, 0x0489, 0x04c7, 0x04c8,
     0x04cb, 0x04cc, 0x04f8, 0x04f9, 0x0589, 0x058a, 0x060c, 0x061b, 0x061f,
     0x098f, 0x0990, 0x09b2, 0x09bc, 0x09c7, 0x09c8, 0x09d7, 0x09dc, 0x09dd,
@@ -616,7 +620,7 @@ static const chr graphCharTable[] = {
     0x303e, 0x303f, 0xa4c6, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44, 0xfe74
 };
 
-#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(chr))
+#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(pchr))
 
 /*
  * Unicode: unicode print characters including space, i.e. all Letters (class
@@ -681,7 +685,7 @@ static const crange printRangeTable[] = {
 
 #define NUM_PRINT_RANGE (sizeof(printRangeTable)/sizeof(crange))
 
-static const chr printCharTable[] = {
+static const pchr printCharTable[] = {
     0x037A, 0x037E, 0x038C, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x0589, 0x05BE,
     0x05C0, 0x05C3, 0x060C, 0x061B, 0x061F, 0x06E9, 0x093D, 0x0950, 0x09B2,
     0x0A5E, 0x0A8D, 0x0ABD, 0x0AD0, 0x0AE0, 0x0B3D, 0x0B9C, 0x0CDE, 0x0E01,
@@ -690,7 +694,7 @@ static const chr printCharTable[] = {
     0x2070, 0x2300, 0x274D, 0x2756, 0x303F, 0xFB3E, 0xFE74
 };
 
-#define NUM_PRINT_CHAR (sizeof(printCharTable)/sizeof(chr))
+#define NUM_PRINT_CHAR (sizeof(printCharTable)/sizeof(pchr))
 #endif
 
 /*
@@ -720,9 +724,17 @@ element(
 
     assert(startp < endp);
     len = endp - startp;
+#ifdef REGEX_UTF8
+    wchar_t c;
+    if (mbtowc(&c, (const char *)startp, len) == len) {
+	// single character
+	return c;
+    }
+#else
     if (len == 1) {
 	return *startp;
     }
+#endif
 
     NOTE(REG_ULOCALE);
 
@@ -790,9 +802,9 @@ range(
 
     for (c=a; c<=b; c++) {
 	addchr(cv, c);
-	lc = Tcl_UniCharToLower((chr)c);
-	uc = Tcl_UniCharToUpper((chr)c);
-	tc = Tcl_UniCharToTitle((chr)c);
+	lc = Tcl_UniCharToLower(c);
+	uc = Tcl_UniCharToUpper(c);
+	tc = Tcl_UniCharToTitle(c);
 	if (c != lc) {
 	    addchr(cv, lc);
 	}
@@ -859,7 +871,7 @@ eclass(
     }
     cv = getcvec(v, 1, 0);
     assert(cv != NULL);
-    addchr(cv, (chr)c);
+    addchr(cv, c);
     return cv;
 }
 
@@ -1097,12 +1109,12 @@ allcases(
     pchr pc)			/* character to get case equivs of */
 {
     struct cvec *cv;
-    chr c = (chr)pc;
+    pchr c = pc;
     chr lc, uc, tc;
 
-    lc = Tcl_UniCharToLower((chr)c);
-    uc = Tcl_UniCharToUpper((chr)c);
-    tc = Tcl_UniCharToTitle((chr)c);
+    lc = Tcl_UniCharToLower(c);
+    uc = Tcl_UniCharToUpper(c);
+    tc = Tcl_UniCharToTitle(c);
 
     if (tc != uc) {
 	cv = getcvec(v, 3, 0);
@@ -1147,6 +1159,7 @@ casecmp(
     size_t len)			/* exact length of comparison */
 {
     for (; len > 0; len--, x++, y++) {
+	// FIXME: Will fail if REGEX_UTF8.
 	if ((*x!=*y) && (Tcl_UniCharToLower(*x) != Tcl_UniCharToLower(*y))) {
 	    return 1;
 	}
diff --git a/regcomp.c b/regcomp.c
index 8ff77ad..c00e19e 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -86,7 +86,7 @@ static chr newline(NOPARMS);
 #ifdef REG_DEBUG
 static const chr *ch(NOPARMS);
 #endif
-static chr chrnamed(struct vars *, const chr *, const chr *, pchr);
+static pchr chrnamed(struct vars *, const chr *, const chr *, pchr);
 /* === regc_color.c === */
 static void initcm(struct vars *, struct colormap *);
 static void freecm(struct colormap *);
@@ -193,7 +193,7 @@ struct vars {
     int cflags;			/* copy of compile flags */
     int lasttype;		/* type of previous token */
     int nexttype;		/* type of next token */
-    chr nextvalue;		/* value (if any) of next token */
+    pchr nextvalue;		/* value (if any) of next token */
     int lexcon;			/* lexical context type (see lex.c) */
     int nsubexp;		/* subexpression count */
     struct subre **subs;	/* subRE pointer vector */
@@ -229,6 +229,12 @@ struct vars {
 #define	NOTE(b)	(v->re->re_info |= (b))		/* note visible condition */
 #define	EMPTYARC(x, y)	newarc(v->nfa, EMPTY, 0, x, y)
 
+#ifdef REGEX_UTF8
+#define DECODECHR(buf, c) wctomb((char *)buf, c)
+#else
+#define DECODECHR(buf, c) ((buf)[0] = (c), 1)
+#endif
+
 /* token type codes, some also used as NFA arc types */
 #define	EMPTY	'n'		/* no token present */
 #define	EOS	'e'		/* end of string */
@@ -1458,7 +1464,9 @@ brackpart(
     celt startc, endc;
     struct cvec *cv;
     const chr *startp, *endp;
-    chr c[1];
+    chr buf[MB_LEN_MAX];
+    size_t buf_len;
+    pchr c;
 
     /*
      * Parse something, get rid of special cases, take shortcuts.
@@ -1470,7 +1478,7 @@ brackpart(
 	return;
 	break;
     case PLAIN:
-	c[0] = v->nextvalue;
+	c = v->nextvalue;
 	NEXT();
 
 	/*
@@ -1478,10 +1486,11 @@ brackpart(
 	 */
 
 	if (!SEE(RANGE)) {
-	    onechr(v, c[0], lp, rp);
+	    onechr(v, c, lp, rp);
 	    return;
 	}
-	startc = element(v, c, c+1);
+	buf_len = DECODECHR(buf, c);
+	startc = element(v, buf, buf+buf_len);
 	NOERR();
 	break;
     case COLLEL:
@@ -1525,9 +1534,9 @@ brackpart(
 	switch (v->nexttype) {
 	case PLAIN:
 	case RANGE:
-	    c[0] = v->nextvalue;
+	    buf_len = DECODECHR(buf, v->nextvalue);
 	    NEXT();
-	    endc = element(v, c, c+1);
+	    endc = element(v, buf, buf+buf_len);
 	    NOERR();
 	    break;
 	case COLLEL:
@@ -1623,8 +1632,8 @@ dovec(
     struct state *lp,
     struct state *rp)
 {
-    chr ch, from, to;
-    const chr *p;
+    pchr ch, from, to;
+    const pchr *p;
     int i;
 
     for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
diff --git a/regcustom.h b/regcustom.h
index c341c23..1b25fed 100644
--- a/regcustom.h
+++ b/regcustom.h
@@ -99,9 +99,9 @@
 #ifndef REGEX_STANDALONE
 typedef Tcl_UniChar chr;	/* The type itself. */
 #endif
-typedef int pchr;		/* What it promotes to. */
+typedef uint32_t pchr;		/* What it promotes to (holds 8-bit or Unicode char). */
 typedef unsigned uchr;		/* Unsigned type that will hold a chr. */
-typedef int celt;		/* Type to hold chr, or NOCELT */
+typedef int32_t celt;		/* Type to hold chr, or NOCELT */
 #define	NOCELT (-1)		/* Celt value which is not valid chr */
 #define	CHR(c) (UCHAR(c))	/* Turn char literal into chr literal */
 #define	DIGITVAL(c) ((c)-'0')	/* Turn chr digit into its value */
@@ -109,7 +109,7 @@ typedef int celt;		/* Type to hold chr, or NOCELT */
 #define	CHRBITS	32		/* Bits in a chr; must not use sizeof */
 #define	CHR_MIN	0x00000000	/* Smallest and largest chr; the value */
 #define	CHR_MAX	0xffffffff	/* CHR_MAX-CHR_MIN+1 should fit in uchr */
-#elif defined(REGEX_STANDALONE) && ! defined(REGEX_WCHAR)
+#elif defined(REGEX_STANDALONE) && ! defined(REGEX_UTF8)
 #	define CHRBITS	8
 #	define CHR_MIN	0x00
 #	define CHR_MAX	0xff
@@ -133,9 +133,9 @@ typedef int celt;		/* Type to hold chr, or NOCELT */
  */
 
 #ifdef REGEX_STANDALONE
-#	ifdef REGEX_WCHAR
-#		define compile		re_wcomp
-#		define exec		re_wexec
+#	ifdef REGEX_UTF8
+#		define compile		re_ucomp
+#		define exec		re_uexec
 #		define __REG_NOCHAR
 #	else
 #		define compile		re_comp
diff --git a/rege_dfa.c b/rege_dfa.c
index fbeae20..a2f3a28 100644
--- a/rege_dfa.c
+++ b/rege_dfa.c
@@ -70,8 +70,9 @@ longest(
 	co = d->cnfa->bos[(v->eflags&REG_NOTBOL) ? 0 : 1];
 	FDEBUG(("color %ld\n", (long)co));
     } else {
-	co = GETCOLOR(cm, *(cp - 1));
-	FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co));
+	pchr c = getchr(prevchr(cp), stop);
+	co = GETCOLOR(cm, c);
+	FDEBUG(("char %c (%u), color %ld\n", (char)c, c, (long)co));
     }
     css = miss(v, d, css, co, cp, start);
     if (css == NULL) {
@@ -86,30 +87,32 @@ longest(
     if (v->eflags&REG_FTRACE) {
 	while (cp < realstop) {
 	    FDEBUG(("+++ at c%d +++\n", css - d->ssets));
-	    co = GETCOLOR(cm, *cp);
-	    FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co));
+	    pchr c = getchr(cp, stop);
+	    co = GETCOLOR(cm, c);
+	    FDEBUG(("char %c (%u), color %ld\n", (char)c, c, (long)co));
 	    ss = css->outs[co];
 	    if (ss == NULL) {
-		ss = miss(v, d, css, co, cp+1, start);
+		ss = miss(v, d, css, co, nextchr(cp), start);
 		if (ss == NULL) {
 		    break;	/* NOTE BREAK OUT */
 		}
 	    }
-	    cp++;
+	    cp = nextchr(cp);
 	    ss->lastseen = cp;
 	    css = ss;
 	}
     } else {
 	while (cp < realstop) {
-	    co = GETCOLOR(cm, *cp);
+	    pchr c = getchr(cp, stop);
+	    co = GETCOLOR(cm, c);
 	    ss = css->outs[co];
 	    if (ss == NULL) {
-		ss = miss(v, d, css, co, cp+1, start);
+		ss = miss(v, d, css, co, nextchr(cp), start);
 		if (ss == NULL) {
 		    break;	/* NOTE BREAK OUT */
 		}
 	    }
-	    cp++;
+	    cp = nextchr(cp);
 	    ss->lastseen = cp;
 	    css = ss;
 	}
@@ -151,7 +154,7 @@ longest(
 	}
     }
     if (post != NULL) {		/* found one */
-	return post - 1;
+	return prevchr(post);
     }
 
     return NULL;
@@ -199,8 +202,9 @@ shortest(
 	co = d->cnfa->bos[(v->eflags&REG_NOTBOL) ? 0 : 1];
 	FDEBUG(("color %ld\n", (long)co));
     } else {
-	co = GETCOLOR(cm, *(cp - 1));
-	FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co));
+	pchr c = getchr(prevchr(cp), max);
+	co = GETCOLOR(cm, c);
+	FDEBUG(("char %c (%u), color %ld\n", (char)c, c, (long)co));
     }
     css = miss(v, d, css, co, cp, start);
     if (css == NULL) {
@@ -216,16 +220,17 @@ shortest(
     if (v->eflags&REG_FTRACE) {
 	while (cp < realmax) {
 	    FDEBUG(("--- at c%d ---\n", css - d->ssets));
-	    co = GETCOLOR(cm, *cp);
-	    FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co));
+	    pchr c = getchr(cp, max);
+	    co = GETCOLOR(cm, c);
+	    FDEBUG(("char %c (%u), color %ld\n", (char)c, c, (long)co));
 	    ss = css->outs[co];
 	    if (ss == NULL) {
-		ss = miss(v, d, css, co, cp+1, start);
+		ss = miss(v, d, css, co, nextchr(cp), start);
 		if (ss == NULL) {
 		    break;	/* NOTE BREAK OUT */
 		}
 	    }
-	    cp++;
+	    cp = nextchr(cp);
 	    ss->lastseen = cp;
 	    css = ss;
 	    if ((ss->flags&POSTSTATE) && cp >= realmin) {
@@ -234,15 +239,16 @@ shortest(
 	}
     } else {
 	while (cp < realmax) {
-	    co = GETCOLOR(cm, *cp);
+	    pchr c = getchr(cp, max);
+	    co = GETCOLOR(cm, c);
 	    ss = css->outs[co];
 	    if (ss == NULL) {
-		ss = miss(v, d, css, co, cp+1, start);
+		ss = miss(v, d, css, co, nextchr(cp), start);
 		if (ss == NULL) {
 		    break;	/* NOTE BREAK OUT */
 		}
 	    }
-	    cp++;
+	    cp = nextchr(cp);
 	    ss->lastseen = cp;
 	    css = ss;
 	    if ((ss->flags&POSTSTATE) && cp >= realmin) {
@@ -261,7 +267,7 @@ shortest(
 
     if ((ss->flags&POSTSTATE) && cp > min) {
 	assert(cp >= realmin);
-	cp--;
+	cp = prevchr(cp);
     } else if (cp == v->stop && max == v->stop) {
 	co = d->cnfa->eos[(v->eflags&REG_NOTEOL) ? 0 : 1];
 	FDEBUG(("color %ld\n", (long)co));
@@ -775,6 +781,7 @@ pickss(
      * Look for oldest, or old enough anyway.
      */
 
+    // FIXME: is this safe if REGEX_UTF8? cp - start is then a distance in bytes, not in characters.
     if (cp - start > d->nssets*2/3) {	/* oldest 33% are expendable */
 	ancient = cp - d->nssets*2/3;
     } else {
diff --git a/regex.h b/regex.h
index 2ef538a..1e32b18 100644
--- a/regex.h
+++ b/regex.h
@@ -119,13 +119,14 @@ extern "C" {
 #	undef		regerror
 #	define regfree	re_free
 #	define regerror	re_error
+// FIXME
 #	undef __REG_WIDE_T
-#	define __REG_WIDE_T		wchar_t
+#	define __REG_WIDE_T		unsigned char
 #	undef __REG_WIDE_COMPILE
-#	define __REG_WIDE_COMPILE	re_wcomp
+#	define __REG_WIDE_COMPILE	re_ucomp
 #	undef __REG_WIDE_EXEC
-#	define __REG_WIDE_EXEC		re_wexec
-#	ifndef REGEX_WCHAR
+#	define __REG_WIDE_EXEC		re_uexec
+#	ifndef REGEX_UTF8
 #		undef __REG_NOCHAR
 #	endif
 #endif
diff --git a/regexec.c b/regexec.c
index 24edb41..2f8a234 100644
--- a/regexec.c
+++ b/regexec.c
@@ -155,6 +155,94 @@ static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
 /* automatically gathered by fwd; do not hand-edit */
 /* =====^!^===== end forwards =====^!^===== */
 
+#ifdef REGEX_UTF8
+
+/*
+ * Length in bytes of the UTF-8 sequence announced by its lead byte.
+ * ASCII bytes and invalid lead bytes (including stray continuation bytes)
+ * count as a single byte.
+ */
+static inline int
+utf8_seqlen(unsigned char lead)
+{
+    if ((lead & 0xE0) == 0xC0) /* 110xxxxx */
+	return 2;
+    if ((lead & 0xF0) == 0xE0) /* 1110xxxx */
+	return 3;
+    if ((lead & 0xF8) == 0xF0) /* 11110xxx */
+	return 4;
+
+    return 1;
+}
+
+/*
+ * Return a pointer just past the character starting at s. The length comes
+ * from the lead byte alone; the end of the input is not checked here.
+ */
+static inline chr *
+nextchr(chr *s)
+{
+    return s + utf8_seqlen((unsigned char)*s);
+}
+
+/*
+ * Return a pointer to the character preceding s: step back one byte, then
+ * over any continuation bytes (10xxxxxx). The start of the input is not
+ * checked here.
+ */
+static inline chr *
+prevchr(chr *s)
+{
+    do {
+	--s;
+    } while (((unsigned char)*s & 0xC0) == 0x80);
+
+    return s;
+}
+
+/*
+ * Decode the UTF-8 character starting at s without reading at or beyond end.
+ * Decoding is done by hand so that the result does not depend on the current
+ * locale. Returns 0 when no input is left or when the sequence is truncated,
+ * malformed, overlong, a surrogate or above U+10FFFF.
+ */
+static inline pchr
+getchr(const chr *s, const chr *end)
+{
+    /* smallest code point that needs a sequence of the given length */
+    static const pchr minval[] = { 0, 0, 0x80, 0x800, 0x10000 };
+    pchr c;
+    int len, i;
+
+    if (s >= end)
+	return 0;
+
+    len = utf8_seqlen((unsigned char)*s);
+    if (len == 1)
+	return ((unsigned char)*s < 0x80) ? *s : 0;
+    if (end - s < len)
+	return 0;
+
+    c = *s & (0x7F >> len);	/* payload bits of the lead byte */
+    for (i = 1; i < len; i++) {
+	if ((s[i] & 0xC0) != 0x80)
+	    return 0;
+	c = (c << 6) | (s[i] & 0x3F);
+    }
+    if (c < minval[len] || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF))
+	return 0;
+
+    return c;
+}
+
+#else /* !REGEX_UTF8 */
+
+static inline chr *nextchr(chr *s) { return s+1; }
+static inline chr *prevchr(chr *s) { return s-1; }
+static inline pchr getchr(const chr *s, const chr *end) { return *s; }
+
+#endif
+
 /*
  - exec - match regular expression
  ^ int exec(regex_t *, const chr *, size_t, rm_detail_t *,
@@ -353,7 +441,7 @@ find(
     d = newdfa(v, cnfa, cm, &v->dfa1);
     assert(!(ISERR() && d != NULL));
     NOERR();
-    for (begin = open; begin <= close; begin++) {
+    for (begin = open; begin <= close; begin = nextchr(begin)) {
 	MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
 	if (shorter) {
 	    end = shortest(v, d, begin, begin, v->stop, NULL, &hitend);
@@ -478,7 +566,7 @@ cfindloop(
 	open = cold;
 	cold = NULL;
 	MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
-	for (begin = open; begin <= close; begin++) {
+	for (begin = open; begin <= close; begin = nextchr(begin)) {
 	    MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
 	    estart = begin;
 	    estop = v->stop;
@@ -525,9 +613,9 @@ cfindloop(
 		 */
 
 		if (shorter) {
-		    estart = end + 1;
+		    estart = nextchr(end);
 		} else {
-		    estop = end - 1;
+		    estop = prevchr(end);
 		}
 	    }
 	}
diff --git a/regguts.h b/regguts.h
index 67e3d03..93e01a6 100644
--- a/regguts.h
+++ b/regguts.h
@@ -241,10 +241,10 @@ struct colormap {
 struct cvec {
     int nchrs;			/* number of chrs */
     int chrspace;		/* number of chrs possible */
-    chr *chrs;			/* pointer to vector of chrs */
+    pchr *chrs;			/* pointer to vector of chrs */
     int nranges;		/* number of ranges (chr pairs) */
     int rangespace;		/* number of chrs possible */
-    chr *ranges;		/* pointer to vector of chr pairs */
+    pchr *ranges;		/* pointer to vector of chr pairs */
 };
 
 /*
diff --git a/regtest_hsrex.sh b/regtest_hsrex.sh
index 0950c04..566a9f3 100755
--- a/regtest_hsrex.sh
+++ b/regtest_hsrex.sh
@@ -11,6 +11,8 @@
 # History:
 #	04/xx/02 (ww)		Version 1.0
 #
+#set -x
+
 H=$HOME
 me=`basename $0`
 rgsrc=regtest_hsrex.c
@@ -71,45 +73,33 @@ cat<<-EOF>$rgsrc
 	#include <string.h>
 	#include "regalone.h"
 	#include "regex.h"
-	#ifdef REGEX_WCHAR
-	#	define chr	wchar_t
-	#	define re_comp	re_wcomp
-	#	define re_exec	re_wexec
-	#else
-	#	define chr	char
+	#ifdef REGEX_UTF8
+	#	define re_comp	re_ucomp
+	#	define re_exec	re_uexec
 	#endif
-	size_t hexescapes2bin(chr *t, char *src, size_t mxlen)
+	size_t hexescapes2bin(unsigned char *t, char *src, size_t mxlen)
 	{
 		char	*s, *xs;
 		size_t	len;
 		s = xs = src;
 		len = 0;
-		while ( s = strstr(s, "\\\x") )
+		while ( (s = strstr(s, "\\\x")) )
 		{
 			int	cbin;
 			sscanf(&s[2], "%2x", &cbin);
-	#		ifdef REGEX_WCHAR
-				*s = '\0';
-				len += mbstowcs(&t[len], xs, mxlen-len);
-	#		else
-				memcpy(&t[len], xs, (size_t ) (s-xs));
-				len += (size_t ) (s-xs);
-	#		endif
+			memcpy(&t[len], xs, (size_t ) (s-xs));
+			len += (size_t ) (s-xs);
 			t[len++] = cbin;
 			s += 4;
 			xs = s;
 		}
-	#	ifdef REGEX_WCHAR
-			len += mbstowcs(&t[len], xs, mxlen-len);
-	#	else
-			strcpy(&t[len], xs);
-			len += strlen(xs);
-	#	endif
+		strcpy((char *)&t[len], xs);
+		len += strlen(xs);
 		return len;
 	}
-	main(int argc, char *argv[])
+	int main(int argc, char *argv[])
 	{
-		chr		re[1024*4], dat[1024*8];
+		unsigned char	re[1024*4], dat[1024*8];
 		size_t		relen, datlen;
 		regex_t		cre;
 		regmatch_t	pmatch[100];
@@ -118,30 +108,30 @@ cat<<-EOF>$rgsrc
 
 		//memset(&cre, '\0', sizeof(cre));
 		nmatch = atoi(argv[1]);
-		relen = hexescapes2bin(re, argv[2], sizeof(re)/sizeof(chr));
-		datlen = hexescapes2bin(dat, argv[3], sizeof(dat)/sizeof(chr));
+		relen = hexescapes2bin(re, argv[2], sizeof(re)/sizeof(char));
+		datlen = hexescapes2bin(dat, argv[3], sizeof(dat)/sizeof(char));
 		cflags = REG_ADVANCED | (nmatch ? 0 : REG_NOSUB);
 		rc = re_comp(&cre, re, relen, cflags);
 		if ( rc != REG_OKAY )
 		{
 			regerror(rc, &cre, buf, sizeof(buf));
 			fprintf(stderr, "Compile error. %s\n", buf);
-			exit(1);
+			return 1;
 		}
 		if ( nmatch >= 0 && cre.re_nsub != nmatch )
 		{
 			fprintf(stderr,
-				"Mismatch on number of group patterns. ",
-				"Expected %d, compiled %d\n",
+				"Mismatch on number of group patterns. "
+				"Expected %d, compiled %zu\n",
 				nmatch, cre.re_nsub);
-			exit(1);
+			return 1;
 		}
 		rc = re_exec(&cre, dat, datlen, NULL, 100, pmatch, 0);
 		if ( rc != REG_OKAY )
 		{
 			regerror(rc, &cre, buf, sizeof(buf));
 			fprintf(stderr, "Execution error. %s\n", buf);
-			exit(1);
+			return 1;
 		}
 		if ( cre.re_nsub )
 		{
@@ -151,21 +141,21 @@ cat<<-EOF>$rgsrc
 			for ( i=1; i<cre.re_nsub+1 && pmatch[i].rm_so>=0; i++ )
 				sprintf(&buf[strlen(buf)], "%s%.*s",
 					i>1 ? ":" : "",
-					pmatch[i].rm_eo-pmatch[i].rm_so,
+					(int)(pmatch[i].rm_eo-pmatch[i].rm_so),
 					argv[3]+pmatch[i].rm_so);
 			printf("%s\n", buf);
 		}
 		regfree(&cre);
-		exit(0);
+		return 0;
 	}
 EOF
 PATH=.:$PATH
 LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH
 export PATH LD_LIBRARY_PATH
 # Either this one
-$CC -I. -I$H/inc -L. -lhsrex -o $rgbin $rgsrc			# Test ascii ch
+#$CC -Wall -g -O0 -I. -I$H/inc -L. -lhsrex -o $rgbin $rgsrc			# Test ascii ch
 # Or this one
-#$CC -I. -I$H/inc -L. -lhswrex -DREGEX_WCHAR -o $rgbin $rgsrc	# Test wide ch
+$CC -Wall -g -O0 -I. -I$H/inc -L. -lhsurex -DREGEX_UTF8 -o $rgbin $rgsrc	# Test UTF-8
 #-----------------------------------
 resp=`$rgbin 0 "clavo" "Pablito clavo un clavito" 2>&1`
 msg="Simple match"
@@ -222,7 +212,7 @@ cat<<-EOF>$datsrc
 	#endif
 	char	nums[] = "0123456789";
 	char	alph[] = "abcdefghijklmnopqrstuvwxyz";
-	main(int argc, char *argv[])
+	int main(int argc, char *argv[])
 	{
 		char	dat[16], *arr;
 		int	arrsz, datsz, i;
@@ -236,6 +226,7 @@ cat<<-EOF>$datsrc
 		for ( i=0; i<datsz; i++ ) dat[i] = arr[ rand()%arrsz ];
 		dat[datsz] = '\0';
 		printf("%s\n", dat);
+		return 0;
 	}
 EOF
 $CC -o $datbin $datsrc
@@ -312,3 +303,8 @@ resp=`$rgbin 1 "(?i)(clavo)" "Pablito ClAvO un clavito" 2>&1`
 msg="One group pattern with case-insensitive matching"
 test "$resp" = "ClAvO" && f_ok "$msg" || f_no "$msg" "$resp"
 #-----------------------------------
+# Will only work if REGEX_UTF8
+resp=`$rgbin 1 '([[:alpha:]]+)' 'абвгд' 2>&1`
+msg="Unicode character class"
+test "$resp" = "абвгд" && f_ok "$msg" || f_no "$msg" "$resp"
+#-----------------------------------
author	Robin Haberkorn <rhaberkorn@fmsbw.de>	2026-06-21 21:42:12 +0200
committer	Robin Haberkorn <rhaberkorn@fmsbw.de>	2026-06-21 22:05:37 +0200
commit	13f5fd77bbc528862f295f9e7196f3ff709d185a (patch)
tree	9f9ce051bc8adf61e5ae2b4e94ccb1331bfdbfa5
parent	10b47c9226b6267e5a4be4e79fe79314bf969025 (diff)
download	terex-13f5fd77bbc528862f295f9e7196f3ff709d185a.tar.gz