Unicode builds now expect UTF-8 strings

* They are built with `-DREGEX_UTF8` instead of `-DREGEX_WCHAR`. For consistency, the functions are now called re_ucomp() and re_uexec(). The library is now called libhsurex.so instead of libhswrex.so.
* The `chr` type is now always `unsigned char`. As a result, many other uses of the `chr` type had to be changed to `pchr` (which is always large enough to hold a byte or a wide character). Generally, we try to keep code changes as small as possible, since we may have to backport changes from the Tcl codebase or contribute patches to the Tcl project.
author: Robin Haberkorn <rhaberkorn@fmsbw.de> 2026-06-21 21:42:12 +0200
committer: Robin Haberkorn <rhaberkorn@fmsbw.de> 2026-06-21 22:05:37 +0200
commit: 13f5fd77bbc528862f295f9e7196f3ff709d185a (patch)
tree: 9f9ce051bc8adf61e5ae2b4e94ccb1331bfdbfa5 /regtest_hsrex.sh
parent: 10b47c9226b6267e5a4be4e79fe79314bf969025 (diff)
download: terex-13f5fd77bbc528862f295f9e7196f3ff709d185a.tar.gz
1 files changed, 31 insertions, 35 deletions
diff --git a/regtest_hsrex.sh b/regtest_hsrex.sh
index 0950c04..566a9f3 100755
--- a/regtest_hsrex.sh
+++ b/regtest_hsrex.sh
@@ -11,6 +11,8 @@
 # History:
 #	04/xx/02 (ww)		Version 1.0
 #
+#set -x
+
 H=$HOME
 me=`basename $0`
 rgsrc=regtest_hsrex.c
@@ -71,45 +73,33 @@ cat<<-EOF>$rgsrc
 	#include <string.h>
 	#include "regalone.h"
 	#include "regex.h"
-	#ifdef REGEX_WCHAR
-	#	define chr	wchar_t
-	#	define re_comp	re_wcomp
-	#	define re_exec	re_wexec
-	#else
-	#	define chr	char
+	#ifdef REGEX_UTF8
+	#	define re_comp	re_ucomp
+	#	define re_exec	re_uexec
 	#endif
-	size_t hexescapes2bin(chr *t, char *src, size_t mxlen)
+	size_t hexescapes2bin(unsigned char *t, char *src, size_t mxlen)
 	{
 		char	*s, *xs;
 		size_t	len;
 		s = xs = src;
 		len = 0;
-		while ( s = strstr(s, "\\\x") )
+		while ( (s = strstr(s, "\\\x")) )
 		{
 			int	cbin;
 			sscanf(&s[2], "%2x", &cbin);
-	#		ifdef REGEX_WCHAR
-				*s = '\0';
-				len += mbstowcs(&t[len], xs, mxlen-len);
-	#		else
-				memcpy(&t[len], xs, (size_t ) (s-xs));
-				len += (size_t ) (s-xs);
-	#		endif
+			memcpy(&t[len], xs, (size_t ) (s-xs));
+			len += (size_t ) (s-xs);
 			t[len++] = cbin;
 			s += 4;
 			xs = s;
 		}
-	#	ifdef REGEX_WCHAR
-			len += mbstowcs(&t[len], xs, mxlen-len);
-	#	else
-			strcpy(&t[len], xs);
-			len += strlen(xs);
-	#	endif
+		strcpy((char *)&t[len], xs);
+		len += strlen(xs);
 		return len;
 	}
-	main(int argc, char *argv[])
+	int main(int argc, char *argv[])
 	{
-		chr		re[1024*4], dat[1024*8];
+		unsigned char	re[1024*4], dat[1024*8];
 		size_t		relen, datlen;
 		regex_t		cre;
 		regmatch_t	pmatch[100];
@@ -118,30 +108,30 @@ cat<<-EOF>$rgsrc
 
 		//memset(&cre, '\0', sizeof(cre));
 		nmatch = atoi(argv[1]);
-		relen = hexescapes2bin(re, argv[2], sizeof(re)/sizeof(chr));
-		datlen = hexescapes2bin(dat, argv[3], sizeof(dat)/sizeof(chr));
+		relen = hexescapes2bin(re, argv[2], sizeof(re)/sizeof(char));
+		datlen = hexescapes2bin(dat, argv[3], sizeof(dat)/sizeof(char));
 		cflags = REG_ADVANCED | (nmatch ? 0 : REG_NOSUB);
 		rc = re_comp(&cre, re, relen, cflags);
 		if ( rc != REG_OKAY )
 		{
 			regerror(rc, &cre, buf, sizeof(buf));
 			fprintf(stderr, "Compile error. %s\n", buf);
-			exit(1);
+			return 1;
 		}
 		if ( nmatch >= 0 && cre.re_nsub != nmatch )
 		{
 			fprintf(stderr,
-				"Mismatch on number of group patterns. ",
-				"Expected %d, compiled %d\n",
+				"Mismatch on number of group patterns. "
+				"Expected %d, compiled %zu\n",
 				nmatch, cre.re_nsub);
-			exit(1);
+			return 1;
 		}
 		rc = re_exec(&cre, dat, datlen, NULL, 100, pmatch, 0);
 		if ( rc != REG_OKAY )
 		{
 			regerror(rc, &cre, buf, sizeof(buf));
 			fprintf(stderr, "Execution error. %s\n", buf);
-			exit(1);
+			return 1;
 		}
 		if ( cre.re_nsub )
 		{
@@ -151,21 +141,21 @@ cat<<-EOF>$rgsrc
 			for ( i=1; i<cre.re_nsub+1 && pmatch[i].rm_so>=0; i++ )
 				sprintf(&buf[strlen(buf)], "%s%.*s",
 					i>1 ? ":" : "",
-					pmatch[i].rm_eo-pmatch[i].rm_so,
+					(int)(pmatch[i].rm_eo-pmatch[i].rm_so),
 					argv[3]+pmatch[i].rm_so);
 			printf("%s\n", buf);
 		}
 		regfree(&cre);
-		exit(0);
+		return 0;
 	}
 EOF
 PATH=.:$PATH
 LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH
 export PATH LD_LIBRARY_PATH
 # Either this one
-$CC -I. -I$H/inc -L. -lhsrex -o $rgbin $rgsrc			# Test ascii ch
+#$CC -Wall -g -O0 -I. -I$H/inc -L. -lhsrex -o $rgbin $rgsrc			# Test ascii ch
 # Or this one
-#$CC -I. -I$H/inc -L. -lhswrex -DREGEX_WCHAR -o $rgbin $rgsrc	# Test wide ch
+$CC -Wall -g -O0 -I. -I$H/inc -L. -lhsurex -DREGEX_UTF8 -o $rgbin $rgsrc	# Test wide ch
 #-----------------------------------
 resp=`$rgbin 0 "clavo" "Pablito clavo un clavito" 2>&1`
 msg="Simple match"
@@ -222,7 +212,7 @@ cat<<-EOF>$datsrc
 	#endif
 	char	nums[] = "0123456789";
 	char	alph[] = "abcdefghijklmnopqrstuvwxyz";
-	main(int argc, char *argv[])
+	int main(int argc, char *argv[])
 	{
 		char	dat[16], *arr;
 		int	arrsz, datsz, i;
@@ -236,6 +226,7 @@ cat<<-EOF>$datsrc
 		for ( i=0; i<datsz; i++ ) dat[i] = arr[ rand()%arrsz ];
 		dat[datsz] = '\0';
 		printf("%s\n", dat);
+		return 0;
 	}
 EOF
 $CC -o $datbin $datsrc
@@ -312,3 +303,8 @@ resp=`$rgbin 1 "(?i)(clavo)" "Pablito ClAvO un clavito" 2>&1`
 msg="One group pattern with case-insensitive matching"
 test "$resp" = "ClAvO" && f_ok "$msg" || f_no "$msg" "$resp"
 #-----------------------------------
+# Will only work if REGEX_UTF8
+resp=`$rgbin 1 '([[:alpha:]]+)' 'абвгд' 2>&1`
+msg="Unicode character class"
+test "$resp" = "абвгд" && f_ok "$msg" || f_no "$msg" "$resp"
+#-----------------------------------
author	Robin Haberkorn <rhaberkorn@fmsbw.de>	2026-06-21 21:42:12 +0200
committer	Robin Haberkorn <rhaberkorn@fmsbw.de>	2026-06-21 22:05:37 +0200
commit	13f5fd77bbc528862f295f9e7196f3ff709d185a (patch)
tree	9f9ce051bc8adf61e5ae2b4e94ccb1331bfdbfa5 /regtest_hsrex.sh
parent	10b47c9226b6267e5a4be4e79fe79314bf969025 (diff)
download	terex-13f5fd77bbc528862f295f9e7196f3ff709d185a.tar.gz