aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Robin Haberkorn <rhaberkorn@fmsbw.de> 2026-06-21 21:42:12 +0200
committer Robin Haberkorn <rhaberkorn@fmsbw.de> 2026-06-21 22:05:37 +0200
commit 13f5fd77bbc528862f295f9e7196f3ff709d185a (patch)
tree 9f9ce051bc8adf61e5ae2b4e94ccb1331bfdbfa5
parent 10b47c9226b6267e5a4be4e79fe79314bf969025 (diff)
download terex-13f5fd77bbc528862f295f9e7196f3ff709d185a.tar.gz
Unicode builds now expect UTF-8 strings
* They are built with `-DREGEX_UTF8` instead of `-DREGEX_WCHAR`.
  Functions are called re_ucomp() and re_uexec() instead for consistency.
  The library is now called libhsurex.so instead of libhswrex.so.
* The `chr` type is now always `unsigned char`.
  As a result many other uses of the `chr` type had to be changed to pchr
  (which is always large enough to hold a byte or wide character).

Generally we try to keep code changes as small as possible since we may have to backport changes from the Tcl codebase or contribute patches to the Tcl project.
-rw-r--r-- Makefile 10
-rw-r--r-- README 8
-rw-r--r-- regalone.c 267
-rw-r--r-- regalone.h 36
-rw-r--r-- regc_color.c 12
-rw-r--r-- regc_cvec.c 10
-rw-r--r-- regc_lex.c 40
-rw-r--r-- regc_locale.c 91
-rw-r--r-- regcomp.c 29
-rw-r--r-- regcustom.h 12
-rw-r--r-- rege_dfa.c 47
-rw-r--r-- regex.h 9
-rw-r--r-- regexec.c 54
-rw-r--r-- regguts.h 4
-rwxr-xr-x regtest_hsrex.sh 66
15 files changed, 252 insertions, 443 deletions
diff --git a/Makefile b/Makefile
index 384a3dd..6a4bdb9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,16 +1,16 @@
CC = gcc
# Either this one
-#CFLAGS = -DREGEX_STANDALONE -fPIC -DREG_DEBUG -g
+CFLAGS = -Wall -DREGEX_STANDALONE -fPIC -DREG_DEBUG -g
# Or this one
-CFLAGS = -DREGEX_STANDALONE -fPIC -D_NDEBUG -O3
+#CFLAGS = -Wall -DREGEX_STANDALONE -fPIC -D_NDEBUG -O3
LDFLAGS = -shared
-SRCS = regcomp.c regexec.c regerror.c regfree.c regalone.c
+SRCS = regcomp.c regexec.c regerror.c regfree.c
OBJS = $(SRCS:.c=.o)
-BINS = libhsrex.so libhswrex.so
+BINS = libhsrex.so libhsurex.so
all:
make libhsrex.so
rm -f $(OBJS)
- make "CFLAGS=$(CFLAGS) -DREGEX_WCHAR" libhswrex.so
+ make "CFLAGS=$(CFLAGS) -DREGEX_UTF8" libhsurex.so
$(BINS): $(OBJS)
$(CC) $(LDFLAGS) -o $@ $(OBJS)
clean:
diff --git a/README b/README
index 7a823b9..c80c5b1 100644
--- a/README
+++ b/README
@@ -19,7 +19,7 @@ regtest_hsrex.sh and execute again.
# Either this one
$CC -I. -I$H/inc -L. -lhsrex -o $rgbin $rgsrc
# or this one
- #$CC -I. -I$H/inc -L. -lhswrex -DREGEX_WCHAR -o $rgbin $rgsrc
+ #$CC -I. -I$H/inc -L. -lhsurex -DREGEX_UTF8 -o $rgbin $rgsrc
You would like to test with debuging information. Uncomment the proper line in
the Makefile and rebuild.
@@ -28,14 +28,14 @@ the Makefile and rebuild.
# Or this one
CFLAGS = -DREGEX_STANDALONE -fPIC -D_NDEBUG -O3
-Two libraries are provided, libhsrex.so and libhswrex.so. The first one is for
+Two libraries are provided, libhsrex.so and libhsurex.so. The first one is for
ascii character code and the second one for wide characters. Both libraries
were tested in Linux and Solaris. Compiling and runing in Window$ should be
easy.
The following entry point where defined in each library:
-re_comp() (re_wcomp() for wide char) to compile a RE
-re_exec() (re_wexec() for wide char) to parse data against a compiled RE.
+re_comp() (re_ucomp() for wide char) to compile a RE
+re_exec() (re_uexec() for wide char) to parse data against a compiled RE.
regfree() To dispose the memory of a compiled RE.
regerror() Translates error codes to ascii strings.
diff --git a/regalone.c b/regalone.c
deleted file mode 100644
index e0a5fcc..0000000
--- a/regalone.c
+++ /dev/null
@@ -1,267 +0,0 @@
-#ifdef REGEX_WCHAR
-
-#include "regcustom.h"
-
-
-
-/*
- *----------------------------------------------------------------------
- *
- * Tcl_DStringInit --
- *
- * Initializes a dynamic string, discarding any previous contents of the
- * string (Tcl_DStringFree should have been called already if the dynamic
- * string was previously in use).
- *
- * Results:
- * None.
- *
- * Side effects:
- * The dynamic string is initialized to be empty.
- *
- *----------------------------------------------------------------------
- */
-
-void
-Tcl_DStringInit(
- Tcl_DString *dsPtr) /* Pointer to structure for dynamic string. */
-{
- dsPtr->string = dsPtr->staticSpace;
- dsPtr->length = 0;
- dsPtr->spaceAvl = TCL_DSTRING_STATIC_SIZE;
- dsPtr->staticSpace[0] = '\0';
-}
-
-
-/*
- *----------------------------------------------------------------------
- *
- * Tcl_DStringSetLength --
- *
- * Change the length of a dynamic string. This can cause the string to
- * either grow or shrink, depending on the value of length.
- *
- * Results:
- * None.
- *
- * Side effects:
- * The length of dsPtr is changed to length and a null byte is stored at
- * that position in the string. If length is larger than the space
- * allocated for dsPtr, then a panic occurs.
- *
- *----------------------------------------------------------------------
- */
-
-void
-Tcl_DStringSetLength(
- Tcl_DString *dsPtr, /* Structure describing dynamic string. */
- int length) /* New length for dynamic string. */
-{
- int newsize;
-
- if (length < 0) {
- length = 0;
- }
- if (length >= dsPtr->spaceAvl) {
- /*
- * There are two interesting cases here. In the first case, the user
- * may be trying to allocate a large buffer of a specific size. It
- * would be wasteful to overallocate that buffer, so we just allocate
- * enough for the requested size plus the trailing null byte. In the
- * second case, we are growing the buffer incrementally, so we need
- * behavior similar to Tcl_DStringAppend. The requested length will
- * usually be a small delta above the current spaceAvl, so we'll end
- * up doubling the old size. This won't grow the buffer quite as
- * quickly, but it should be close enough.
- */
-
- newsize = dsPtr->spaceAvl * 2;
- if (length < newsize) {
- dsPtr->spaceAvl = newsize;
- } else {
- dsPtr->spaceAvl = length + 1;
- }
- if (dsPtr->string == dsPtr->staticSpace) {
- char *newString = ckalloc((unsigned) dsPtr->spaceAvl);
-
- memcpy(newString, dsPtr->string, (size_t) dsPtr->length);
- dsPtr->string = newString;
- } else {
- dsPtr->string = (char *) ckrealloc((void *) dsPtr->string,
- (size_t) dsPtr->spaceAvl);
- }
- }
- dsPtr->length = length;
- dsPtr->string[length] = 0;
-}
-
-
-/*
- *----------------------------------------------------------------------
- *
- * Tcl_DStringFree --
- *
- * Frees up any memory allocated for the dynamic string and reinitializes
- * the string to an empty state.
- *
- * Results:
- * None.
- *
- * Side effects:
- * The previous contents of the dynamic string are lost, and the new
- * value is an empty string.
- *
- *----------------------------------------------------------------------
- */
-
-void
-Tcl_DStringFree(
- Tcl_DString *dsPtr) /* Structure describing dynamic string. */
-{
- if (dsPtr->string != dsPtr->staticSpace) {
- ckfree(dsPtr->string);
- }
- dsPtr->string = dsPtr->staticSpace;
- dsPtr->length = 0;
- dsPtr->spaceAvl = TCL_DSTRING_STATIC_SIZE;
- dsPtr->staticSpace[0] = '\0';
-}
-
-
-
-/*
- * Unicode characters less than this value are represented by themselves in
- * UTF-8 strings.
- */
-
-#define UNICODE_SELF 0x80
-
-
-/*
- *---------------------------------------------------------------------------
- *
- * Tcl_UniCharToUtf --
- *
- * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
- * provided buffer. Equivalent to Plan 9 runetochar().
- *
- * Results:
- * The return values is the number of bytes in the buffer that were
- * consumed.
- *
- * Side effects:
- * None.
- *
- *---------------------------------------------------------------------------
- */
-
-INLINE int
-Tcl_UniCharToUtf(
- int ch, /* The Tcl_UniChar to be stored in the
- * buffer. */
- char *buf) /* Buffer in which the UTF-8 representation of
- * the Tcl_UniChar is stored. Buffer must be
- * large enough to hold the UTF-8 character
- * (at most TCL_UTF_MAX bytes). */
-{
- if ((ch > 0) && (ch < UNICODE_SELF)) {
- buf[0] = (char) ch;
- return 1;
- }
- if (ch >= 0) {
- if (ch <= 0x7FF) {
- buf[1] = (char) ((ch | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 6) | 0xC0);
- return 2;
- }
- if (ch <= 0xFFFF) {
- three:
- buf[2] = (char) ((ch | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 12) | 0xE0);
- return 3;
- }
-
-#if TCL_UTF_MAX > 3
- if (ch <= 0x1FFFFF) {
- buf[3] = (char) ((ch | 0x80) & 0xBF);
- buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 18) | 0xF0);
- return 4;
- }
- if (ch <= 0x3FFFFFF) {
- buf[4] = (char) ((ch | 0x80) & 0xBF);
- buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 24) | 0xF8);
- return 5;
- }
- if (ch <= 0x7FFFFFFF) {
- buf[5] = (char) ((ch | 0x80) & 0xBF);
- buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
- buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 30) | 0xFC);
- return 6;
- }
-#endif
- }
-
- ch = 0xFFFD;
- goto three;
-}
-
-/*
- *---------------------------------------------------------------------------
- *
- * Tcl_UniCharToUtfDString --
- *
- * Convert the given Unicode string to UTF-8.
- *
- * Results:
- * The return value is a pointer to the UTF-8 representation of the
- * Unicode string. Storage for the return value is appended to the end of
- * dsPtr.
- *
- * Side effects:
- * None.
- *
- *---------------------------------------------------------------------------
- */
-
-char *
-Tcl_UniCharToUtfDString(
- const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */
- int uniLength, /* Length of Unicode string in Tcl_UniChars
- * (must be >= 0). */
- Tcl_DString *dsPtr) /* UTF-8 representation of string is appended
- * to this previously initialized DString. */
-{
- const Tcl_UniChar *w, *wEnd;
- char *p, *string;
- int oldLength;
-
- /*
- * UTF-8 string length in bytes will be <= Unicode string length *
- * TCL_UTF_MAX.
- */
-
- oldLength = Tcl_DStringLength(dsPtr);
- Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX);
- string = Tcl_DStringValue(dsPtr) + oldLength;
-
- p = string;
- wEnd = uniStr + uniLength;
- for (w = uniStr; w < wEnd; ) {
- p += Tcl_UniCharToUtf(*w, p);
- w++;
- }
- Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
-
- return string;
-}
-
-#endif /* REGEX_WCHAR */
diff --git a/regalone.h b/regalone.h
index 940c11d..e05fdb8 100644
--- a/regalone.h
+++ b/regalone.h
@@ -2,22 +2,18 @@
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
+#include <stdint.h>
#include <string.h>
#ifndef REGEX_STANDALONE
# define REGEX_STANDALONE
#endif
-#ifdef REGEX_WCHAR
-# include <wctype.h>
-# include <wchar.h>
- typedef wchar_t chr;
- typedef chr Tcl_UniChar;
-#else
-# include <ctype.h>
- typedef unsigned char chr;
- typedef wchar_t Tcl_UniChar;
-#endif
+#include <wctype.h>
+#include <ctype.h>
+// FIXME: Should better be a signed char?
+typedef unsigned char chr;
+//typedef wchar_t Tcl_UniChar;
/*
* In The standalone version we are more concerned with performance,
@@ -34,7 +30,12 @@
#define ckrealloc(p,n) realloc(p,n)
#define ckfree(p) free(p)
-#ifdef REGEX_WCHAR
+// FIXME: Perhaps get rid of these references completely.
+#define Tcl_DStringInit(ds) do (void)(ds); while (0)
+#define Tcl_UniCharToUtfDString(s,l,ds) ((char *)(s))
+#define Tcl_DStringFree(ds) do (void)(ds); while (0)
+
+#ifdef REGEX_UTF8
# define Tcl_UniCharToLower(c) towlower(c)
# define Tcl_UniCharToUpper(c) towupper(c)
# define Tcl_UniCharToTitle(c) towupper(c)
@@ -43,9 +44,6 @@
# define Tcl_UniCharIsDigit(c) iswdigit(c)
# define Tcl_UniCharIsSpace(c) iswspace(c)
#else
-# define Tcl_DStringInit(ds)
-# define Tcl_UniCharToUtfDString(s,l,ds) (s)
-# define Tcl_DStringFree(ds)
# define Tcl_UniCharToLower(c) tolower(c)
# define Tcl_UniCharToUpper(c) toupper(c)
# define Tcl_UniCharToTitle(c) toupper(c)
@@ -238,13 +236,3 @@ typedef struct Tcl_DString {
#else
# define EXTERN extern TCL_STORAGE_CLASS
#endif
-
-
-#ifdef REGEX_WCHAR
-EXTERN void Tcl_DStringFree (Tcl_DString * dsPtr);
-EXTERN void Tcl_DStringInit (Tcl_DString * dsPtr);
-EXTERN char * Tcl_UniCharToUtfDString (CONST Tcl_UniChar * uniStr,
- int uniLength, Tcl_DString * dsPtr);
-EXTERN void Tcl_DStringSetLength (Tcl_DString * dsPtr,
- int length);
-#endif /* REGEX_WCHAR */
diff --git a/regc_color.c b/regc_color.c
index 7a98dcb..c1d4b21 100644
--- a/regc_color.c
+++ b/regc_color.c
@@ -157,7 +157,7 @@ setcolor(
pchr c,
pcolor co)
{
- uchr uc = c;
+ pchr uc = c;
int shift;
int level;
int b;
@@ -433,7 +433,7 @@ subrange(
struct state *lp,
struct state *rp)
{
- uchr uf;
+ pchr uf;
int i;
assert(from <= to);
@@ -442,8 +442,8 @@ subrange(
* First, align "from" on a tree-block boundary
*/
- uf = (uchr) from;
- i = (int) (((uf + BYTTAB - 1) & (uchr) ~BYTMASK) - uf);
+ uf = from;
+ i = (int) (((uf + BYTTAB - 1) & (pchr) ~BYTMASK) - uf);
for (; from<=to && i>0; i--, from++) {
newarc(v->nfa, PLAIN, subcolor(v->cm, from), lp, rp);
}
@@ -479,7 +479,7 @@ subblock(
struct state *lp,
struct state *rp)
{
- uchr uc = start;
+ pchr uc = start;
struct colormap *cm = v->cm;
int shift;
int level;
@@ -751,7 +751,7 @@ dumpcolors(
struct colordesc *cd;
struct colordesc *end;
color co;
- chr c;
+ uchr c;
char *has;
fprintf(f, "max %ld\n", (long) cm->max);
diff --git a/regc_cvec.c b/regc_cvec.c
index 0247521..b9fba9d 100644
--- a/regc_cvec.c
+++ b/regc_cvec.c
@@ -44,14 +44,14 @@ newcvec(
int nranges) /* ... and this many ranges... */
{
size_t nc = (size_t)nchrs + (size_t)nranges*2;
- size_t n = sizeof(struct cvec) + nc*sizeof(chr);
+ size_t n = sizeof(struct cvec) + nc*sizeof(pchr);
struct cvec *cv = (struct cvec *) MALLOC(n);
if (cv == NULL) {
return NULL;
}
cv->chrspace = nchrs;
- cv->chrs = (chr *)(((char *)cv)+sizeof(struct cvec));
+ cv->chrs = (pchr *)(((char *)cv)+sizeof(struct cvec));
cv->ranges = cv->chrs + nchrs;
cv->rangespace = nranges;
return clearcvec(cv);
@@ -81,7 +81,7 @@ addchr(
struct cvec *cv, /* character vector */
pchr c) /* character to add */
{
- cv->chrs[cv->nchrs++] = (chr)c;
+ cv->chrs[cv->nchrs++] = c;
}
/*
@@ -95,8 +95,8 @@ addrange(
pchr to) /* last character of range */
{
assert(cv->nranges < cv->rangespace);
- cv->ranges[cv->nranges*2] = (chr)from;
- cv->ranges[cv->nranges*2 + 1] = (chr)to;
+ cv->ranges[cv->nranges*2] = from;
+ cv->ranges[cv->nranges*2 + 1] = to;
cv->nranges++;
}
diff --git a/regc_lex.c b/regc_lex.c
index 4be02c6..ae71884 100644
--- a/regc_lex.c
+++ b/regc_lex.c
@@ -32,6 +32,7 @@
/* scanning macros (know about v) */
#define ATEOS() (v->now >= v->stop)
#define HAVE(n) (v->stop - v->now >= (n))
+/* will work only for ANSI characters */
#define NEXT1(c) (!ATEOS() && *v->now == CHR(c))
#define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
#define NEXT3(a,b,c) \
@@ -45,6 +46,17 @@
#define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */
#define LASTTYPE(t) (v->lasttype == (t))
+/* return and skip the next (unicode) character */
+#ifdef REGEX_UTF8
+#define SKIPCHR(x) do { \
+ wchar_t __c; \
+ v->now += mbtowc(&__c, (const char *)v->now, v->stop - v->now); \
+ x = __c; \
+} while (0)
+#else
+#define SKIPCHR(x) do x = *v->now++; while (0)
+#endif
+
/* lexical contexts */
#define L_ERE 1 /* mainline ERE/ARE */
#define L_BRE 2 /* mainline BRE */
@@ -292,7 +304,7 @@ static int /* 1 normal, 0 failure */
next(
struct vars *v)
{
- chr c;
+ pchr c;
/*
* Errors yield an infinite sequence of failures.
@@ -371,7 +383,7 @@ next(
* Okay, time to actually get a character.
*/
- c = *v->now++;
+ SKIPCHR(c);
/*
* Deal with the easy contexts, punt EREs to code below.
@@ -697,11 +709,12 @@ next(
assert(!ATEOS());
if (!(v->cflags&REG_ADVF)) {/* only AREs have non-trivial escapes */
- if (iscalnum(*v->now)) {
+ SKIPCHR(c);
+ if (iscalnum(c)) {
NOTE(REG_UBSALNUM);
NOTE(REG_UUNSPEC);
}
- RETV(PLAIN, *v->now++);
+ RETV(PLAIN, c);
}
(DISCARD)lexescape(v);
if (ISERR()) {
@@ -741,7 +754,7 @@ static int /* not actually used, but convenient for RETV */
lexescape(
struct vars *v)
{
- chr c;
+ pchr c;
static chr alert[] = {
CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
};
@@ -753,7 +766,7 @@ lexescape(
assert(v->cflags&REG_ADVF);
assert(!ATEOS());
- c = *v->now++;
+ SKIPCHR(c);
if (!iscalnum(c)) {
RETV(PLAIN, c);
}
@@ -777,7 +790,8 @@ lexescape(
if (ATEOS()) {
FAILW(REG_EESCAPE);
}
- RETV(PLAIN, (chr)(*v->now++ & 037));
+ SKIPCHR(c);
+ RETV(PLAIN, c & 037);
break;
case CHR('d'):
NOTE(REG_ULOCALE);
@@ -911,6 +925,8 @@ lexescape(
- lexdigits - slurp up digits and return chr value
^ static chr lexdigits(struct vars *, int, int, int);
*/
+// FIXME: Perhaps directly return unsigned int.
+// Why should we be restricted to 0-255?
static chr /* chr value; errors signalled via ERR */
lexdigits(
struct vars *v,
@@ -972,7 +988,7 @@ brenext(
struct vars *v,
pchr pc)
{
- chr c = (chr)pc;
+ pchr c = pc;
switch (c) {
case CHR('*'):
@@ -1039,7 +1055,7 @@ brenext(
FAILW(REG_EESCAPE);
}
- c = *v->now++;
+ SKIPCHR(c);
switch (c) {
case CHR('{'):
INTOCON(L_BBND);
@@ -1147,7 +1163,7 @@ ch(void)
* use that it hardly matters.
^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr);
*/
-static chr
+static pchr
chrnamed(
struct vars *v,
const chr *startp, /* start of name */
@@ -1166,12 +1182,12 @@ chrnamed(
v->err = errsave;
if (e != 0) {
- return (chr)lastresort;
+ return (pchr)lastresort;
}
cv = range(v, c, c, 0);
if (cv->nchrs == 0) {
- return (chr)lastresort;
+ return (pchr)lastresort;
}
return cv->chrs[0];
}
diff --git a/regc_locale.c b/regc_locale.c
index a6bc3af..97aa702 100644
--- a/regc_locale.c
+++ b/regc_locale.c
@@ -120,12 +120,16 @@ static const struct cname {
* Unicode character-class tables.
*/
+// FIXME: Perhaps define a new type here, similar to the
+// original chr, so we don't waste space on the tables
+// in ASCII (non-UTF-8) builds.
+// Or perhaps pchr should just be like chr in the original implementation.
typedef struct crange {
- chr start;
- chr end;
+ pchr start;
+ pchr end;
} crange;
-#if defined(REGEX_STANDALONE) && ! defined(REGEX_WCHAR)
+#if defined(REGEX_STANDALONE) && ! defined(REGEX_UTF8)
static const crange alphaRangeTable[] = {
{0x41, 0x5a}, {0x61, 0x7a}
@@ -133,10 +137,10 @@ static const crange alphaRangeTable[] = {
#define NUM_ALPHA_RANGE (sizeof(alphaRangeTable)/sizeof(crange))
-static const chr alphaCharTable[] = {
+static const pchr alphaCharTable[] = {
};
-#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(chr))
+#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(pchr))
static const crange digitRangeTable[] = {
{0x30, 0x39}
@@ -150,11 +154,11 @@ static const crange punctRangeTable[] = {
#define NUM_PUNCT_RANGE (sizeof(punctRangeTable)/sizeof(crange))
-static const chr punctCharTable[] = {
+static const pchr punctCharTable[] = {
0x3a, 0x3b, 0x3f, 0x40, 0x5f, 0x7b, 0x7d
};
-#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(chr))
+#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(pchr))
static const crange spaceRangeTable[] = {
{0x09, 0x0d}
@@ -162,11 +166,11 @@ static const crange spaceRangeTable[] = {
#define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange))
-static const chr spaceCharTable[] = {
+static const pchr spaceCharTable[] = {
0x20
};
-#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr))
+#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(pchr))
static const crange lowerRangeTable[] = {
{0x61, 0x7a}
@@ -174,10 +178,10 @@ static const crange lowerRangeTable[] = {
#define NUM_LOWER_RANGE (sizeof(lowerRangeTable)/sizeof(crange))
-static const chr lowerCharTable[] = {
+static const pchr lowerCharTable[] = {
};
-#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(chr))
+#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(pchr))
static const crange upperRangeTable[] = {
{0x41, 0x5a}
@@ -185,10 +189,10 @@ static const crange upperRangeTable[] = {
#define NUM_UPPER_RANGE (sizeof(upperRangeTable)/sizeof(crange))
-static const chr upperCharTable[] = {
+static const pchr upperCharTable[] = {
};
-#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(chr))
+#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(pchr))
static const crange graphRangeTable[] = {
{0x21, 0x7e}
@@ -196,10 +200,10 @@ static const crange graphRangeTable[] = {
#define NUM_GRAPH_RANGE (sizeof(graphRangeTable)/sizeof(crange))
-static const chr graphCharTable[] = {
+static const pchr graphCharTable[] = {
};
-#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(chr))
+#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(pchr))
static const crange printRangeTable[] = {
{0x20, 0x7E}
@@ -207,10 +211,10 @@ static const crange printRangeTable[] = {
#define NUM_PRINT_RANGE (sizeof(printRangeTable)/sizeof(crange))
-static const chr printCharTable[] = {
+static const pchr printCharTable[] = {
};
-#define NUM_PRINT_CHAR (sizeof(printCharTable)/sizeof(chr))
+#define NUM_PRINT_CHAR (sizeof(printCharTable)/sizeof(pchr))
#else
/*
@@ -269,7 +273,7 @@ static const crange alphaRangeTable[] = {
#define NUM_ALPHA_RANGE (sizeof(alphaRangeTable)/sizeof(crange))
-static const chr alphaCharTable[] = {
+static const pchr alphaCharTable[] = {
0x00aa, 0x00b5, 0x00ba, 0x02d0, 0x02d1, 0x02ee, 0x037a, 0x0386, 0x038c,
0x04c7, 0x04c8, 0x04cb, 0x04cc, 0x04f8, 0x04f9, 0x0559, 0x06d5, 0x06e5,
0x06e6, 0x0710, 0x093d, 0x0950, 0x098f, 0x0990, 0x09b2, 0x09dc, 0x09dd,
@@ -285,7 +289,7 @@ static const chr alphaCharTable[] = {
0x309e, 0xfb1d, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44, 0xfe74, 0xfffe
};
-#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(chr))
+#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(pchr))
/*
* Unicode: decimal digit characters
@@ -321,7 +325,7 @@ static const crange punctRangeTable[] = {
#define NUM_PUNCT_RANGE (sizeof(punctRangeTable)/sizeof(crange))
-static const chr punctCharTable[] = {
+static const pchr punctCharTable[] = {
0x003a, 0x003b, 0x003f, 0x0040, 0x005f, 0x007b, 0x007d, 0x00a1, 0x00ab,
0x00ad, 0x00b7, 0x00bb, 0x00bf, 0x037e, 0x0387, 0x0589, 0x058a, 0x05be,
0x05c0, 0x05c3, 0x05f3, 0x05f4, 0x060c, 0x061b, 0x061f, 0x06d4, 0x0964,
@@ -331,7 +335,7 @@ static const chr punctCharTable[] = {
0xfe6a, 0xfe6b, 0xff1a, 0xff1b, 0xff1f, 0xff20, 0xff3f, 0xff5b, 0xff5d
};
-#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(chr))
+#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(pchr))
/*
* Unicode: white space characters.
@@ -343,11 +347,11 @@ static const crange spaceRangeTable[] = {
#define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange))
-static const chr spaceCharTable[] = {
+static const pchr spaceCharTable[] = {
0x0020, 0x00a0, 0x1680, 0x2028, 0x2029, 0x202f, 0x3000
};
-#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr))
+#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(pchr))
/*
* Unicode: lowercase characters
@@ -366,7 +370,7 @@ static const crange lowerRangeTable[] = {
#define NUM_LOWER_RANGE (sizeof(lowerRangeTable)/sizeof(crange))
-static const chr lowerCharTable[] = {
+static const pchr lowerCharTable[] = {
0x00aa, 0x00b5, 0x00ba, 0x0101, 0x0103, 0x0105, 0x0107, 0x0109, 0x010b,
0x010d, 0x010f, 0x0111, 0x0113, 0x0115, 0x0117, 0x0119, 0x011b, 0x011d,
0x011f, 0x0121, 0x0123, 0x0125, 0x0127, 0x0129, 0x012b, 0x012d, 0x012f,
@@ -409,7 +413,7 @@ static const chr lowerCharTable[] = {
0x210f, 0x2113, 0x212f, 0x2134, 0x2139
};
-#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(chr))
+#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(pchr))
/*
* Unicode: uppercase characters.
@@ -428,7 +432,7 @@ static const crange upperRangeTable[] = {
#define NUM_UPPER_RANGE (sizeof(upperRangeTable)/sizeof(crange))
-static const chr upperCharTable[] = {
+static const pchr upperCharTable[] = {
0x0100, 0x0102, 0x0104, 0x0106, 0x0108, 0x010a, 0x010c, 0x010e, 0x0110,
0x0112, 0x0114, 0x0116, 0x0118, 0x011a, 0x011c, 0x011e, 0x0120, 0x0122,
0x0124, 0x0126, 0x0128, 0x012a, 0x012c, 0x012e, 0x0130, 0x0132, 0x0134,
@@ -471,7 +475,7 @@ static const chr upperCharTable[] = {
0x2131, 0x2133
};
-#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(chr))
+#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(pchr))
/*
* Unicode: unicode print characters excluding space.
@@ -599,7 +603,7 @@ static const crange graphRangeTable[] = {
#define NUM_GRAPH_RANGE (sizeof(graphRangeTable)/sizeof(crange))
-static const chr graphCharTable[] = {
+static const pchr graphCharTable[] = {
0x0374, 0x0375, 0x037a, 0x037e, 0x038c, 0x0488, 0x0489, 0x04c7, 0x04c8,
0x04cb, 0x04cc, 0x04f8, 0x04f9, 0x0589, 0x058a, 0x060c, 0x061b, 0x061f,
0x098f, 0x0990, 0x09b2, 0x09bc, 0x09c7, 0x09c8, 0x09d7, 0x09dc, 0x09dd,
@@ -616,7 +620,7 @@ static const chr graphCharTable[] = {
0x303e, 0x303f, 0xa4c6, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44, 0xfe74
};
-#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(chr))
+#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(pchr))
/*
* Unicode: unicode print characters including space, i.e. all Letters (class
@@ -681,7 +685,7 @@ static const crange printRangeTable[] = {
#define NUM_PRINT_RANGE (sizeof(printRangeTable)/sizeof(crange))
-static const chr printCharTable[] = {
+static const pchr printCharTable[] = {
0x037A, 0x037E, 0x038C, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x0589, 0x05BE,
0x05C0, 0x05C3, 0x060C, 0x061B, 0x061F, 0x06E9, 0x093D, 0x0950, 0x09B2,
0x0A5E, 0x0A8D, 0x0ABD, 0x0AD0, 0x0AE0, 0x0B3D, 0x0B9C, 0x0CDE, 0x0E01,
@@ -690,7 +694,7 @@ static const chr printCharTable[] = {
0x2070, 0x2300, 0x274D, 0x2756, 0x303F, 0xFB3E, 0xFE74
};
-#define NUM_PRINT_CHAR (sizeof(printCharTable)/sizeof(chr))
+#define NUM_PRINT_CHAR (sizeof(printCharTable)/sizeof(pchr))
#endif
/*
@@ -720,9 +724,17 @@ element(
assert(startp < endp);
len = endp - startp;
+#ifdef REGEX_UTF8
+ wchar_t c;
+ if (mbtowc(&c, (const char *)startp, len) == len) {
+ // single character
+ return c;
+ }
+#else
if (len == 1) {
return *startp;
}
+#endif
NOTE(REG_ULOCALE);
@@ -790,9 +802,9 @@ range(
for (c=a; c<=b; c++) {
addchr(cv, c);
- lc = Tcl_UniCharToLower((chr)c);
- uc = Tcl_UniCharToUpper((chr)c);
- tc = Tcl_UniCharToTitle((chr)c);
+ lc = Tcl_UniCharToLower(c);
+ uc = Tcl_UniCharToUpper(c);
+ tc = Tcl_UniCharToTitle(c);
if (c != lc) {
addchr(cv, lc);
}
@@ -859,7 +871,7 @@ eclass(
}
cv = getcvec(v, 1, 0);
assert(cv != NULL);
- addchr(cv, (chr)c);
+ addchr(cv, c);
return cv;
}
@@ -1097,12 +1109,12 @@ allcases(
pchr pc) /* character to get case equivs of */
{
struct cvec *cv;
- chr c = (chr)pc;
+ pchr c = pc;
chr lc, uc, tc;
- lc = Tcl_UniCharToLower((chr)c);
- uc = Tcl_UniCharToUpper((chr)c);
- tc = Tcl_UniCharToTitle((chr)c);
+ lc = Tcl_UniCharToLower(c);
+ uc = Tcl_UniCharToUpper(c);
+ tc = Tcl_UniCharToTitle(c);
if (tc != uc) {
cv = getcvec(v, 3, 0);
@@ -1147,6 +1159,7 @@ casecmp(
size_t len) /* exact length of comparison */
{
for (; len > 0; len--, x++, y++) {
+ // FIXME: Will fail if REGEX_UTF8.
if ((*x!=*y) && (Tcl_UniCharToLower(*x) != Tcl_UniCharToLower(*y))) {
return 1;
}
diff --git a/regcomp.c b/regcomp.c
index 8ff77ad..c00e19e 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -86,7 +86,7 @@ static chr newline(NOPARMS);
#ifdef REG_DEBUG
static const chr *ch(NOPARMS);
#endif
-static chr chrnamed(struct vars *, const chr *, const chr *, pchr);
+static pchr chrnamed(struct vars *, const chr *, const chr *, pchr);
/* === regc_color.c === */
static void initcm(struct vars *, struct colormap *);
static void freecm(struct colormap *);
@@ -193,7 +193,7 @@ struct vars {
int cflags; /* copy of compile flags */
int lasttype; /* type of previous token */
int nexttype; /* type of next token */
- chr nextvalue; /* value (if any) of next token */
+ pchr nextvalue; /* value (if any) of next token */
int lexcon; /* lexical context type (see lex.c) */
int nsubexp; /* subexpression count */
struct subre **subs; /* subRE pointer vector */
@@ -229,6 +229,12 @@ struct vars {
#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
#define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
+#ifdef REGEX_UTF8
+#define DECODECHR(buf, c) wctomb((char *)buf, c)
+#else
+#define DECODECHR(buf, c) ((buf)[0] = (c), 1)
+#endif
+
/* token type codes, some also used as NFA arc types */
#define EMPTY 'n' /* no token present */
#define EOS 'e' /* end of string */
@@ -1458,7 +1464,9 @@ brackpart(
celt startc, endc;
struct cvec *cv;
const chr *startp, *endp;
- chr c[1];
+ chr buf[MB_LEN_MAX];
+ size_t buf_len;
+ pchr c;
/*
* Parse something, get rid of special cases, take shortcuts.
@@ -1470,7 +1478,7 @@ brackpart(
return;
break;
case PLAIN:
- c[0] = v->nextvalue;
+ c = v->nextvalue;
NEXT();
/*
@@ -1478,10 +1486,11 @@ brackpart(
*/
if (!SEE(RANGE)) {
- onechr(v, c[0], lp, rp);
+ onechr(v, c, lp, rp);
return;
}
- startc = element(v, c, c+1);
+ buf_len = DECODECHR(buf, c);
+ startc = element(v, buf, buf+buf_len);
NOERR();
break;
case COLLEL:
@@ -1525,9 +1534,9 @@ brackpart(
switch (v->nexttype) {
case PLAIN:
case RANGE:
- c[0] = v->nextvalue;
+ buf_len = DECODECHR(buf, v->nextvalue);
NEXT();
- endc = element(v, c, c+1);
+ endc = element(v, buf, buf+buf_len);
NOERR();
break;
case COLLEL:
@@ -1623,8 +1632,8 @@ dovec(
struct state *lp,
struct state *rp)
{
- chr ch, from, to;
- const chr *p;
+ pchr ch, from, to;
+ const pchr *p;
int i;
for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
diff --git a/regcustom.h b/regcustom.h
index c341c23..1b25fed 100644
--- a/regcustom.h
+++ b/regcustom.h
@@ -99,9 +99,9 @@
#ifndef REGEX_STANDALONE
typedef Tcl_UniChar chr; /* The type itself. */
#endif
-typedef int pchr; /* What it promotes to. */
+typedef uint32_t pchr; /* What it promotes to (holds 8-bit or Unicode char). */
typedef unsigned uchr; /* Unsigned type that will hold a chr. */
-typedef int celt; /* Type to hold chr, or NOCELT */
+typedef int32_t celt; /* Type to hold chr, or NOCELT */
#define NOCELT (-1) /* Celt value which is not valid chr */
#define CHR(c) (UCHAR(c)) /* Turn char literal into chr literal */
#define DIGITVAL(c) ((c)-'0') /* Turn chr digit into its value */
@@ -109,7 +109,7 @@ typedef int celt; /* Type to hold chr, or NOCELT */
#define CHRBITS 32 /* Bits in a chr; must not use sizeof */
#define CHR_MIN 0x00000000 /* Smallest and largest chr; the value */
#define CHR_MAX 0xffffffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */
-#elif defined(REGEX_STANDALONE) && ! defined(REGEX_WCHAR)
+#elif defined(REGEX_STANDALONE) && ! defined(REGEX_UTF8)
# define CHRBITS 8
# define CHR_MIN 0x00
# define CHR_MAX 0xff
@@ -133,9 +133,9 @@ typedef int celt; /* Type to hold chr, or NOCELT */
*/
#ifdef REGEX_STANDALONE
-# ifdef REGEX_WCHAR
-# define compile re_wcomp
-# define exec re_wexec
+# ifdef REGEX_UTF8
+# define compile re_ucomp
+# define exec re_uexec
# define __REG_NOCHAR
# else
# define compile re_comp
diff --git a/rege_dfa.c b/rege_dfa.c
index fbeae20..a2f3a28 100644
--- a/rege_dfa.c
+++ b/rege_dfa.c
@@ -70,8 +70,9 @@ longest(
co = d->cnfa->bos[(v->eflags&REG_NOTBOL) ? 0 : 1];
FDEBUG(("color %ld\n", (long)co));
} else {
- co = GETCOLOR(cm, *(cp - 1));
- FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co));
+ pchr c = getchr(prevchr(cp), stop);
+ co = GETCOLOR(cm, c);
+ FDEBUG(("char %c (%u), color %ld\n", (char)c, c, (long)co));
}
css = miss(v, d, css, co, cp, start);
if (css == NULL) {
@@ -86,30 +87,32 @@ longest(
if (v->eflags&REG_FTRACE) {
while (cp < realstop) {
FDEBUG(("+++ at c%d +++\n", css - d->ssets));
- co = GETCOLOR(cm, *cp);
- FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co));
+ pchr c = getchr(cp, stop);
+ co = GETCOLOR(cm, c);
+ FDEBUG(("char %c (%u), color %ld\n", (char)c, c, (long)co));
ss = css->outs[co];
if (ss == NULL) {
- ss = miss(v, d, css, co, cp+1, start);
+ ss = miss(v, d, css, co, nextchr(cp), start);
if (ss == NULL) {
break; /* NOTE BREAK OUT */
}
}
- cp++;
+ cp = nextchr(cp);
ss->lastseen = cp;
css = ss;
}
} else {
while (cp < realstop) {
- co = GETCOLOR(cm, *cp);
+ pchr c = getchr(cp, stop);
+ co = GETCOLOR(cm, c);
ss = css->outs[co];
if (ss == NULL) {
- ss = miss(v, d, css, co, cp+1, start);
+ ss = miss(v, d, css, co, nextchr(cp), start);
if (ss == NULL) {
break; /* NOTE BREAK OUT */
}
}
- cp++;
+ cp = nextchr(cp);
ss->lastseen = cp;
css = ss;
}
@@ -151,7 +154,7 @@ longest(
}
}
if (post != NULL) { /* found one */
- return post - 1;
+ return prevchr(post);
}
return NULL;
@@ -199,8 +202,9 @@ shortest(
co = d->cnfa->bos[(v->eflags&REG_NOTBOL) ? 0 : 1];
FDEBUG(("color %ld\n", (long)co));
} else {
- co = GETCOLOR(cm, *(cp - 1));
- FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co));
+ pchr c = getchr(prevchr(cp), max);
+ co = GETCOLOR(cm, c);
+ FDEBUG(("char %c (%u), color %ld\n", (char)c, c, (long)co));
}
css = miss(v, d, css, co, cp, start);
if (css == NULL) {
@@ -216,16 +220,17 @@ shortest(
if (v->eflags&REG_FTRACE) {
while (cp < realmax) {
FDEBUG(("--- at c%d ---\n", css - d->ssets));
- co = GETCOLOR(cm, *cp);
- FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co));
+ pchr c = getchr(cp, max);
+ co = GETCOLOR(cm, c);
+ FDEBUG(("char %c (%u), color %ld\n", (char)c, c, (long)co));
ss = css->outs[co];
if (ss == NULL) {
- ss = miss(v, d, css, co, cp+1, start);
+ ss = miss(v, d, css, co, nextchr(cp), start);
if (ss == NULL) {
break; /* NOTE BREAK OUT */
}
}
- cp++;
+ cp = nextchr(cp);
ss->lastseen = cp;
css = ss;
if ((ss->flags&POSTSTATE) && cp >= realmin) {
@@ -234,15 +239,16 @@ shortest(
}
} else {
while (cp < realmax) {
- co = GETCOLOR(cm, *cp);
+ pchr c = getchr(cp, max);
+ co = GETCOLOR(cm, c);
ss = css->outs[co];
if (ss == NULL) {
- ss = miss(v, d, css, co, cp+1, start);
+ ss = miss(v, d, css, co, nextchr(cp), start);
if (ss == NULL) {
break; /* NOTE BREAK OUT */
}
}
- cp++;
+ cp = nextchr(cp);
ss->lastseen = cp;
css = ss;
if ((ss->flags&POSTSTATE) && cp >= realmin) {
@@ -261,7 +267,7 @@ shortest(
if ((ss->flags&POSTSTATE) && cp > min) {
assert(cp >= realmin);
- cp--;
+ cp = prevchr(cp);
} else if (cp == v->stop && max == v->stop) {
co = d->cnfa->eos[(v->eflags&REG_NOTEOL) ? 0 : 1];
FDEBUG(("color %ld\n", (long)co));
@@ -775,6 +781,7 @@ pickss(
* Look for oldest, or old enough anyway.
*/
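+ // FIXME: is this safe if REGEX_UTF8? "cp - start" and "cp - d->nssets*2/3"
+ // count bytes, not characters, so "ancient" may point into the middle of a
+ // multi-byte sequence.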
if (cp - start > d->nssets*2/3) { /* oldest 33% are expendable */
ancient = cp - d->nssets*2/3;
} else {
diff --git a/regex.h b/regex.h
index 2ef538a..1e32b18 100644
--- a/regex.h
+++ b/regex.h
@@ -119,13 +119,14 @@ extern "C" {
# undef regerror
# define regfree re_free
# define regerror re_error
+// FIXME
# undef __REG_WIDE_T
-# define __REG_WIDE_T wchar_t
+# define __REG_WIDE_T unsigned char
# undef __REG_WIDE_COMPILE
-# define __REG_WIDE_COMPILE re_wcomp
+# define __REG_WIDE_COMPILE re_ucomp
# undef __REG_WIDE_EXEC
-# define __REG_WIDE_EXEC re_wexec
-# ifndef REGEX_WCHAR
+# define __REG_WIDE_EXEC re_uexec
+# ifndef REGEX_UTF8
# undef __REG_NOCHAR
# endif
#endif
diff --git a/regexec.c b/regexec.c
index 24edb41..2f8a234 100644
--- a/regexec.c
+++ b/regexec.c
@@ -155,6 +155,52 @@ static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
/* automatically gathered by fwd; do not hand-edit */
/* =====^!^===== end forwards =====^!^===== */
+#ifdef REGEX_UTF8
+
+/*
+ * nextchr - advance past the UTF-8 sequence that starts at s
+ *
+ * The step width is taken from the lead byte alone; the continuation
+ * bytes are not inspected.  Anything that is not a 2-, 3- or 4-byte lead
+ * (ASCII, a stray continuation byte, an invalid lead byte) is stepped
+ * over as a single byte.
+ */
+static inline chr *
+nextchr(chr *s)
+{
+    unsigned char lead = (unsigned char)*s;
+    int width = 1;                      /* 0xxxxxxx, or not a valid lead */
+
+    if ((lead & 0xE0) == 0xC0) {        /* 110xxxxx */
+        width = 2;
+    } else if ((lead & 0xF0) == 0xE0) { /* 1110xxxx */
+        width = 3;
+    } else if ((lead & 0xF8) == 0xF0) { /* 11110xxx */
+        width = 4;
+    }
+
+    return s + width;
+}
+
+/*
+ * prevchr - step back from s to the start of the preceding character
+ *
+ * Moves back one byte, then keeps moving while the byte under the
+ * pointer is a UTF-8 continuation byte (10xxxxxx).  A well-formed
+ * sequence has at most 3 continuation bytes, so the scan is capped
+ * there: a long run of stray continuation bytes in malformed input can
+ * no longer drag the pointer arbitrarily far back (and possibly in
+ * front of the buffer).  For well-formed input the result is unchanged.
+ *
+ * The caller must guarantee that at least one byte precedes s.
+ * NOTE(review): without a start-of-buffer argument, malformed input
+ * within the first 3 bytes of the buffer can still be read past.
+ */
+static inline chr *
+prevchr(chr *s)
+{
+    int budget = 3;     /* max. continuation bytes in one sequence */
+
+    --s;
+    while (budget-- > 0 && ((unsigned char)*s & 0xC0) == 0x80) {
+        --s;
+    }
+
+    return s;
+}
+
+/*
+ * getchr - decode the UTF-8 sequence at s, reading no further than end
+ *
+ * The sequence is decoded by hand instead of through mbtowc(), so the
+ * result depends neither on the process locale (LC_CTYPE) nor on the
+ * width of wchar_t.
+ *
+ * Returns the code point.  Returns 0 (the value the previous code
+ * preset before calling mbtowc()) if s is not below end or if the
+ * sequence is truncated by end, has an invalid lead or continuation
+ * byte, is an overlong encoding, encodes a surrogate, or exceeds
+ * U+10FFFF.
+ * NOTE(review): assumes pchr can hold values up to 0x10FFFF in
+ * REGEX_UTF8 builds - confirm against its typedef.
+ */
+static inline pchr
+getchr(const chr *s, const chr *end)
+{
+    unsigned char lead;
+    unsigned long c, min;   /* min: smallest value needing this length */
+    int trail;              /* number of continuation bytes expected */
+
+    if (s >= end) {
+        return 0;
+    }
+
+    lead = (unsigned char)*s;
+    if (lead < 0x80) {                  /* 0xxxxxxx */
+        return lead;
+    }
+
+    if ((lead & 0xE0) == 0xC0) {        /* 110xxxxx */
+        c = lead & 0x1F;
+        trail = 1;
+        min = 0x80;
+    } else if ((lead & 0xF0) == 0xE0) { /* 1110xxxx */
+        c = lead & 0x0F;
+        trail = 2;
+        min = 0x800;
+    } else if ((lead & 0xF8) == 0xF0) { /* 11110xxx */
+        c = lead & 0x07;
+        trail = 3;
+        min = 0x10000;
+    } else {                            /* stray continuation or bad lead */
+        return 0;
+    }
+
+    if (end - s <= trail) {             /* sequence cut off by end */
+        return 0;
+    }
+    while (trail-- > 0) {
+        unsigned char b = (unsigned char)*++s;
+
+        if ((b & 0xC0) != 0x80) {
+            return 0;
+        }
+        c = (c << 6) | (b & 0x3F);
+    }
+
+    if (c < min || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
+        return 0;
+    }
+    return (pchr)c;
+}
+
+#else /* !REGEX_UTF8 */
+
+/*
+ * Without REGEX_UTF8 the three helpers step by, and read, exactly one chr.
+ */
+static inline chr *
+nextchr(chr *s)
+{
+    return s + 1;
+}
+
+static inline chr *
+prevchr(chr *s)
+{
+    return s - 1;
+}
+
+static inline pchr
+getchr(const chr *s, const chr *end)
+{
+    return *s;          /* end is not consulted here */
+}
+
+#endif
+
/*
- exec - match regular expression
^ int exec(regex_t *, const chr *, size_t, rm_detail_t *,
@@ -353,7 +399,7 @@ find(
d = newdfa(v, cnfa, cm, &v->dfa1);
assert(!(ISERR() && d != NULL));
NOERR();
- for (begin = open; begin <= close; begin++) {
+ for (begin = open; begin <= close; begin = nextchr(begin)) {
MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
if (shorter) {
end = shortest(v, d, begin, begin, v->stop, NULL, &hitend);
@@ -478,7 +524,7 @@ cfindloop(
open = cold;
cold = NULL;
MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
- for (begin = open; begin <= close; begin++) {
+ for (begin = open; begin <= close; begin = nextchr(begin)) {
MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
estart = begin;
estop = v->stop;
@@ -525,9 +571,9 @@ cfindloop(
*/
if (shorter) {
- estart = end + 1;
+ estart = nextchr(end);
} else {
- estop = end - 1;
+ estop = prevchr(end);
}
}
}
diff --git a/regguts.h b/regguts.h
index 67e3d03..93e01a6 100644
--- a/regguts.h
+++ b/regguts.h
@@ -241,10 +241,10 @@ struct colormap {
struct cvec {
int nchrs; /* number of chrs */
int chrspace; /* number of chrs possible */
- chr *chrs; /* pointer to vector of chrs */
+ pchr *chrs; /* pointer to vector of chrs */
int nranges; /* number of ranges (chr pairs) */
int rangespace; /* number of chrs possible */
- chr *ranges; /* pointer to vector of chr pairs */
+ pchr *ranges; /* pointer to vector of chr pairs */
};
/*
diff --git a/regtest_hsrex.sh b/regtest_hsrex.sh
index 0950c04..566a9f3 100755
--- a/regtest_hsrex.sh
+++ b/regtest_hsrex.sh
@@ -11,6 +11,8 @@
# History:
# 04/xx/02 (ww) Version 1.0
#
+#set -x
+
H=$HOME
me=`basename $0`
rgsrc=regtest_hsrex.c
@@ -71,45 +73,33 @@ cat<<-EOF>$rgsrc
#include <string.h>
#include "regalone.h"
#include "regex.h"
- #ifdef REGEX_WCHAR
- # define chr wchar_t
- # define re_comp re_wcomp
- # define re_exec re_wexec
- #else
- # define chr char
+ #ifdef REGEX_UTF8
+ # define re_comp re_ucomp
+ # define re_exec re_uexec
#endif
- size_t hexescapes2bin(chr *t, char *src, size_t mxlen)
+ size_t hexescapes2bin(unsigned char *t, char *src, size_t mxlen)
{
char *s, *xs;
size_t len;
s = xs = src;
len = 0;
- while ( s = strstr(s, "\\\x") )
+ while ( (s = strstr(s, "\\\x")) )
{
int cbin;
sscanf(&s[2], "%2x", &cbin);
- # ifdef REGEX_WCHAR
- *s = '\0';
- len += mbstowcs(&t[len], xs, mxlen-len);
- # else
- memcpy(&t[len], xs, (size_t ) (s-xs));
- len += (size_t ) (s-xs);
- # endif
+ memcpy(&t[len], xs, (size_t ) (s-xs));
+ len += (size_t ) (s-xs);
t[len++] = cbin;
s += 4;
xs = s;
}
- # ifdef REGEX_WCHAR
- len += mbstowcs(&t[len], xs, mxlen-len);
- # else
- strcpy(&t[len], xs);
- len += strlen(xs);
- # endif
+ strcpy((char *)&t[len], xs);
+ len += strlen(xs);
return len;
}
- main(int argc, char *argv[])
+ int main(int argc, char *argv[])
{
- chr re[1024*4], dat[1024*8];
+ unsigned char re[1024*4], dat[1024*8];
size_t relen, datlen;
regex_t cre;
regmatch_t pmatch[100];
@@ -118,30 +108,30 @@ cat<<-EOF>$rgsrc
//memset(&cre, '\0', sizeof(cre));
nmatch = atoi(argv[1]);
- relen = hexescapes2bin(re, argv[2], sizeof(re)/sizeof(chr));
- datlen = hexescapes2bin(dat, argv[3], sizeof(dat)/sizeof(chr));
+ relen = hexescapes2bin(re, argv[2], sizeof(re)/sizeof(char));
+ datlen = hexescapes2bin(dat, argv[3], sizeof(dat)/sizeof(char));
cflags = REG_ADVANCED | (nmatch ? 0 : REG_NOSUB);
rc = re_comp(&cre, re, relen, cflags);
if ( rc != REG_OKAY )
{
regerror(rc, &cre, buf, sizeof(buf));
fprintf(stderr, "Compile error. %s\n", buf);
- exit(1);
+ return 1;
}
if ( nmatch >= 0 && cre.re_nsub != nmatch )
{
fprintf(stderr,
- "Mismatch on number of group patterns. ",
- "Expected %d, compiled %d\n",
+ "Mismatch on number of group patterns. "
+ "Expected %d, compiled %zu\n",
nmatch, cre.re_nsub);
- exit(1);
+ return 1;
}
rc = re_exec(&cre, dat, datlen, NULL, 100, pmatch, 0);
if ( rc != REG_OKAY )
{
regerror(rc, &cre, buf, sizeof(buf));
fprintf(stderr, "Execution error. %s\n", buf);
- exit(1);
+ return 1;
}
if ( cre.re_nsub )
{
@@ -151,21 +141,21 @@ cat<<-EOF>$rgsrc
for ( i=1; i<cre.re_nsub+1 && pmatch[i].rm_so>=0; i++ )
sprintf(&buf[strlen(buf)], "%s%.*s",
i>1 ? ":" : "",
- pmatch[i].rm_eo-pmatch[i].rm_so,
+ (int)(pmatch[i].rm_eo-pmatch[i].rm_so),
argv[3]+pmatch[i].rm_so);
printf("%s\n", buf);
}
regfree(&cre);
- exit(0);
+ return 0;
}
EOF
PATH=.:$PATH
LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH
export PATH LD_LIBRARY_PATH
# Either this one
-$CC -I. -I$H/inc -L. -lhsrex -o $rgbin $rgsrc # Test ascii ch
+#$CC -Wall -g -O0 -I. -I$H/inc -L. -lhsrex -o $rgbin $rgsrc # Test ascii ch
# Or this one
-#$CC -I. -I$H/inc -L. -lhswrex -DREGEX_WCHAR -o $rgbin $rgsrc # Test wide ch
+$CC -Wall -g -O0 -I. -I$H/inc -L. -lhsurex -DREGEX_UTF8 -o $rgbin $rgsrc # Test wide ch
#-----------------------------------
resp=`$rgbin 0 "clavo" "Pablito clavo un clavito" 2>&1`
msg="Simple match"
@@ -222,7 +212,7 @@ cat<<-EOF>$datsrc
#endif
char nums[] = "0123456789";
char alph[] = "abcdefghijklmnopqrstuvwxyz";
- main(int argc, char *argv[])
+ int main(int argc, char *argv[])
{
char dat[16], *arr;
int arrsz, datsz, i;
@@ -236,6 +226,7 @@ cat<<-EOF>$datsrc
for ( i=0; i<datsz; i++ ) dat[i] = arr[ rand()%arrsz ];
dat[datsz] = '\0';
printf("%s\n", dat);
+ return 0;
}
EOF
$CC -o $datbin $datsrc
@@ -312,3 +303,8 @@ resp=`$rgbin 1 "(?i)(clavo)" "Pablito ClAvO un clavito" 2>&1`
msg="One group pattern with case-insensitive matching"
test "$resp" = "ClAvO" && f_ok "$msg" || f_no "$msg" "$resp"
#-----------------------------------
+# Will only work if REGEX_UTF8
+resp=`$rgbin 1 '([[:alpha:]]+)' 'абвгд' 2>&1`
+msg="Unicode character class"
+test "$resp" = "абвгд" && f_ok "$msg" || f_no "$msg" "$resp"
+#-----------------------------------