From 13f5fd77bbc528862f295f9e7196f3ff709d185a Mon Sep 17 00:00:00 2001 From: Robin Haberkorn Date: Sun, 21 Jun 2026 21:42:12 +0200 Subject: Unicode builds now expect UTF-8 strings * They are built with `-DREGEX_UTF8` instead of `-DREGEX_WCHAR`. Functions are called reg_ucomp() and reg_uexec() instead for consistency. The library is now called libhsurex.so instead of libhswrex.so. * The `chr` type is now always `unsigned char`. As a result many other uses of the `chr` type had to be changed to pchr (which is always large enough to hold a byte or wide character). Generally we try to keep code changes as small as possible since we may have to backport changes from the Tcl codebase or contribute patches to the Tcl project. --- rege_dfa.c | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) (limited to 'rege_dfa.c') diff --git a/rege_dfa.c b/rege_dfa.c index fbeae20..a2f3a28 100644 --- a/rege_dfa.c +++ b/rege_dfa.c @@ -70,8 +70,9 @@ longest( co = d->cnfa->bos[(v->eflags&REG_NOTBOL) ? 
0 : 1]; FDEBUG(("color %ld\n", (long)co)); } else { - co = GETCOLOR(cm, *(cp - 1)); - FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co)); + pchr c = getchr(prevchr(cp), stop); + co = GETCOLOR(cm, c); + FDEBUG(("char %c (%u), color %ld\n", (char)c, c, (long)co)); } css = miss(v, d, css, co, cp, start); if (css == NULL) { @@ -86,30 +87,32 @@ longest( if (v->eflags&REG_FTRACE) { while (cp < realstop) { FDEBUG(("+++ at c%d +++\n", css - d->ssets)); - co = GETCOLOR(cm, *cp); - FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co)); + pchr c = getchr(cp, stop); + co = GETCOLOR(cm, c); + FDEBUG(("char %c (%u), color %ld\n", (char)c, c, (long)co)); ss = css->outs[co]; if (ss == NULL) { - ss = miss(v, d, css, co, cp+1, start); + ss = miss(v, d, css, co, nextchr(cp), start); if (ss == NULL) { break; /* NOTE BREAK OUT */ } } - cp++; + cp = nextchr(cp); ss->lastseen = cp; css = ss; } } else { while (cp < realstop) { - co = GETCOLOR(cm, *cp); + pchr c = getchr(cp, stop); + co = GETCOLOR(cm, c); ss = css->outs[co]; if (ss == NULL) { - ss = miss(v, d, css, co, cp+1, start); + ss = miss(v, d, css, co, nextchr(cp), start); if (ss == NULL) { break; /* NOTE BREAK OUT */ } } - cp++; + cp = nextchr(cp); ss->lastseen = cp; css = ss; } @@ -151,7 +154,7 @@ longest( } } if (post != NULL) { /* found one */ - return post - 1; + return prevchr(post); } return NULL; @@ -199,8 +202,9 @@ shortest( co = d->cnfa->bos[(v->eflags&REG_NOTBOL) ? 
0 : 1]; FDEBUG(("color %ld\n", (long)co)); } else { - co = GETCOLOR(cm, *(cp - 1)); - FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co)); + pchr c = getchr(prevchr(cp), max); + co = GETCOLOR(cm, c); + FDEBUG(("char %c (%u), color %ld\n", (char)c, c, (long)co)); } css = miss(v, d, css, co, cp, start); if (css == NULL) { @@ -216,16 +220,17 @@ shortest( if (v->eflags&REG_FTRACE) { while (cp < realmax) { FDEBUG(("--- at c%d ---\n", css - d->ssets)); - co = GETCOLOR(cm, *cp); - FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co)); + pchr c = getchr(cp, max); + co = GETCOLOR(cm, c); + FDEBUG(("char %c (%u), color %ld\n", (char)c, c, (long)co)); ss = css->outs[co]; if (ss == NULL) { - ss = miss(v, d, css, co, cp+1, start); + ss = miss(v, d, css, co, nextchr(cp), start); if (ss == NULL) { break; /* NOTE BREAK OUT */ } } - cp++; + cp = nextchr(cp); ss->lastseen = cp; css = ss; if ((ss->flags&POSTSTATE) && cp >= realmin) { @@ -234,15 +239,16 @@ shortest( } } else { while (cp < realmax) { - co = GETCOLOR(cm, *cp); + pchr c = getchr(cp, max); + co = GETCOLOR(cm, c); ss = css->outs[co]; if (ss == NULL) { - ss = miss(v, d, css, co, cp+1, start); + ss = miss(v, d, css, co, nextchr(cp), start); if (ss == NULL) { break; /* NOTE BREAK OUT */ } } - cp++; + cp = nextchr(cp); ss->lastseen = cp; css = ss; if ((ss->flags&POSTSTATE) && cp >= realmin) { @@ -261,7 +267,7 @@ shortest( if ((ss->flags&POSTSTATE) && cp > min) { assert(cp >= realmin); - cp--; + cp = prevchr(cp); } else if (cp == v->stop && max == v->stop) { co = d->cnfa->eos[(v->eflags&REG_NOTEOL) ? 0 : 1]; FDEBUG(("color %ld\n", (long)co)); @@ -775,6 +781,7 @@ pickss( * Look for oldest, or old enough anyway. */ + // FIXME: is this safe if REGEX_UTF8? if (cp - start > d->nssets*2/3) { /* oldest 33% are expendable */ ancient = cp - d->nssets*2/3; } else { -- cgit v1.2.3