From 13f5fd77bbc528862f295f9e7196f3ff709d185a Mon Sep 17 00:00:00 2001 From: Robin Haberkorn Date: Sun, 21 Jun 2026 21:42:12 +0200 Subject: Unicode builds now expect UTF-8 strings * They are built with `-DREGEX_UTF8` instead of `-DREGEX_WCHAR`. Functions are called reg_ucomp() and reg_uexec() instead for consistency. The library is now called libhsurex.so instead of libhswrex.so. * The `chr` type is now always `unsigned char`. As a result many other uses of the `chr` type had to be changed to pchr (which is always large enough to hold a byte or wide character). Generally we try to keep code changes as small as possible since we may have to backport changes from the Tcl codebase or contribute patches to the Tcl project. --- regc_lex.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) (limited to 'regc_lex.c') diff --git a/regc_lex.c b/regc_lex.c index 4be02c6..ae71884 100644 --- a/regc_lex.c +++ b/regc_lex.c @@ -32,6 +32,7 @@ /* scanning macros (know about v) */ #define ATEOS() (v->now >= v->stop) #define HAVE(n) (v->stop - v->now >= (n)) +/* will work only for ANSI characters */ #define NEXT1(c) (!ATEOS() && *v->now == CHR(c)) #define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) #define NEXT3(a,b,c) \ @@ -45,6 +46,17 @@ #define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */ #define LASTTYPE(t) (v->lasttype == (t)) +/* return and skip the next (unicode) character */ +#ifdef REGEX_UTF8 +#define SKIPCHR(x) do { \ + wchar_t __c; \ + v->now += mbtowc(&__c, (const char *)v->now, v->stop - v->now); \ + x = __c; \ +} while (0) +#else +#define SKIPCHR(x) do x = *v->now++; while (0) +#endif + /* lexical contexts */ #define L_ERE 1 /* mainline ERE/ARE */ #define L_BRE 2 /* mainline BRE */ @@ -292,7 +304,7 @@ static int /* 1 normal, 0 failure */ next( struct vars *v) { - chr c; + pchr c; /* * Errors yield an infinite sequence of failures. @@ -371,7 +383,7 @@ next( * Okay, time to actually get a character. 
*/ - c = *v->now++; + SKIPCHR(c); /* * Deal with the easy contexts, punt EREs to code below. @@ -697,11 +709,12 @@ next( assert(!ATEOS()); if (!(v->cflags&REG_ADVF)) {/* only AREs have non-trivial escapes */ - if (iscalnum(*v->now)) { + SKIPCHR(c); + if (iscalnum(c)) { NOTE(REG_UBSALNUM); NOTE(REG_UUNSPEC); } - RETV(PLAIN, *v->now++); + RETV(PLAIN, c); } (DISCARD)lexescape(v); if (ISERR()) { @@ -741,7 +754,7 @@ static int /* not actually used, but convenient for RETV */ lexescape( struct vars *v) { - chr c; + pchr c; static chr alert[] = { CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') }; @@ -753,7 +766,7 @@ lexescape( assert(v->cflags&REG_ADVF); assert(!ATEOS()); - c = *v->now++; + SKIPCHR(c); if (!iscalnum(c)) { RETV(PLAIN, c); } @@ -777,7 +790,8 @@ lexescape( if (ATEOS()) { FAILW(REG_EESCAPE); } - RETV(PLAIN, (chr)(*v->now++ & 037)); + SKIPCHR(c); + RETV(PLAIN, c & 037); break; case CHR('d'): NOTE(REG_ULOCALE); @@ -911,6 +925,8 @@ lexescape( - lexdigits - slurp up digits and return chr value ^ static chr lexdigits(struct vars *, int, int, int); */ +// FIXME: Perhaps directly return unsigned int. +// Why should we be restricted to 0-255? static chr /* chr value; errors signalled via ERR */ lexdigits( struct vars *v, @@ -972,7 +988,7 @@ brenext( struct vars *v, pchr pc) { - chr c = (chr)pc; + pchr c = pc; switch (c) { case CHR('*'): @@ -1039,7 +1055,7 @@ brenext( FAILW(REG_EESCAPE); } - c = *v->now++; + SKIPCHR(c); switch (c) { case CHR('{'): INTOCON(L_BBND); @@ -1147,7 +1163,7 @@ ch(void) * use that it hardly matters. ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr); */ -static chr +static pchr chrnamed( struct vars *v, const chr *startp, /* start of name */ @@ -1166,12 +1182,12 @@ chrnamed( v->err = errsave; if (e != 0) { - return (chr)lastresort; + return (pchr)lastresort; } cv = range(v, c, c, 0); if (cv->nchrs == 0) { - return (chr)lastresort; + return (pchr)lastresort; } return cv->chrs[0]; } -- cgit v1.2.3