diff options
| author | Robin Haberkorn <rhaberkorn@fmsbw.de> | 2026-06-21 21:42:12 +0200 |
|---|---|---|
| committer | Robin Haberkorn <rhaberkorn@fmsbw.de> | 2026-06-21 22:05:37 +0200 |
| commit | 13f5fd77bbc528862f295f9e7196f3ff709d185a (patch) | |
| tree | 9f9ce051bc8adf61e5ae2b4e94ccb1331bfdbfa5 /regc_lex.c | |
| parent | 10b47c9226b6267e5a4be4e79fe79314bf969025 (diff) | |
| download | terex-13f5fd77bbc528862f295f9e7196f3ff709d185a.tar.gz | |
Unicode builds now expect UTF-8 strings
* They are built with `-DREGEX_UTF8` instead of `-DREGEX_WCHAR`.
For consistency, the functions are now named reg_ucomp() and reg_uexec().
The library is now called libhsurex.so instead of libhswrex.so.
* The `chr` type is now always `unsigned char`.
As a result, many other uses of the `chr` type had to be changed to `pchr`
(which is always large enough to hold a byte or a wide character).
Generally we try to keep code changes as small as possible since
we may have to backport changes from the Tcl codebase or contribute
patches to the Tcl project.
Diffstat (limited to 'regc_lex.c')
| -rw-r--r-- | regc_lex.c | 40 |
1 files changed, 28 insertions, 12 deletions
@@ -32,6 +32,7 @@ /* scanning macros (know about v) */ #define ATEOS() (v->now >= v->stop) #define HAVE(n) (v->stop - v->now >= (n)) +/* will work only for ANSI characters */ #define NEXT1(c) (!ATEOS() && *v->now == CHR(c)) #define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) #define NEXT3(a,b,c) \ @@ -45,6 +46,17 @@ #define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */ #define LASTTYPE(t) (v->lasttype == (t)) +/* return and skip the next (unicode) character */ +#ifdef REGEX_UTF8 +#define SKIPCHR(x) do { \ + wchar_t __c; \ + v->now += mbtowc(&__c, (const char *)v->now, v->stop - v->now); \ + x = __c; \ +} while (0) +#else +#define SKIPCHR(x) do x = *v->now++; while (0) +#endif + /* lexical contexts */ #define L_ERE 1 /* mainline ERE/ARE */ #define L_BRE 2 /* mainline BRE */ @@ -292,7 +304,7 @@ static int /* 1 normal, 0 failure */ next( struct vars *v) { - chr c; + pchr c; /* * Errors yield an infinite sequence of failures. @@ -371,7 +383,7 @@ next( * Okay, time to actually get a character. */ - c = *v->now++; + SKIPCHR(c); /* * Deal with the easy contexts, punt EREs to code below. 
@@ -697,11 +709,12 @@ next( assert(!ATEOS()); if (!(v->cflags&REG_ADVF)) {/* only AREs have non-trivial escapes */ - if (iscalnum(*v->now)) { + SKIPCHR(c); + if (iscalnum(c)) { NOTE(REG_UBSALNUM); NOTE(REG_UUNSPEC); } - RETV(PLAIN, *v->now++); + RETV(PLAIN, c); } (DISCARD)lexescape(v); if (ISERR()) { @@ -741,7 +754,7 @@ static int /* not actually used, but convenient for RETV */ lexescape( struct vars *v) { - chr c; + pchr c; static chr alert[] = { CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') }; @@ -753,7 +766,7 @@ lexescape( assert(v->cflags&REG_ADVF); assert(!ATEOS()); - c = *v->now++; + SKIPCHR(c); if (!iscalnum(c)) { RETV(PLAIN, c); } @@ -777,7 +790,8 @@ lexescape( if (ATEOS()) { FAILW(REG_EESCAPE); } - RETV(PLAIN, (chr)(*v->now++ & 037)); + SKIPCHR(c); + RETV(PLAIN, c & 037); break; case CHR('d'): NOTE(REG_ULOCALE); @@ -911,6 +925,8 @@ lexescape( - lexdigits - slurp up digits and return chr value ^ static chr lexdigits(struct vars *, int, int, int); */ +// FIXME: Perhaps directly return unsigned int. +// Why should we be restricted to 0-255? static chr /* chr value; errors signalled via ERR */ lexdigits( struct vars *v, @@ -972,7 +988,7 @@ brenext( struct vars *v, pchr pc) { - chr c = (chr)pc; + pchr c = pc; switch (c) { case CHR('*'): @@ -1039,7 +1055,7 @@ brenext( FAILW(REG_EESCAPE); } - c = *v->now++; + SKIPCHR(c); switch (c) { case CHR('{'): INTOCON(L_BBND); @@ -1147,7 +1163,7 @@ ch(void) * use that it hardly matters. ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr); */ -static chr +static pchr chrnamed( struct vars *v, const chr *startp, /* start of name */ @@ -1166,12 +1182,12 @@ chrnamed( v->err = errsave; if (e != 0) { - return (chr)lastresort; + return (pchr)lastresort; } cv = range(v, c, c, 0); if (cv->nchrs == 0) { - return (chr)lastresort; + return (pchr)lastresort; } return cv->chrs[0]; } |
