diff options
| author | Robin Haberkorn <rhaberkorn@fmsbw.de> | 2026-06-21 21:42:12 +0200 |
|---|---|---|
| committer | Robin Haberkorn <rhaberkorn@fmsbw.de> | 2026-06-21 22:05:37 +0200 |
| commit | 13f5fd77bbc528862f295f9e7196f3ff709d185a (patch) | |
| tree | 9f9ce051bc8adf61e5ae2b4e94ccb1331bfdbfa5 /regc_locale.c | |
| parent | 10b47c9226b6267e5a4be4e79fe79314bf969025 (diff) | |
| download | terex-13f5fd77bbc528862f295f9e7196f3ff709d185a.tar.gz | |
Unicode builds now expect UTF-8 strings
* They are built with `-DREGEX_UTF8` instead of `-DREGEX_WCHAR`.
For consistency, the functions are now called reg_ucomp() and reg_uexec() instead.
The library is now called libhsurex.so instead of libhswrex.so.
* The `chr` type is now always `unsigned char`.
As a result, many other uses of the `chr` type had to be changed to `pchr`
(which is always large enough to hold a byte or wide character).
Generally, we try to keep code changes as small as possible, since
we may have to backport changes from the Tcl codebase or contribute
patches to the Tcl project.
Diffstat (limited to 'regc_locale.c')
| -rw-r--r-- | regc_locale.c | 91 |
1 files changed, 52 insertions, 39 deletions
diff --git a/regc_locale.c b/regc_locale.c index a6bc3af..97aa702 100644 --- a/regc_locale.c +++ b/regc_locale.c @@ -120,12 +120,16 @@ static const struct cname { * Unicode character-class tables. */ +// FIXME: Perhaps define a new type here, similar to the +// original chr, so we don't waste space on the tables +// in ASCII (non-UTF-8) builds. +// Or perhaps pchr should just be like chr in the original implementation. typedef struct crange { - chr start; - chr end; + pchr start; + pchr end; } crange; -#if defined(REGEX_STANDALONE) && ! defined(REGEX_WCHAR) +#if defined(REGEX_STANDALONE) && ! defined(REGEX_UTF8) static const crange alphaRangeTable[] = { {0x41, 0x5a}, {0x61, 0x7a} @@ -133,10 +137,10 @@ static const crange alphaRangeTable[] = { #define NUM_ALPHA_RANGE (sizeof(alphaRangeTable)/sizeof(crange)) -static const chr alphaCharTable[] = { +static const pchr alphaCharTable[] = { }; -#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(chr)) +#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(pchr)) static const crange digitRangeTable[] = { {0x30, 0x39} @@ -150,11 +154,11 @@ static const crange punctRangeTable[] = { #define NUM_PUNCT_RANGE (sizeof(punctRangeTable)/sizeof(crange)) -static const chr punctCharTable[] = { +static const pchr punctCharTable[] = { 0x3a, 0x3b, 0x3f, 0x40, 0x5f, 0x7b, 0x7d }; -#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(chr)) +#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(pchr)) static const crange spaceRangeTable[] = { {0x09, 0x0d} @@ -162,11 +166,11 @@ static const crange spaceRangeTable[] = { #define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange)) -static const chr spaceCharTable[] = { +static const pchr spaceCharTable[] = { 0x20 }; -#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr)) +#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(pchr)) static const crange lowerRangeTable[] = { {0x61, 0x7a} @@ -174,10 +178,10 @@ static const crange lowerRangeTable[] = { #define NUM_LOWER_RANGE 
(sizeof(lowerRangeTable)/sizeof(crange)) -static const chr lowerCharTable[] = { +static const pchr lowerCharTable[] = { }; -#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(chr)) +#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(pchr)) static const crange upperRangeTable[] = { {0x41, 0x5a} @@ -185,10 +189,10 @@ static const crange upperRangeTable[] = { #define NUM_UPPER_RANGE (sizeof(upperRangeTable)/sizeof(crange)) -static const chr upperCharTable[] = { +static const pchr upperCharTable[] = { }; -#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(chr)) +#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(pchr)) static const crange graphRangeTable[] = { {0x21, 0x7e} @@ -196,10 +200,10 @@ static const crange graphRangeTable[] = { #define NUM_GRAPH_RANGE (sizeof(graphRangeTable)/sizeof(crange)) -static const chr graphCharTable[] = { +static const pchr graphCharTable[] = { }; -#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(chr)) +#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(pchr)) static const crange printRangeTable[] = { {0x20, 0x7E} @@ -207,10 +211,10 @@ static const crange printRangeTable[] = { #define NUM_PRINT_RANGE (sizeof(printRangeTable)/sizeof(crange)) -static const chr printCharTable[] = { +static const pchr printCharTable[] = { }; -#define NUM_PRINT_CHAR (sizeof(printCharTable)/sizeof(chr)) +#define NUM_PRINT_CHAR (sizeof(printCharTable)/sizeof(pchr)) #else /* @@ -269,7 +273,7 @@ static const crange alphaRangeTable[] = { #define NUM_ALPHA_RANGE (sizeof(alphaRangeTable)/sizeof(crange)) -static const chr alphaCharTable[] = { +static const pchr alphaCharTable[] = { 0x00aa, 0x00b5, 0x00ba, 0x02d0, 0x02d1, 0x02ee, 0x037a, 0x0386, 0x038c, 0x04c7, 0x04c8, 0x04cb, 0x04cc, 0x04f8, 0x04f9, 0x0559, 0x06d5, 0x06e5, 0x06e6, 0x0710, 0x093d, 0x0950, 0x098f, 0x0990, 0x09b2, 0x09dc, 0x09dd, @@ -285,7 +289,7 @@ static const chr alphaCharTable[] = { 0x309e, 0xfb1d, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44, 0xfe74, 0xfffe }; -#define NUM_ALPHA_CHAR 
(sizeof(alphaCharTable)/sizeof(chr)) +#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(pchr)) /* * Unicode: decimal digit characters @@ -321,7 +325,7 @@ static const crange punctRangeTable[] = { #define NUM_PUNCT_RANGE (sizeof(punctRangeTable)/sizeof(crange)) -static const chr punctCharTable[] = { +static const pchr punctCharTable[] = { 0x003a, 0x003b, 0x003f, 0x0040, 0x005f, 0x007b, 0x007d, 0x00a1, 0x00ab, 0x00ad, 0x00b7, 0x00bb, 0x00bf, 0x037e, 0x0387, 0x0589, 0x058a, 0x05be, 0x05c0, 0x05c3, 0x05f3, 0x05f4, 0x060c, 0x061b, 0x061f, 0x06d4, 0x0964, @@ -331,7 +335,7 @@ static const chr punctCharTable[] = { 0xfe6a, 0xfe6b, 0xff1a, 0xff1b, 0xff1f, 0xff20, 0xff3f, 0xff5b, 0xff5d }; -#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(chr)) +#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(pchr)) /* * Unicode: white space characters. @@ -343,11 +347,11 @@ static const crange spaceRangeTable[] = { #define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange)) -static const chr spaceCharTable[] = { +static const pchr spaceCharTable[] = { 0x0020, 0x00a0, 0x1680, 0x2028, 0x2029, 0x202f, 0x3000 }; -#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr)) +#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(pchr)) /* * Unicode: lowercase characters @@ -366,7 +370,7 @@ static const crange lowerRangeTable[] = { #define NUM_LOWER_RANGE (sizeof(lowerRangeTable)/sizeof(crange)) -static const chr lowerCharTable[] = { +static const pchr lowerCharTable[] = { 0x00aa, 0x00b5, 0x00ba, 0x0101, 0x0103, 0x0105, 0x0107, 0x0109, 0x010b, 0x010d, 0x010f, 0x0111, 0x0113, 0x0115, 0x0117, 0x0119, 0x011b, 0x011d, 0x011f, 0x0121, 0x0123, 0x0125, 0x0127, 0x0129, 0x012b, 0x012d, 0x012f, @@ -409,7 +413,7 @@ static const chr lowerCharTable[] = { 0x210f, 0x2113, 0x212f, 0x2134, 0x2139 }; -#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(chr)) +#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(pchr)) /* * Unicode: uppercase characters. 
@@ -428,7 +432,7 @@ static const crange upperRangeTable[] = { #define NUM_UPPER_RANGE (sizeof(upperRangeTable)/sizeof(crange)) -static const chr upperCharTable[] = { +static const pchr upperCharTable[] = { 0x0100, 0x0102, 0x0104, 0x0106, 0x0108, 0x010a, 0x010c, 0x010e, 0x0110, 0x0112, 0x0114, 0x0116, 0x0118, 0x011a, 0x011c, 0x011e, 0x0120, 0x0122, 0x0124, 0x0126, 0x0128, 0x012a, 0x012c, 0x012e, 0x0130, 0x0132, 0x0134, @@ -471,7 +475,7 @@ static const chr upperCharTable[] = { 0x2131, 0x2133 }; -#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(chr)) +#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(pchr)) /* * Unicode: unicode print characters excluding space. @@ -599,7 +603,7 @@ static const crange graphRangeTable[] = { #define NUM_GRAPH_RANGE (sizeof(graphRangeTable)/sizeof(crange)) -static const chr graphCharTable[] = { +static const pchr graphCharTable[] = { 0x0374, 0x0375, 0x037a, 0x037e, 0x038c, 0x0488, 0x0489, 0x04c7, 0x04c8, 0x04cb, 0x04cc, 0x04f8, 0x04f9, 0x0589, 0x058a, 0x060c, 0x061b, 0x061f, 0x098f, 0x0990, 0x09b2, 0x09bc, 0x09c7, 0x09c8, 0x09d7, 0x09dc, 0x09dd, @@ -616,7 +620,7 @@ static const chr graphCharTable[] = { 0x303e, 0x303f, 0xa4c6, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44, 0xfe74 }; -#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(chr)) +#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(pchr)) /* * Unicode: unicode print characters including space, i.e. 
all Letters (class @@ -681,7 +685,7 @@ static const crange printRangeTable[] = { #define NUM_PRINT_RANGE (sizeof(printRangeTable)/sizeof(crange)) -static const chr printCharTable[] = { +static const pchr printCharTable[] = { 0x037A, 0x037E, 0x038C, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x0589, 0x05BE, 0x05C0, 0x05C3, 0x060C, 0x061B, 0x061F, 0x06E9, 0x093D, 0x0950, 0x09B2, 0x0A5E, 0x0A8D, 0x0ABD, 0x0AD0, 0x0AE0, 0x0B3D, 0x0B9C, 0x0CDE, 0x0E01, @@ -690,7 +694,7 @@ static const chr printCharTable[] = { 0x2070, 0x2300, 0x274D, 0x2756, 0x303F, 0xFB3E, 0xFE74 }; -#define NUM_PRINT_CHAR (sizeof(printCharTable)/sizeof(chr)) +#define NUM_PRINT_CHAR (sizeof(printCharTable)/sizeof(pchr)) #endif /* @@ -720,9 +724,17 @@ element( assert(startp < endp); len = endp - startp; +#ifdef REGEX_UTF8 + wchar_t c; + if (mbtowc(&c, (const char *)startp, len) == len) { + // single character + return c; + } +#else if (len == 1) { return *startp; } +#endif NOTE(REG_ULOCALE); @@ -790,9 +802,9 @@ range( for (c=a; c<=b; c++) { addchr(cv, c); - lc = Tcl_UniCharToLower((chr)c); - uc = Tcl_UniCharToUpper((chr)c); - tc = Tcl_UniCharToTitle((chr)c); + lc = Tcl_UniCharToLower(c); + uc = Tcl_UniCharToUpper(c); + tc = Tcl_UniCharToTitle(c); if (c != lc) { addchr(cv, lc); } @@ -859,7 +871,7 @@ eclass( } cv = getcvec(v, 1, 0); assert(cv != NULL); - addchr(cv, (chr)c); + addchr(cv, c); return cv; } @@ -1097,12 +1109,12 @@ allcases( pchr pc) /* character to get case equivs of */ { struct cvec *cv; - chr c = (chr)pc; + pchr c = pc; chr lc, uc, tc; - lc = Tcl_UniCharToLower((chr)c); - uc = Tcl_UniCharToUpper((chr)c); - tc = Tcl_UniCharToTitle((chr)c); + lc = Tcl_UniCharToLower(c); + uc = Tcl_UniCharToUpper(c); + tc = Tcl_UniCharToTitle(c); if (tc != uc) { cv = getcvec(v, 3, 0); @@ -1147,6 +1159,7 @@ casecmp( size_t len) /* exact length of comparison */ { for (; len > 0; len--, x++, y++) { + // FIXME: Will fail if REGEX_UTF8. if ((*x!=*y) && (Tcl_UniCharToLower(*x) != Tcl_UniCharToLower(*y))) { return 1; } |
