diff options
-rw-r--r-- | Makefile.am | 2 | ||||
-rw-r--r-- | configure.ac | 1 | ||||
-rw-r--r-- | contrib/hsrex/Makefile.am | 10 | ||||
-rw-r--r-- | contrib/hsrex/regalone.c | 267 | ||||
-rw-r--r-- | contrib/hsrex/regalone.h | 250 | ||||
-rw-r--r-- | contrib/hsrex/regc_color.c | 848 | ||||
-rw-r--r-- | contrib/hsrex/regc_cvec.c | 146 | ||||
-rw-r--r-- | contrib/hsrex/regc_lex.c | 1185 | ||||
-rw-r--r-- | contrib/hsrex/regc_locale.c | 1163 | ||||
-rw-r--r-- | contrib/hsrex/regc_nfa.c | 1873 | ||||
-rw-r--r-- | contrib/hsrex/regcomp.c | 2169 | ||||
-rw-r--r-- | contrib/hsrex/regcustom.h | 185 | ||||
-rw-r--r-- | contrib/hsrex/rege_dfa.c | 816 | ||||
-rw-r--r-- | contrib/hsrex/regerror.c | 129 | ||||
-rw-r--r-- | contrib/hsrex/regerrs.h | 19 | ||||
-rw-r--r-- | contrib/hsrex/regex.h | 336 | ||||
-rw-r--r-- | contrib/hsrex/regexec.c | 1215 | ||||
-rw-r--r-- | contrib/hsrex/regfree.c | 60 | ||||
-rw-r--r-- | contrib/hsrex/regguts.h | 428 | ||||
-rw-r--r-- | src/Makefile.am | 5 | ||||
-rw-r--r-- | src/search.c | 126 |
21 files changed, 11164 insertions, 69 deletions
diff --git a/Makefile.am b/Makefile.am index 3375ce8..10e54d4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -5,7 +5,7 @@ ACLOCAL_AMFLAGS = -I m4 if REPLACE_MALLOC MAYBE_DLMALLOC = contrib/dlmalloc endif -SUBDIRS = lib $(MAYBE_DLMALLOC) contrib/rb3ptr src doc tests +SUBDIRS = lib $(MAYBE_DLMALLOC) contrib/rb3ptr contrib/hsrex src doc tests dist_scitecodata_DATA = sample.teco_ini diff --git a/configure.ac b/configure.ac index b7a5636..5fe6fb7 100644 --- a/configure.ac +++ b/configure.ac @@ -470,6 +470,7 @@ AC_CONFIG_FILES([GNUmakefile:Makefile.in src/GNUmakefile:src/Makefile.in] [src/interface-curses/GNUmakefile:src/interface-curses/Makefile.in] [contrib/dlmalloc/GNUmakefile:contrib/dlmalloc/Makefile.in] [contrib/rb3ptr/GNUmakefile:contrib/rb3ptr/Makefile.in] + [contrib/hsrex/GNUmakefile:contrib/hsrex/Makefile.in] [lib/GNUmakefile:lib/Makefile.in] [doc/GNUmakefile:doc/Makefile.in doc/Doxyfile] [tests/GNUmakefile:tests/Makefile.in tests/atlocal]) diff --git a/contrib/hsrex/Makefile.am b/contrib/hsrex/Makefile.am new file mode 100644 index 0000000..11b979a --- /dev/null +++ b/contrib/hsrex/Makefile.am @@ -0,0 +1,10 @@ +# FIXME: We probably need both ASCII and widechar versions +# as separate libraries. 
+AM_CPPFLAGS = -DREGEX_STANDALONE +# -DREGEX_WCHAR + +noinst_LTLIBRARIES = libhswrex.la +libhswrex_la_SOURCES = regcomp.c regexec.c regerror.c regfree.c regalone.c \ + regalone.h regcustom.h regerrs.h regex.h regguts.h +# included from regcomp.c and regexec.c +EXTRA_libhswrex_la_SOURCES = regc_color.c regc_cvec.c regc_lex.c regc_locale.c regc_nfa.c diff --git a/contrib/hsrex/regalone.c b/contrib/hsrex/regalone.c new file mode 100644 index 0000000..e0a5fcc --- /dev/null +++ b/contrib/hsrex/regalone.c @@ -0,0 +1,267 @@ +#ifdef REGEX_WCHAR + +#include "regcustom.h" + + + +/* + *---------------------------------------------------------------------- + * + * Tcl_DStringInit -- + * + * Initializes a dynamic string, discarding any previous contents of the + * string (Tcl_DStringFree should have been called already if the dynamic + * string was previously in use). + * + * Results: + * None. + * + * Side effects: + * The dynamic string is initialized to be empty. + * + *---------------------------------------------------------------------- + */ + +void +Tcl_DStringInit( + Tcl_DString *dsPtr) /* Pointer to structure for dynamic string. */ +{ + dsPtr->string = dsPtr->staticSpace; + dsPtr->length = 0; + dsPtr->spaceAvl = TCL_DSTRING_STATIC_SIZE; + dsPtr->staticSpace[0] = '\0'; +} + + +/* + *---------------------------------------------------------------------- + * + * Tcl_DStringSetLength -- + * + * Change the length of a dynamic string. This can cause the string to + * either grow or shrink, depending on the value of length. + * + * Results: + * None. + * + * Side effects: + * The length of dsPtr is changed to length and a null byte is stored at + * that position in the string. If length is larger than the space + * allocated for dsPtr, then a panic occurs. + * + *---------------------------------------------------------------------- + */ + +void +Tcl_DStringSetLength( + Tcl_DString *dsPtr, /* Structure describing dynamic string. 
*/ + int length) /* New length for dynamic string. */ +{ + int newsize; + + if (length < 0) { + length = 0; + } + if (length >= dsPtr->spaceAvl) { + /* + * There are two interesting cases here. In the first case, the user + * may be trying to allocate a large buffer of a specific size. It + * would be wasteful to overallocate that buffer, so we just allocate + * enough for the requested size plus the trailing null byte. In the + * second case, we are growing the buffer incrementally, so we need + * behavior similar to Tcl_DStringAppend. The requested length will + * usually be a small delta above the current spaceAvl, so we'll end + * up doubling the old size. This won't grow the buffer quite as + * quickly, but it should be close enough. + */ + + newsize = dsPtr->spaceAvl * 2; + if (length < newsize) { + dsPtr->spaceAvl = newsize; + } else { + dsPtr->spaceAvl = length + 1; + } + if (dsPtr->string == dsPtr->staticSpace) { + char *newString = ckalloc((unsigned) dsPtr->spaceAvl); + + memcpy(newString, dsPtr->string, (size_t) dsPtr->length); + dsPtr->string = newString; + } else { + dsPtr->string = (char *) ckrealloc((void *) dsPtr->string, + (size_t) dsPtr->spaceAvl); + } + } + dsPtr->length = length; + dsPtr->string[length] = 0; +} + + +/* + *---------------------------------------------------------------------- + * + * Tcl_DStringFree -- + * + * Frees up any memory allocated for the dynamic string and reinitializes + * the string to an empty state. + * + * Results: + * None. + * + * Side effects: + * The previous contents of the dynamic string are lost, and the new + * value is an empty string. + * + *---------------------------------------------------------------------- + */ + +void +Tcl_DStringFree( + Tcl_DString *dsPtr) /* Structure describing dynamic string. 
*/ +{ + if (dsPtr->string != dsPtr->staticSpace) { + ckfree(dsPtr->string); + } + dsPtr->string = dsPtr->staticSpace; + dsPtr->length = 0; + dsPtr->spaceAvl = TCL_DSTRING_STATIC_SIZE; + dsPtr->staticSpace[0] = '\0'; +} + + + +/* + * Unicode characters less than this value are represented by themselves in + * UTF-8 strings. + */ + +#define UNICODE_SELF 0x80 + + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UniCharToUtf -- + * + * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the + * provided buffer. Equivalent to Plan 9 runetochar(). + * + * Results: + * The return values is the number of bytes in the buffer that were + * consumed. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +INLINE int +Tcl_UniCharToUtf( + int ch, /* The Tcl_UniChar to be stored in the + * buffer. */ + char *buf) /* Buffer in which the UTF-8 representation of + * the Tcl_UniChar is stored. Buffer must be + * large enough to hold the UTF-8 character + * (at most TCL_UTF_MAX bytes). 
*/ +{ + if ((ch > 0) && (ch < UNICODE_SELF)) { + buf[0] = (char) ch; + return 1; + } + if (ch >= 0) { + if (ch <= 0x7FF) { + buf[1] = (char) ((ch | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 6) | 0xC0); + return 2; + } + if (ch <= 0xFFFF) { + three: + buf[2] = (char) ((ch | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 12) | 0xE0); + return 3; + } + +#if TCL_UTF_MAX > 3 + if (ch <= 0x1FFFFF) { + buf[3] = (char) ((ch | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 18) | 0xF0); + return 4; + } + if (ch <= 0x3FFFFFF) { + buf[4] = (char) ((ch | 0x80) & 0xBF); + buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 24) | 0xF8); + return 5; + } + if (ch <= 0x7FFFFFFF) { + buf[5] = (char) ((ch | 0x80) & 0xBF); + buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 30) | 0xFC); + return 6; + } +#endif + } + + ch = 0xFFFD; + goto three; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UniCharToUtfDString -- + * + * Convert the given Unicode string to UTF-8. + * + * Results: + * The return value is a pointer to the UTF-8 representation of the + * Unicode string. Storage for the return value is appended to the end of + * dsPtr. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +char * +Tcl_UniCharToUtfDString( + const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ + int uniLength, /* Length of Unicode string in Tcl_UniChars + * (must be >= 0). 
*/ + Tcl_DString *dsPtr) /* UTF-8 representation of string is appended + * to this previously initialized DString. */ +{ + const Tcl_UniChar *w, *wEnd; + char *p, *string; + int oldLength; + + /* + * UTF-8 string length in bytes will be <= Unicode string length * + * TCL_UTF_MAX. + */ + + oldLength = Tcl_DStringLength(dsPtr); + Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX); + string = Tcl_DStringValue(dsPtr) + oldLength; + + p = string; + wEnd = uniStr + uniLength; + for (w = uniStr; w < wEnd; ) { + p += Tcl_UniCharToUtf(*w, p); + w++; + } + Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); + + return string; +} + +#endif /* REGEX_WCHAR */ diff --git a/contrib/hsrex/regalone.h b/contrib/hsrex/regalone.h new file mode 100644 index 0000000..940c11d --- /dev/null +++ b/contrib/hsrex/regalone.h @@ -0,0 +1,250 @@ + +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifndef REGEX_STANDALONE +# define REGEX_STANDALONE +#endif + +#ifdef REGEX_WCHAR +# include <wctype.h> +# include <wchar.h> + typedef wchar_t chr; + typedef chr Tcl_UniChar; +#else +# include <ctype.h> + typedef unsigned char chr; + typedef wchar_t Tcl_UniChar; +#endif + +/* + * In The standalone version we are more concerned with performance, + * so an automatic var is our best choice. 
+ */ +#define AllocVars(vPtr) \ + struct vars regex_autovar; \ + register struct vars *vPtr = &regex_autovar; + +#define MALLOC(n) calloc(1,n) +#define FREE(p) free(VS(p)) +#define REALLOC(p,n) realloc(VS(p),n) +#define ckalloc(n) calloc(1,n) +#define ckrealloc(p,n) realloc(p,n) +#define ckfree(p) free(p) + +#ifdef REGEX_WCHAR +# define Tcl_UniCharToLower(c) towlower(c) +# define Tcl_UniCharToUpper(c) towupper(c) +# define Tcl_UniCharToTitle(c) towupper(c) +# define Tcl_UniCharIsAlpha(c) iswalpha(c) +# define Tcl_UniCharIsAlnum(c) iswalnum(c) +# define Tcl_UniCharIsDigit(c) iswdigit(c) +# define Tcl_UniCharIsSpace(c) iswspace(c) +#else +# define Tcl_DStringInit(ds) +# define Tcl_UniCharToUtfDString(s,l,ds) (s) +# define Tcl_DStringFree(ds) +# define Tcl_UniCharToLower(c) tolower(c) +# define Tcl_UniCharToUpper(c) toupper(c) +# define Tcl_UniCharToTitle(c) toupper(c) +# define Tcl_UniCharIsAlpha(c) isalpha(c) +# define Tcl_UniCharIsAlnum(c) isalnum(c) +# define Tcl_UniCharIsDigit(c) isdigit(c) +# define Tcl_UniCharIsSpace(c) isspace(c) +#endif + + +/* + * The maximum number of bytes that are necessary to represent a single + * Unicode character in UTF-8. The valid values should be 3 or 6 (or perhaps 1 + * if we want to support a non-unicode enabled core). If 3, then Tcl_UniChar + * must be 2-bytes in size (UCS-2) (the default). If 6, then Tcl_UniChar must + * be 4-bytes in size (UCS-4). At this time UCS-2 mode is the default and + * recommended mode. UCS-4 is experimental and not recommended. It works for + * the core, but most extensions expect UCS-2. + */ + +#ifndef TCL_UTF_MAX +#define TCL_UTF_MAX 3 +#endif + + +/* + * The structure defined below is used to hold dynamic strings. The only + * fields that clients should use are string and length, accessible via the + * macros Tcl_DStringValue and Tcl_DStringLength. 
+ */ + +#define TCL_DSTRING_STATIC_SIZE 200 +typedef struct Tcl_DString { + char *string; /* Points to beginning of string: either + * staticSpace below or a malloced array. */ + int length; /* Number of non-NULL characters in the + * string. */ + int spaceAvl; /* Total number of bytes available for the + * string and its terminating NULL char. */ + char staticSpace[TCL_DSTRING_STATIC_SIZE]; + /* Space to use in common case where string is + * small. */ +} Tcl_DString; + +#define Tcl_DStringLength(dsPtr) ((dsPtr)->length) +#define Tcl_DStringValue(dsPtr) ((dsPtr)->string) + + +/* + * The macro below is used to modify a "char" value (e.g. by casting it to an + * unsigned character) so that it can be used safely with macros such as + * isspace. + */ + +#define UCHAR(c) ((unsigned char) (c)) + + +/* + * Used to tag functions that are only to be visible within the module being + * built and not outside it (where this is supported by the linker). + */ + +#ifndef MODULE_SCOPE +# ifdef __cplusplus +# define MODULE_SCOPE extern "C" +# else +# define MODULE_SCOPE extern +# endif +#endif + + +/* + * Macros used to declare a function to be exported by a DLL. Used by Windows, + * maps to no-op declarations on non-Windows systems. The default build on + * windows is for a DLL, which causes the DLLIMPORT and DLLEXPORT macros to be + * nonempty. To build a static library, the macro STATIC_BUILD should be + * defined. + * + * Note: when building static but linking dynamically to MSVCRT we must still + * correctly decorate the C library imported function. Use CRTIMPORT + * for this purpose. _DLL is defined by the compiler when linking to + * MSVCRT. 
+ */ + +#if (defined(__WIN32__) && (defined(_MSC_VER) || (__BORLANDC__ >= 0x0550) || defined(__LCC__) || defined(__WATCOMC__) || (defined(__GNUC__) && defined(__declspec)))) +# define HAVE_DECLSPEC 1 +# ifdef STATIC_BUILD +# define DLLIMPORT +# define DLLEXPORT +# ifdef _DLL +# define CRTIMPORT __declspec(dllimport) +# else +# define CRTIMPORT +# endif +# else +# define DLLIMPORT __declspec(dllimport) +# define DLLEXPORT __declspec(dllexport) +# define CRTIMPORT __declspec(dllimport) +# endif +#else +# define DLLIMPORT +# if defined(__GNUC__) && __GNUC__ > 3 +# define DLLEXPORT __attribute__ ((visibility("default"))) +# else +# define DLLEXPORT +# endif +# define CRTIMPORT +#endif + +/* + * These macros are used to control whether functions are being declared for + * import or export. If a function is being declared while it is being built + * to be included in a shared library, then it should have the DLLEXPORT + * storage class. If it is being declared for use by a module that is going to + * link against the shared library, then it should have the DLLIMPORT storage + * class. If the symbol is being declared for a static build or for use from a + * stub library, then the storage class should be empty. + * + * The convention is that a macro called BUILD_xxxx, where xxxx is the name of + * a library we are building, is set on the compile line for sources that are + * to be placed in the library. When this macro is set, the storage class will + * be set to DLLEXPORT. At the end of the header file, the storage class will + * be reset to DLLIMPORT. + */ + +#undef TCL_STORAGE_CLASS +#ifdef BUILD_tcl +# define TCL_STORAGE_CLASS DLLEXPORT +#else +# ifdef USE_TCL_STUBS +# define TCL_STORAGE_CLASS +# else +# define TCL_STORAGE_CLASS DLLIMPORT +# endif +#endif + +/* + * Definitions that allow this header file to be used either with or without + * ANSI C features like function prototypes. 
+ */ + +#undef _ANSI_ARGS_ +#undef CONST +#ifndef INLINE +# define INLINE +#endif + +#ifndef NO_CONST +# define CONST const +#else +# define CONST +#endif + +#ifndef NO_PROTOTYPES +# define _ANSI_ARGS_(x) x +#else +# define _ANSI_ARGS_(x) () +#endif + +#ifdef USE_NON_CONST +# ifdef USE_COMPAT_CONST +# error define at most one of USE_NON_CONST and USE_COMPAT_CONST +# endif +# define CONST84 +# define CONST84_RETURN +#else +# ifdef USE_COMPAT_CONST +# define CONST84 +# define CONST84_RETURN CONST +# else +# define CONST84 CONST +# define CONST84_RETURN CONST +# endif +#endif + +#ifndef CONST86 +# define CONST86 CONST +#endif + +/* + * Make sure EXTERN isn't defined elsewhere + */ + +#ifdef EXTERN +# undef EXTERN +#endif /* EXTERN */ + +#ifdef __cplusplus +# define EXTERN extern "C" TCL_STORAGE_CLASS +#else +# define EXTERN extern TCL_STORAGE_CLASS +#endif + + +#ifdef REGEX_WCHAR +EXTERN void Tcl_DStringFree (Tcl_DString * dsPtr); +EXTERN void Tcl_DStringInit (Tcl_DString * dsPtr); +EXTERN char * Tcl_UniCharToUtfDString (CONST Tcl_UniChar * uniStr, + int uniLength, Tcl_DString * dsPtr); +EXTERN void Tcl_DStringSetLength (Tcl_DString * dsPtr, + int length); +#endif /* REGEX_WCHAR */ diff --git a/contrib/hsrex/regc_color.c b/contrib/hsrex/regc_color.c new file mode 100644 index 0000000..7a98dcb --- /dev/null +++ b/contrib/hsrex/regc_color.c @@ -0,0 +1,848 @@ +/* + * colorings of characters + * This file is #included by regcomp.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. 
+ * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation of + * software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Note that there are some incestuous relationships between this code and NFA + * arc maintenance, which perhaps ought to be cleaned up sometime. + */ + +#define CISERR() VISERR(cm->v) +#define CERR(e) VERR(cm->v, (e)) + +/* + - initcm - set up new colormap + ^ static void initcm(struct vars *, struct colormap *); + */ +static void +initcm( + struct vars *v, + struct colormap *cm) +{ + int i; + int j; + union tree *t; + union tree *nextt; + struct colordesc *cd; + + cm->magic = CMMAGIC; + cm->v = v; + + cm->ncds = NINLINECDS; + cm->cd = cm->cdspace; + cm->max = 0; + cm->free = 0; + + cd = cm->cd; /* cm->cd[WHITE] */ + cd->sub = NOSUB; + cd->arcs = NULL; + cd->flags = 0; + cd->nchrs = CHR_MAX - CHR_MIN + 1; + + /* + * Upper levels of tree. 
+ */ + + for (t=&cm->tree[0], j=NBYTS-1 ; j>0 ; t=nextt, j--) { + nextt = t + 1; + for (i=BYTTAB-1 ; i>=0 ; i--) { + t->tptr[i] = nextt; + } + } + + /* + * Bottom level is solid white. + */ + + t = &cm->tree[NBYTS-1]; + for (i=BYTTAB-1 ; i>=0 ; i--) { + t->tcolor[i] = WHITE; + } + cd->block = t; +} + +/* + - freecm - free dynamically-allocated things in a colormap + ^ static void freecm(struct colormap *); + */ +static void +freecm( + struct colormap *cm) +{ + size_t i; + union tree *cb; + + cm->magic = 0; + if (NBYTS > 1) { + cmtreefree(cm, cm->tree, 0); + } + for (i=1 ; i<=cm->max ; i++) { /* skip WHITE */ + if (!UNUSEDCOLOR(&cm->cd[i])) { + cb = cm->cd[i].block; + if (cb != NULL) { + FREE(cb); + } + } + } + if (cm->cd != cm->cdspace) { + FREE(cm->cd); + } +} + +/* + - cmtreefree - free a non-terminal part of a colormap tree + ^ static void cmtreefree(struct colormap *, union tree *, int); + */ +static void +cmtreefree( + struct colormap *cm, + union tree *tree, + int level) /* level number (top == 0) of this block */ +{ + int i; + union tree *t; + union tree *fillt = &cm->tree[level+1]; + union tree *cb; + + assert(level < NBYTS-1); /* this level has pointers */ + for (i=BYTTAB-1 ; i>=0 ; i--) { + t = tree->tptr[i]; + assert(t != NULL); + if (t != fillt) { + if (level < NBYTS-2) { /* more pointer blocks below */ + cmtreefree(cm, t, level+1); + FREE(t); + } else { /* color block below */ + cb = cm->cd[t->tcolor[0]].block; + if (t != cb) { /* not a solid block */ + FREE(t); + } + } + } + } +} + +/* + - setcolor - set the color of a character in a colormap + ^ static color setcolor(struct colormap *, pchr, pcolor); + */ +static color /* previous color */ +setcolor( + struct colormap *cm, + pchr c, + pcolor co) +{ + uchr uc = c; + int shift; + int level; + int b; + int bottom; + union tree *t; + union tree *newt; + union tree *fillt; + union tree *lastt; + union tree *cb; + color prev; + + assert(cm->magic == CMMAGIC); + if (CISERR() || co == COLORLESS) { + return 
COLORLESS; + } + + t = cm->tree; + for (level=0, shift=BYTBITS*(NBYTS-1) ; shift>0; level++, shift-=BYTBITS){ + b = (uc >> shift) & BYTMASK; + lastt = t; + t = lastt->tptr[b]; + assert(t != NULL); + fillt = &cm->tree[level+1]; + bottom = (shift <= BYTBITS) ? 1 : 0; + cb = (bottom) ? cm->cd[t->tcolor[0]].block : fillt; + if (t == fillt || t == cb) { /* must allocate a new block */ + newt = (union tree *) MALLOC((bottom) ? + sizeof(struct colors) : sizeof(struct ptrs)); + if (newt == NULL) { + CERR(REG_ESPACE); + return COLORLESS; + } + if (bottom) { + memcpy(newt->tcolor, t->tcolor, BYTTAB*sizeof(color)); + } else { + memcpy(newt->tptr, t->tptr, BYTTAB*sizeof(union tree *)); + } + t = newt; + lastt->tptr[b] = t; + } + } + + b = uc & BYTMASK; + prev = t->tcolor[b]; + t->tcolor[b] = (color) co; + return prev; +} + +/* + - maxcolor - report largest color number in use + ^ static color maxcolor(struct colormap *); + */ +static color +maxcolor( + struct colormap *cm) +{ + if (CISERR()) { + return COLORLESS; + } + + return (color) cm->max; +} + +/* + - newcolor - find a new color (must be subject of setcolor at once) + * Beware: may relocate the colordescs. + ^ static color newcolor(struct colormap *); + */ +static color /* COLORLESS for error */ +newcolor( + struct colormap *cm) +{ + struct colordesc *cd; + size_t n; + + if (CISERR()) { + return COLORLESS; + } + + if (cm->free != 0) { + assert(cm->free > 0); + assert((size_t) cm->free < cm->ncds); + cd = &cm->cd[cm->free]; + assert(UNUSEDCOLOR(cd)); + assert(cd->arcs == NULL); + cm->free = cd->sub; + } else if (cm->max < cm->ncds - 1) { + cm->max++; + cd = &cm->cd[cm->max]; + } else { + struct colordesc *newCd; + + /* + * Oops, must allocate more. 
+ */ + + n = cm->ncds * 2; + if (cm->cd == cm->cdspace) { + newCd = (struct colordesc *) MALLOC(n * sizeof(struct colordesc)); + if (newCd != NULL) { + memcpy(newCd, cm->cdspace, + cm->ncds * sizeof(struct colordesc)); + } + } else { + newCd = (struct colordesc *) + REALLOC(cm->cd, n * sizeof(struct colordesc)); + } + if (newCd == NULL) { + CERR(REG_ESPACE); + return COLORLESS; + } + cm->cd = newCd; + cm->ncds = n; + assert(cm->max < cm->ncds - 1); + cm->max++; + cd = &cm->cd[cm->max]; + } + + cd->nchrs = 0; + cd->sub = NOSUB; + cd->arcs = NULL; + cd->flags = 0; + cd->block = NULL; + + return (color) (cd - cm->cd); +} + +/* + - freecolor - free a color (must have no arcs or subcolor) + ^ static void freecolor(struct colormap *, pcolor); + */ +static void +freecolor( + struct colormap *cm, + pcolor co) +{ + struct colordesc *cd = &cm->cd[co]; + color pco, nco; /* for freelist scan */ + + assert(co >= 0); + if (co == WHITE) { + return; + } + + assert(cd->arcs == NULL); + assert(cd->sub == NOSUB); + assert(cd->nchrs == 0); + cd->flags = FREECOL; + if (cd->block != NULL) { + FREE(cd->block); + cd->block = NULL; /* just paranoia */ + } + + if ((size_t) co == cm->max) { + while (cm->max > WHITE && UNUSEDCOLOR(&cm->cd[cm->max])) { + cm->max--; + } + assert(cm->free >= 0); + while ((size_t) cm->free > cm->max) { + cm->free = cm->cd[cm->free].sub; + } + if (cm->free > 0) { + assert(cm->free < cm->max); + pco = cm->free; + nco = cm->cd[pco].sub; + while (nco > 0) { + if ((size_t) nco > cm->max) { + /* + * Take this one out of freelist. 
+ */ + + nco = cm->cd[nco].sub; + cm->cd[pco].sub = nco; + } else { + assert(nco < cm->max); + pco = nco; + nco = cm->cd[pco].sub; + } + } + } + } else { + cd->sub = cm->free; + cm->free = (color) (cd - cm->cd); + } +} + +/* + - pseudocolor - allocate a false color, to be managed by other means + ^ static color pseudocolor(struct colormap *); + */ +static color +pseudocolor( + struct colormap *cm) +{ + color co; + + co = newcolor(cm); + if (CISERR()) { + return COLORLESS; + } + cm->cd[co].nchrs = 1; + cm->cd[co].flags = PSEUDO; + return co; +} + +/* + - subcolor - allocate a new subcolor (if necessary) to this chr + ^ static color subcolor(struct colormap *, pchr c); + */ +static color +subcolor( + struct colormap *cm, + pchr c) +{ + color co; /* current color of c */ + color sco; /* new subcolor */ + + co = GETCOLOR(cm, c); + sco = newsub(cm, co); + if (CISERR()) { + return COLORLESS; + } + assert(sco != COLORLESS); + + if (co == sco) { /* already in an open subcolor */ + return co; /* rest is redundant */ + } + cm->cd[co].nchrs--; + cm->cd[sco].nchrs++; + setcolor(cm, c, sco); + return sco; +} + +/* + - newsub - allocate a new subcolor (if necessary) for a color + ^ static color newsub(struct colormap *, pcolor); + */ +static color +newsub( + struct colormap *cm, + pcolor co) +{ + color sco; /* new subcolor */ + + sco = cm->cd[co].sub; + if (sco == NOSUB) { /* color has no open subcolor */ + if (cm->cd[co].nchrs == 1) { /* optimization */ + return co; + } + sco = newcolor(cm); /* must create subcolor */ + if (sco == COLORLESS) { + assert(CISERR()); + return COLORLESS; + } + cm->cd[co].sub = sco; + cm->cd[sco].sub = sco; /* open subcolor points to self */ + } + assert(sco != NOSUB); + + return sco; +} + +/* + - subrange - allocate new subcolors to this range of chrs, fill in arcs + ^ static void subrange(struct vars *, pchr, pchr, struct state *, + ^ struct state *); + */ +static void +subrange( + struct vars *v, + pchr from, + pchr to, + struct state *lp, + 
struct state *rp) +{ + uchr uf; + int i; + + assert(from <= to); + + /* + * First, align "from" on a tree-block boundary + */ + + uf = (uchr) from; + i = (int) (((uf + BYTTAB - 1) & (uchr) ~BYTMASK) - uf); + for (; from<=to && i>0; i--, from++) { + newarc(v->nfa, PLAIN, subcolor(v->cm, from), lp, rp); + } + if (from > to) { /* didn't reach a boundary */ + return; + } + + /* + * Deal with whole blocks. + */ + + for (; to-from>=BYTTAB ; from+=BYTTAB) { + subblock(v, from, lp, rp); + } + + /* + * Clean up any remaining partial table. + */ + + for (; from<=to ; from++) { + newarc(v->nfa, PLAIN, subcolor(v->cm, from), lp, rp); + } +} + +/* + - subblock - allocate new subcolors for one tree block of chrs, fill in arcs + ^ static void subblock(struct vars *, pchr, struct state *, struct state *); + */ +static void +subblock( + struct vars *v, + pchr start, /* first of BYTTAB chrs */ + struct state *lp, + struct state *rp) +{ + uchr uc = start; + struct colormap *cm = v->cm; + int shift; + int level; + int i; + int b; + union tree *t; + union tree *cb; + union tree *fillt; + union tree *lastt; + int previ; + int ndone; + color co; + color sco; + + assert((uc % BYTTAB) == 0); + + /* + * Find its color block, making new pointer blocks as needed. + */ + + t = cm->tree; + fillt = NULL; + for (level=0, shift=BYTBITS*(NBYTS-1); shift>0; level++, shift-=BYTBITS) { + b = (uc >> shift) & BYTMASK; + lastt = t; + t = lastt->tptr[b]; + assert(t != NULL); + fillt = &cm->tree[level+1]; + if (t == fillt && shift > BYTBITS) { /* need new ptr block */ + t = (union tree *) MALLOC(sizeof(struct ptrs)); + if (t == NULL) { + CERR(REG_ESPACE); + return; + } + memcpy(t->tptr, fillt->tptr, BYTTAB*sizeof(union tree *)); + lastt->tptr[b] = t; + } + } + + /* + * Special cases: fill block or solid block. + */ + co = t->tcolor[0]; + cb = cm->cd[co].block; + if (t == fillt || t == cb) { + /* + * Either way, we want a subcolor solid block. 
+ */ + + sco = newsub(cm, co); + t = cm->cd[sco].block; + if (t == NULL) { /* must set it up */ + t = (union tree *) MALLOC(sizeof(struct colors)); + if (t == NULL) { + CERR(REG_ESPACE); + return; + } + for (i=0 ; i<BYTTAB ; i++) { + t->tcolor[i] = sco; + } + cm->cd[sco].block = t; + } + + /* + * Find loop must have run at least once. + */ + + lastt->tptr[b] = t; + newarc(v->nfa, PLAIN, sco, lp, rp); + cm->cd[co].nchrs -= BYTTAB; + cm->cd[sco].nchrs += BYTTAB; + return; + } + + /* + * General case, a mixed block to be altered. + */ + + i = 0; + while (i < BYTTAB) { + co = t->tcolor[i]; + sco = newsub(cm, co); + newarc(v->nfa, PLAIN, sco, lp, rp); + previ = i; + do { + t->tcolor[i++] = sco; + } while (i < BYTTAB && t->tcolor[i] == co); + ndone = i - previ; + cm->cd[co].nchrs -= ndone; + cm->cd[sco].nchrs += ndone; + } +} + +/* + - okcolors - promote subcolors to full colors + ^ static void okcolors(struct nfa *, struct colormap *); + */ +static void +okcolors( + struct nfa *nfa, + struct colormap *cm) +{ + struct colordesc *cd; + struct colordesc *end = CDEND(cm); + struct colordesc *scd; + struct arc *a; + color co; + color sco; + + for (cd=cm->cd, co=0 ; cd<end ; cd++, co++) { + sco = cd->sub; + if (UNUSEDCOLOR(cd) || sco == NOSUB) { + /* + * Has no subcolor, no further action. + */ + } else if (sco == co) { + /* + * Is subcolor, let parent deal with it. + */ + } else if (cd->nchrs == 0) { + /* + * Parent empty, its arcs change color to subcolor. + */ + + cd->sub = NOSUB; + scd = &cm->cd[sco]; + assert(scd->nchrs > 0); + assert(scd->sub == sco); + scd->sub = NOSUB; + while ((a = cd->arcs) != NULL) { + assert(a->co == co); + uncolorchain(cm, a); + a->co = sco; + colorchain(cm, a); + } + freecolor(cm, co); + } else { + /* + * Parent's arcs must gain parallel subcolor arcs. 
+ */ + + cd->sub = NOSUB; + scd = &cm->cd[sco]; + assert(scd->nchrs > 0); + assert(scd->sub == sco); + scd->sub = NOSUB; + for (a=cd->arcs ; a!=NULL ; a=a->colorchain) { + assert(a->co == co); + newarc(nfa, a->type, sco, a->from, a->to); + } + } + } +} + +/* + - colorchain - add this arc to the color chain of its color + ^ static void colorchain(struct colormap *, struct arc *); + */ +static void +colorchain( + struct colormap *cm, + struct arc *a) +{ + struct colordesc *cd = &cm->cd[a->co]; + + if (cd->arcs != NULL) { + cd->arcs->colorchainRev = a; + } + a->colorchain = cd->arcs; + a->colorchainRev = NULL; + cd->arcs = a; +} + +/* + - uncolorchain - delete this arc from the color chain of its color + ^ static void uncolorchain(struct colormap *, struct arc *); + */ +static void +uncolorchain( + struct colormap *cm, + struct arc *a) +{ + struct colordesc *cd = &cm->cd[a->co]; + struct arc *aa = a->colorchainRev; + + if (aa == NULL) { + assert(cd->arcs == a); + cd->arcs = a->colorchain; + } else { + assert(aa->colorchain == a); + aa->colorchain = a->colorchain; + } + if (a->colorchain != NULL) { + a->colorchain->colorchainRev = aa; + } + a->colorchain = NULL; /* paranoia */ + a->colorchainRev = NULL; +} + +/* + - rainbow - add arcs of all full colors (but one) between specified states + ^ static void rainbow(struct nfa *, struct colormap *, int, pcolor, + ^ struct state *, struct state *); + */ +static void +rainbow( + struct nfa *nfa, + struct colormap *cm, + int type, + pcolor but, /* COLORLESS if no exceptions */ + struct state *from, + struct state *to) +{ + struct colordesc *cd; + struct colordesc *end = CDEND(cm); + color co; + + for (cd=cm->cd, co=0 ; cd<end && !CISERR(); cd++, co++) { + if (!UNUSEDCOLOR(cd) && (cd->sub != co) && (co != but) + && !(cd->flags&PSEUDO)) { + newarc(nfa, type, co, from, to); + } + } +} + +/* + - colorcomplement - add arcs of complementary colors + * The calling sequence ought to be reconciled with cloneouts(). 
+ ^ static void colorcomplement(struct nfa *, struct colormap *, int, + ^ struct state *, struct state *, struct state *); + */ +static void +colorcomplement( + struct nfa *nfa, + struct colormap *cm, + int type, + struct state *of, /* complements of this guy's PLAIN outarcs */ + struct state *from, + struct state *to) +{ + struct colordesc *cd; + struct colordesc *end = CDEND(cm); + color co; + + assert(of != from); + for (cd=cm->cd, co=0 ; cd<end && !CISERR() ; cd++, co++) { + if (!UNUSEDCOLOR(cd) && !(cd->flags&PSEUDO)) { + if (findarc(of, PLAIN, co) == NULL) { + newarc(nfa, type, co, from, to); + } + } + } +} + +#ifdef REG_DEBUG +/* + ^ #ifdef REG_DEBUG + */ + +/* + - dumpcolors - debugging output + ^ static void dumpcolors(struct colormap *, FILE *); + */ +static void +dumpcolors( + struct colormap *cm, + FILE *f) +{ + struct colordesc *cd; + struct colordesc *end; + color co; + chr c; + char *has; + + fprintf(f, "max %ld\n", (long) cm->max); + if (NBYTS > 1) { + fillcheck(cm, cm->tree, 0, f); + } + end = CDEND(cm); + for (cd=cm->cd+1, co=1 ; cd<end ; cd++, co++) { /* skip 0 */ + if (!UNUSEDCOLOR(cd)) { + assert(cd->nchrs > 0); + has = (cd->block != NULL) ? "#" : ""; + if (cd->flags&PSEUDO) { + fprintf(f, "#%2ld%s(ps): ", (long) co, has); + } else { + fprintf(f, "#%2ld%s(%2d): ", (long) co, has, cd->nchrs); + } + + /* + * It's hard to do this more efficiently. 
+ */ + + for (c=CHR_MIN ; c<CHR_MAX ; c++) { + if (GETCOLOR(cm, c) == co) { + dumpchr(c, f); + } + } + assert(c == CHR_MAX); + if (GETCOLOR(cm, c) == co) { + dumpchr(c, f); + } + fprintf(f, "\n"); + } + } +} + +/* + - fillcheck - check proper filling of a tree + ^ static void fillcheck(struct colormap *, union tree *, int, FILE *); + */ +static void +fillcheck( + struct colormap *cm, + union tree *tree, + int level, /* level number (top == 0) of this block */ + FILE *f) +{ + int i; + union tree *t; + union tree *fillt = &cm->tree[level+1]; + + assert(level < NBYTS-1); /* this level has pointers */ + for (i=BYTTAB-1 ; i>=0 ; i--) { + t = tree->tptr[i]; + if (t == NULL) { + fprintf(f, "NULL found in filled tree!\n"); + } else if (t == fillt) { + /* empty body */ + } else if (level < NBYTS-2) { /* more pointer blocks below */ + fillcheck(cm, t, level+1, f); + } + } +} + +/* + - dumpchr - print a chr + * Kind of char-centric but works well enough for debug use. + ^ static void dumpchr(pchr, FILE *); + */ +static void +dumpchr( + pchr c, + FILE *f) +{ + if (c == '\\') { + fprintf(f, "\\\\"); + } else if (c > ' ' && c <= '~') { + putc((char) c, f); + } else { + fprintf(f, "\\u%04lx", (long) c); + } +} + +/* + ^ #endif + */ +#endif /* ifdef REG_DEBUG */ + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ diff --git a/contrib/hsrex/regc_cvec.c b/contrib/hsrex/regc_cvec.c new file mode 100644 index 0000000..0247521 --- /dev/null +++ b/contrib/hsrex/regc_cvec.c @@ -0,0 +1,146 @@ +/* + * Utility functions for handling cvecs + * This file is #included by regcomp.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. 
+ * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation of + * software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Notes: + * Only (selected) functions in _this_ file should treat chr* as non-constant. + */ + +/* + - newcvec - allocate a new cvec + ^ static struct cvec *newcvec(int, int); + */ +static struct cvec * +newcvec( + int nchrs, /* to hold this many chrs... */ + int nranges) /* ... and this many ranges... */ +{ + size_t nc = (size_t)nchrs + (size_t)nranges*2; + size_t n = sizeof(struct cvec) + nc*sizeof(chr); + struct cvec *cv = (struct cvec *) MALLOC(n); + + if (cv == NULL) { + return NULL; + } + cv->chrspace = nchrs; + cv->chrs = (chr *)(((char *)cv)+sizeof(struct cvec)); + cv->ranges = cv->chrs + nchrs; + cv->rangespace = nranges; + return clearcvec(cv); +} + +/* + - clearcvec - clear a possibly-new cvec + * Returns pointer as convenience. 
+ ^ static struct cvec *clearcvec(struct cvec *); + */ +static struct cvec * +clearcvec( + struct cvec *cv) /* character vector */ +{ + assert(cv != NULL); + cv->nchrs = 0; + cv->nranges = 0; + return cv; +} + +/* + - addchr - add a chr to a cvec + * The vector is never grown here: the caller must have reserved room + * (cv->nchrs < cv->chrspace) when the cvec was obtained. + ^ static void addchr(struct cvec *, pchr); + */ +static void +addchr( + struct cvec *cv, /* character vector */ + pchr c) /* character to add */ +{ + assert(cv->nchrs < cv->chrspace); /* same guard style as addrange() */ + cv->chrs[cv->nchrs++] = (chr)c; +} + +/* + - addrange - add a range to a cvec + ^ static void addrange(struct cvec *, pchr, pchr); + */ +static void +addrange( + struct cvec *cv, /* character vector */ + pchr from, /* first character of range */ + pchr to) /* last character of range */ +{ + assert(cv->nranges < cv->rangespace); + cv->ranges[cv->nranges*2] = (chr)from; + cv->ranges[cv->nranges*2 + 1] = (chr)to; + cv->nranges++; +} + +/* + - getcvec - get a cvec, remembering it as v->cv + ^ static struct cvec *getcvec(struct vars *, int, int); + */ +static struct cvec * +getcvec( + struct vars *v, /* context */ + int nchrs, /* to hold this many chrs... */ + int nranges) /* ... and this many ranges... */ +{ + if ((v->cv != NULL) && (nchrs <= v->cv->chrspace) && + (nranges <= v->cv->rangespace)) { + return clearcvec(v->cv); + } + + if (v->cv != NULL) { + freecvec(v->cv); + } + v->cv = newcvec(nchrs, nranges); + if (v->cv == NULL) { + ERR(REG_ESPACE); + } + + return v->cv; +} + +/* + - freecvec - free a cvec + ^ static void freecvec(struct cvec *); + */ +static void +freecvec( + struct cvec *cv) /* character vector */ +{ + FREE(cv); +} + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ diff --git a/contrib/hsrex/regc_lex.c b/contrib/hsrex/regc_lex.c new file mode 100644 index 0000000..4be02c6 --- /dev/null +++ b/contrib/hsrex/regc_lex.c @@ -0,0 +1,1185 @@ +/* + * lexical analyzer + * This file is #included by regcomp.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 
+ * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation of + * software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* scanning macros (know about v) */ +#define ATEOS() (v->now >= v->stop) +#define HAVE(n) (v->stop - v->now >= (n)) +#define NEXT1(c) (!ATEOS() && *v->now == CHR(c)) +#define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) +#define NEXT3(a,b,c) \ + (HAVE(3) && *v->now == CHR(a) && \ + *(v->now+1) == CHR(b) && \ + *(v->now+2) == CHR(c)) +#define SET(c) (v->nexttype = (c)) +#define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n)) +#define RET(c) return (SET(c), 1) +#define RETV(c, n) return (SETV(c, n), 1) +#define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */ +#define LASTTYPE(t) (v->lasttype == (t)) + +/* lexical contexts */ +#define L_ERE 1 /* mainline ERE/ARE */ +#define L_BRE 2 /* mainline BRE */ +#define L_Q 3 /* REG_QUOTE */ +#define L_EBND 4 /* ERE/ARE bound */ +#define L_BBND 5 /* BRE bound */ +#define L_BRACK 6 /* brackets */ +#define L_CEL 7 /* collating element */ +#define L_ECL 8 /* equivalence class */ +#define L_CCL 9 /* character class */ +#define INTOCON(c) (v->lexcon = (c)) +#define INCON(con) (v->lexcon == (con)) + +/* construct pointer past end of chr array */ +#define ENDOF(array) ((array) + sizeof(array)/sizeof(chr)) + +/* + - lexstart - set up lexical stuff, scan leading options + ^ static void lexstart(struct vars *); + */ +static void +lexstart( + struct vars *v) +{ + prefixes(v); /* may turn on new type bits etc. */ + NOERR(); + + if (v->cflags&REG_QUOTE) { + assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE))); + INTOCON(L_Q); + } else if (v->cflags&REG_EXTENDED) { + assert(!(v->cflags&REG_QUOTE)); + INTOCON(L_ERE); + } else { + assert(!(v->cflags&(REG_QUOTE|REG_ADVF))); + INTOCON(L_BRE); + } + + v->nexttype = EMPTY; /* remember we were at the start */ + next(v); /* set up the first token */ +} + +/* + - prefixes - implement various special prefixes + ^ static void prefixes(struct vars *); + */ +static void +prefixes( + struct vars *v) +{ + /* + * Literal string doesn't get any of this stuff. 
+ */ + + if (v->cflags®_QUOTE) { + return; + } + + /* + * Initial "***" gets special things. + */ + + if (HAVE(4) && NEXT3('*', '*', '*')) { + switch (*(v->now + 3)) { + case CHR('?'): /* "***?" error, msg shows version */ + ERR(REG_BADPAT); + return; /* proceed no further */ + break; + case CHR('='): /* "***=" shifts to literal string */ + NOTE(REG_UNONPOSIX); + v->cflags |= REG_QUOTE; + v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE); + v->now += 4; + return; /* and there can be no more prefixes */ + break; + case CHR(':'): /* "***:" shifts to AREs */ + NOTE(REG_UNONPOSIX); + v->cflags |= REG_ADVANCED; + v->now += 4; + break; + default: /* otherwise *** is just an error */ + ERR(REG_BADRPT); + return; + break; + } + } + + /* + * BREs and EREs don't get embedded options. + */ + + if ((v->cflags®_ADVANCED) != REG_ADVANCED) { + return; + } + + /* + * Embedded options (AREs only). + */ + + if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) { + NOTE(REG_UNONPOSIX); + v->now += 2; + for (; !ATEOS() && iscalpha(*v->now); v->now++) { + switch (*v->now) { + case CHR('b'): /* BREs (but why???) */ + v->cflags &= ~(REG_ADVANCED|REG_QUOTE); + break; + case CHR('c'): /* case sensitive */ + v->cflags &= ~REG_ICASE; + break; + case CHR('e'): /* plain EREs */ + v->cflags |= REG_EXTENDED; + v->cflags &= ~(REG_ADVF|REG_QUOTE); + break; + case CHR('i'): /* case insensitive */ + v->cflags |= REG_ICASE; + break; + case CHR('m'): /* Perloid synonym for n */ + case CHR('n'): /* \n affects ^ $ . [^ */ + v->cflags |= REG_NEWLINE; + break; + case CHR('p'): /* ~Perl, \n affects . 
[^ */ + v->cflags |= REG_NLSTOP; + v->cflags &= ~REG_NLANCH; + break; + case CHR('q'): /* literal string */ + v->cflags |= REG_QUOTE; + v->cflags &= ~REG_ADVANCED; + break; + case CHR('s'): /* single line, \n ordinary */ + v->cflags &= ~REG_NEWLINE; + break; + case CHR('t'): /* tight syntax */ + v->cflags &= ~REG_EXPANDED; + break; + case CHR('w'): /* weird, \n affects ^ $ only */ + v->cflags &= ~REG_NLSTOP; + v->cflags |= REG_NLANCH; + break; + case CHR('x'): /* expanded syntax */ + v->cflags |= REG_EXPANDED; + break; + default: + ERR(REG_BADOPT); + return; + } + } + if (!NEXT1(')')) { + ERR(REG_BADOPT); + return; + } + v->now++; + if (v->cflags&REG_QUOTE) { + v->cflags &= ~(REG_EXPANDED|REG_NEWLINE); + } + } +} + +/* + - lexnest - "call a subroutine", interpolating string at the lexical level + * Note, this is not a very general facility. There are a number of + * implicit assumptions about what sorts of strings can be subroutines. + ^ static void lexnest(struct vars *, const chr *, const chr *); + */ +static void +lexnest( + struct vars *v, + const chr *beginp, /* start of interpolation */ + const chr *endp) /* one past end of interpolation */ +{ + assert(v->savenow == NULL); /* only one level of nesting */ + v->savenow = v->now; + v->savestop = v->stop; + v->now = beginp; + v->stop = endp; +} + +/* + * string constants to interpolate as expansions of things like \d + */ + +static const chr backd[] = { /* \d */ + CHR('['), CHR('['), CHR(':'), + CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), + CHR(':'), CHR(']'), CHR(']') +}; +static const chr backD[] = { /* \D */ + CHR('['), CHR('^'), CHR('['), CHR(':'), + CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), + CHR(':'), CHR(']'), CHR(']') +}; +static const chr brbackd[] = { /* \d within brackets */ + CHR('['), CHR(':'), + CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), + CHR(':'), CHR(']') +}; +static const chr backs[] = { /* \s */ + CHR('['), CHR('['), CHR(':'), + CHR('s'), CHR('p'), CHR('a'), CHR('c'), 
CHR('e'), + CHR(':'), CHR(']'), CHR(']') +}; +static const chr backS[] = { /* \S */ + CHR('['), CHR('^'), CHR('['), CHR(':'), + CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), + CHR(':'), CHR(']'), CHR(']') +}; +static const chr brbacks[] = { /* \s within brackets */ + CHR('['), CHR(':'), + CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), + CHR(':'), CHR(']') +}; +static const chr backw[] = { /* \w */ + CHR('['), CHR('['), CHR(':'), + CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), + CHR(':'), CHR(']'), CHR('_'), CHR(']') +}; +static const chr backW[] = { /* \W */ + CHR('['), CHR('^'), CHR('['), CHR(':'), + CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), + CHR(':'), CHR(']'), CHR('_'), CHR(']') +}; +static const chr brbackw[] = { /* \w within brackets */ + CHR('['), CHR(':'), + CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), + CHR(':'), CHR(']'), CHR('_') +}; + +/* + - lexword - interpolate a bracket expression for word characters + * Possibly ought to inquire whether there is a "word" character class. + ^ static void lexword(struct vars *); + */ +static void +lexword( + struct vars *v) +{ + lexnest(v, backw, ENDOF(backw)); +} + +/* + - next - get next token + ^ static int next(struct vars *); + */ +static int /* 1 normal, 0 failure */ +next( + struct vars *v) +{ + chr c; + + /* + * Errors yield an infinite sequence of failures. + */ + + if (ISERR()) { + return 0; /* the error has set nexttype to EOS */ + } + + /* + * Remember flavor of last token. + */ + + v->lasttype = v->nexttype; + + /* + * REG_BOSONLY + */ + + if (v->nexttype == EMPTY && (v->cflags®_BOSONLY)) { + /* at start of a REG_BOSONLY RE */ + RETV(SBEGIN, 0); /* same as \A */ + } + + /* + * If we're nested and we've hit end, return to outer level. + */ + + if (v->savenow != NULL && ATEOS()) { + v->now = v->savenow; + v->stop = v->savestop; + v->savenow = v->savestop = NULL; + } + + /* + * Skip white space etc. 
if appropriate (not in literal or []) + */ + + if (v->cflags&REG_EXPANDED) { + switch (v->lexcon) { + case L_ERE: + case L_BRE: + case L_EBND: + case L_BBND: + skip(v); + break; + } + } + + /* + * Handle EOS, depending on context. + */ + + if (ATEOS()) { + switch (v->lexcon) { + case L_ERE: + case L_BRE: + case L_Q: + RET(EOS); + break; + case L_EBND: + case L_BBND: + FAILW(REG_EBRACE); + break; + case L_BRACK: + case L_CEL: + case L_ECL: + case L_CCL: + FAILW(REG_EBRACK); + break; + } + assert(NOTREACHED); + } + + /* + * Okay, time to actually get a character. + */ + + c = *v->now++; + + /* + * Deal with the easy contexts, punt EREs to code below. + */ + + switch (v->lexcon) { + case L_BRE: /* punt BREs to separate function */ + return brenext(v, c); + break; + case L_ERE: /* see below */ + break; + case L_Q: /* literal strings are easy */ + RETV(PLAIN, c); + break; + case L_BBND: /* bounds are fairly simple */ + case L_EBND: + switch (c) { + case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): + case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): + case CHR('8'): case CHR('9'): + RETV(DIGIT, (chr)DIGITVAL(c)); + break; + case CHR(','): + RET(','); + break; + case CHR('}'): /* ERE bound ends with } */ + if (INCON(L_EBND)) { + INTOCON(L_ERE); + if ((v->cflags&REG_ADVF) && NEXT1('?')) { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('}', 0); + } + RETV('}', 1); + } else { + FAILW(REG_BADBR); + } + break; + case CHR('\\'): /* BRE bound ends with \} */ + if (INCON(L_BBND) && NEXT1('}')) { + v->now++; + INTOCON(L_BRE); + RET('}'); + } else { + FAILW(REG_BADBR); + } + break; + default: + FAILW(REG_BADBR); + break; + } + assert(NOTREACHED); + break; + case L_BRACK: /* brackets are not too hard */ + switch (c) { + case CHR(']'): + if (LASTTYPE('[')) { + RETV(PLAIN, c); + } else { + INTOCON((v->cflags&REG_EXTENDED) ? 
L_ERE : L_BRE); + RET(']'); + } + break; + case CHR('\\'): + NOTE(REG_UBBS); + if (!(v->cflags&REG_ADVF)) { + RETV(PLAIN, c); + } + NOTE(REG_UNONPOSIX); + if (ATEOS()) { + FAILW(REG_EESCAPE); + } + (DISCARD)lexescape(v); + switch (v->nexttype) { /* not all escapes okay here */ + case PLAIN: + return 1; + break; + case CCLASS: + switch (v->nextvalue) { + case 'd': + lexnest(v, brbackd, ENDOF(brbackd)); + break; + case 's': + lexnest(v, brbacks, ENDOF(brbacks)); + break; + case 'w': + lexnest(v, brbackw, ENDOF(brbackw)); + break; + default: + FAILW(REG_EESCAPE); + break; + } + + /* + * lexnest() done, back up and try again. + */ + + v->nexttype = v->lasttype; + return next(v); + break; + } + + /* + * Not one of the acceptable escapes. + */ + + FAILW(REG_EESCAPE); + break; + case CHR('-'): + if (LASTTYPE('[') || NEXT1(']')) { + RETV(PLAIN, c); + } else { + RETV(RANGE, c); + } + break; + case CHR('['): + if (ATEOS()) { + FAILW(REG_EBRACK); + } + switch (*v->now++) { + case CHR('.'): + INTOCON(L_CEL); + + /* + * Might or might not be locale-specific. 
+ */ + + RET(COLLEL); + break; + case CHR('='): + INTOCON(L_ECL); + NOTE(REG_ULOCALE); + RET(ECLASS); + break; + case CHR(':'): + INTOCON(L_CCL); + NOTE(REG_ULOCALE); + RET(CCLASS); + break; + default: /* oops */ + v->now--; + RETV(PLAIN, c); + break; + } + assert(NOTREACHED); + break; + default: + RETV(PLAIN, c); + break; + } + assert(NOTREACHED); + break; + case L_CEL: /* collating elements are easy */ + if (c == CHR('.') && NEXT1(']')) { + v->now++; + INTOCON(L_BRACK); + RETV(END, '.'); + } else { + RETV(PLAIN, c); + } + break; + case L_ECL: /* ditto equivalence classes */ + if (c == CHR('=') && NEXT1(']')) { + v->now++; + INTOCON(L_BRACK); + RETV(END, '='); + } else { + RETV(PLAIN, c); + } + break; + case L_CCL: /* ditto character classes */ + if (c == CHR(':') && NEXT1(']')) { + v->now++; + INTOCON(L_BRACK); + RETV(END, ':'); + } else { + RETV(PLAIN, c); + } + break; + default: + assert(NOTREACHED); + break; + } + + /* + * That got rid of everything except EREs and AREs. + */ + + assert(INCON(L_ERE)); + + /* + * Deal with EREs and AREs, except for backslashes. 
+ */ + + switch (c) { + case CHR('|'): + RET('|'); + break; + case CHR('*'): + if ((v->cflags&REG_ADVF) && NEXT1('?')) { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('*', 0); + } + RETV('*', 1); + break; + case CHR('+'): + if ((v->cflags&REG_ADVF) && NEXT1('?')) { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('+', 0); + } + RETV('+', 1); + break; + case CHR('?'): + if ((v->cflags&REG_ADVF) && NEXT1('?')) { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('?', 0); + } + RETV('?', 1); + break; + case CHR('{'): /* bounds start or plain character */ + if (v->cflags&REG_EXPANDED) { + skip(v); + } + if (ATEOS() || !iscdigit(*v->now)) { + NOTE(REG_UBRACES); + NOTE(REG_UUNSPEC); + RETV(PLAIN, c); + } else { + NOTE(REG_UBOUNDS); + INTOCON(L_EBND); + RET('{'); + } + assert(NOTREACHED); + break; + case CHR('('): /* parenthesis, or advanced extension */ + if ((v->cflags&REG_ADVF) && NEXT1('?')) { + NOTE(REG_UNONPOSIX); + v->now++; + + /* + * The '?' may have been the last chr of the RE; do not read + * the selector chr past v->stop. + */ + + if (ATEOS()) { + FAILW(REG_BADRPT); + } + switch (*v->now++) { + case CHR(':'): /* non-capturing paren */ + RETV('(', 0); + break; + case CHR('#'): /* comment */ + while (!ATEOS() && *v->now != CHR(')')) { + v->now++; + } + if (!ATEOS()) { + v->now++; + } + assert(v->nexttype == v->lasttype); + return next(v); + break; + case CHR('='): /* positive lookahead */ + NOTE(REG_ULOOKAHEAD); + RETV(LACON, 1); + break; + case CHR('!'): /* negative lookahead */ + NOTE(REG_ULOOKAHEAD); + RETV(LACON, 0); + break; + default: + FAILW(REG_BADRPT); + break; + } + assert(NOTREACHED); + } + if (v->cflags&REG_NOSUB) { + RETV('(', 0); /* all parens non-capturing */ + } else { + RETV('(', 1); + } + break; + case CHR(')'): + if (LASTTYPE('(')) { + NOTE(REG_UUNSPEC); + } + RETV(')', c); + break; + case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ + if (HAVE(6) && *(v->now+0) == CHR('[') && + *(v->now+1) == CHR(':') && + (*(v->now+2) == CHR('<') || *(v->now+2) == CHR('>')) && + *(v->now+3) == CHR(':') && + *(v->now+4) == CHR(']') && + *(v->now+5) == CHR(']')) { + c = *(v->now+2); + v->now += 6; + NOTE(REG_UNONPOSIX); + RET((c == 
CHR('<')) ? '<' : '>'); + } + INTOCON(L_BRACK); + if (NEXT1('^')) { + v->now++; + RETV('[', 0); + } + RETV('[', 1); + break; + case CHR('.'): + RET('.'); + break; + case CHR('^'): + RET('^'); + break; + case CHR('$'): + RET('$'); + break; + case CHR('\\'): /* mostly punt backslashes to code below */ + if (ATEOS()) { + FAILW(REG_EESCAPE); + } + break; + default: /* ordinary character */ + RETV(PLAIN, c); + break; + } + + /* + * ERE/ARE backslash handling; backslash already eaten. + */ + + assert(!ATEOS()); + if (!(v->cflags&REG_ADVF)) {/* only AREs have non-trivial escapes */ + if (iscalnum(*v->now)) { + NOTE(REG_UBSALNUM); + NOTE(REG_UUNSPEC); + } + RETV(PLAIN, *v->now++); + } + (DISCARD)lexescape(v); + if (ISERR()) { + FAILW(REG_EESCAPE); + } + if (v->nexttype == CCLASS) {/* fudge at lexical level */ + switch (v->nextvalue) { + case 'd': lexnest(v, backd, ENDOF(backd)); break; + case 'D': lexnest(v, backD, ENDOF(backD)); break; + case 's': lexnest(v, backs, ENDOF(backs)); break; + case 'S': lexnest(v, backS, ENDOF(backS)); break; + case 'w': lexnest(v, backw, ENDOF(backw)); break; + case 'W': lexnest(v, backW, ENDOF(backW)); break; + default: + assert(NOTREACHED); + FAILW(REG_ASSERT); + break; + } + /* lexnest done, back up and try again */ + v->nexttype = v->lasttype; + return next(v); + } + + /* + * Otherwise, lexescape has already done the work. + */ + + return !ISERR(); +} + +/* + - lexescape - parse an ARE backslash escape (backslash already eaten) + * Note slightly nonstandard use of the CCLASS type code. 
+ ^ static int lexescape(struct vars *); + */ +static int /* not actually used, but convenient for RETV */ +lexescape( + struct vars *v) +{ + chr c; + static chr alert[] = { + CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') + }; + static chr esc[] = { + CHR('E'), CHR('S'), CHR('C') + }; + const chr *save; + + assert(v->cflags&REG_ADVF); + + assert(!ATEOS()); + c = *v->now++; + if (!iscalnum(c)) { + RETV(PLAIN, c); + } + + NOTE(REG_UNONPOSIX); + switch (c) { + case CHR('a'): + RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007'))); + break; + case CHR('A'): + RETV(SBEGIN, 0); + break; + case CHR('b'): + RETV(PLAIN, CHR('\b')); + break; + case CHR('B'): + RETV(PLAIN, CHR('\\')); + break; + case CHR('c'): + NOTE(REG_UUNPORT); + if (ATEOS()) { + FAILW(REG_EESCAPE); + } + RETV(PLAIN, (chr)(*v->now++ & 037)); + break; + case CHR('d'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'd'); + break; + case CHR('D'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'D'); + break; + case CHR('e'): + NOTE(REG_UUNPORT); + RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033'))); + break; + case CHR('f'): + RETV(PLAIN, CHR('\f')); + break; + case CHR('m'): + RET('<'); + break; + case CHR('M'): + RET('>'); + break; + case CHR('n'): + RETV(PLAIN, CHR('\n')); + break; + case CHR('r'): + RETV(PLAIN, CHR('\r')); + break; + case CHR('s'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 's'); + break; + case CHR('S'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'S'); + break; + case CHR('t'): + RETV(PLAIN, CHR('\t')); + break; + case CHR('u'): + c = lexdigits(v, 16, 4, 4); + if (ISERR()) { + FAILW(REG_EESCAPE); + } + RETV(PLAIN, c); + break; + case CHR('U'): + c = lexdigits(v, 16, 8, 8); + if (ISERR()) { + FAILW(REG_EESCAPE); + } + RETV(PLAIN, c); + break; + case CHR('v'): + RETV(PLAIN, CHR('\v')); + break; + case CHR('w'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'w'); + break; + case CHR('W'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'W'); + break; + case CHR('x'): + NOTE(REG_UUNPORT); + c = lexdigits(v, 16, 1, 255); /* REs >255 long 
outside spec */ + if (ISERR()) { + FAILW(REG_EESCAPE); + } + RETV(PLAIN, c); + break; + case CHR('y'): + NOTE(REG_ULOCALE); + RETV(WBDRY, 0); + break; + case CHR('Y'): + NOTE(REG_ULOCALE); + RETV(NWBDRY, 0); + break; + case CHR('Z'): + RETV(SEND, 0); + break; + case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): + case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): + case CHR('9'): + save = v->now; + v->now--; /* put first digit back */ + c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ + if (ISERR()) { + FAILW(REG_EESCAPE); + } + + /* + * Ugly heuristic (first test is "exactly 1 digit?") + */ + + if (v->now - save == 0 || ((int) c > 0 && (int)c <= v->nsubexp)) { + NOTE(REG_UBACKREF); + RETV(BACKREF, (chr)c); + } + + /* + * Oops, doesn't look like it's a backref after all... + */ + + v->now = save; + + /* + * And fall through into octal number. + */ + + case CHR('0'): + NOTE(REG_UUNPORT); + v->now--; /* put first digit back */ + c = lexdigits(v, 8, 1, 3); + if (ISERR()) { + FAILW(REG_EESCAPE); + } + RETV(PLAIN, c); + break; + default: + assert(iscalpha(c)); + FAILW(REG_EESCAPE); /* unknown alphabetic escape */ + break; + } + assert(NOTREACHED); +} + +/* + - lexdigits - slurp up digits and return chr value + ^ static chr lexdigits(struct vars *, int, int, int); + */ +static chr /* chr value; errors signalled via ERR */ +lexdigits( + struct vars *v, + int base, + int minlen, + int maxlen) +{ + uchr n; /* unsigned to avoid overflow misbehavior */ + int len; + chr c; + int d; + const uchr ub = (uchr) base; + + n = 0; + for (len = 0; len < maxlen && !ATEOS(); len++) { + c = *v->now++; + switch (c) { + case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): + case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): + case CHR('8'): case CHR('9'): + d = DIGITVAL(c); + break; + case CHR('a'): case CHR('A'): d = 10; break; + case CHR('b'): case CHR('B'): d = 11; break; + case CHR('c'): case CHR('C'): d = 12; break; + case CHR('d'): case 
CHR('D'): d = 13; break; + case CHR('e'): case CHR('E'): d = 14; break; + case CHR('f'): case CHR('F'): d = 15; break; + default: + v->now--; /* oops, not a digit at all */ + d = -1; + break; + } + + if (d >= base) { /* not a plausible digit */ + v->now--; + d = -1; + } + if (d < 0) { + break; /* NOTE BREAK OUT */ + } + n = n*ub + (uchr)d; + } + if (len < minlen) { + ERR(REG_EESCAPE); + } + + return (chr)n; +} + +/* + - brenext - get next BRE token + * This is much like EREs except for all the stupid backslashes and the + * context-dependency of some things. + ^ static int brenext(struct vars *, pchr); + */ +static int /* 1 normal, 0 failure */ +brenext( + struct vars *v, + pchr pc) +{ + chr c = (chr)pc; + + switch (c) { + case CHR('*'): + if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) { + RETV(PLAIN, c); + } + RET('*'); + break; + case CHR('['): + if (HAVE(6) && *(v->now+0) == CHR('[') && + *(v->now+1) == CHR(':') && + (*(v->now+2) == CHR('<') || *(v->now+2) == CHR('>')) && + *(v->now+3) == CHR(':') && + *(v->now+4) == CHR(']') && + *(v->now+5) == CHR(']')) { + c = *(v->now+2); + v->now += 6; + NOTE(REG_UNONPOSIX); + RET((c == CHR('<')) ? 
'<' : '>'); + } + INTOCON(L_BRACK); + if (NEXT1('^')) { + v->now++; + RETV('[', 0); + } + RETV('[', 1); + break; + case CHR('.'): + RET('.'); + break; + case CHR('^'): + if (LASTTYPE(EMPTY)) { + RET('^'); + } + if (LASTTYPE('(')) { + NOTE(REG_UUNSPEC); + RET('^'); + } + RETV(PLAIN, c); + break; + case CHR('$'): + if (v->cflags&REG_EXPANDED) { + skip(v); + } + if (ATEOS()) { + RET('$'); + } + if (NEXT2('\\', ')')) { + NOTE(REG_UUNSPEC); + RET('$'); + } + RETV(PLAIN, c); + break; + case CHR('\\'): + break; /* see below */ + default: + RETV(PLAIN, c); + break; + } + + assert(c == CHR('\\')); + + if (ATEOS()) { + FAILW(REG_EESCAPE); + } + + c = *v->now++; + switch (c) { + case CHR('{'): + INTOCON(L_BBND); + NOTE(REG_UBOUNDS); + RET('{'); + break; + case CHR('('): + RETV('(', 1); + break; + case CHR(')'): + RETV(')', c); + break; + case CHR('<'): + NOTE(REG_UNONPOSIX); + RET('<'); + break; + case CHR('>'): + NOTE(REG_UNONPOSIX); + RET('>'); + break; + case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): + case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): + case CHR('9'): + NOTE(REG_UBACKREF); + RETV(BACKREF, (chr)DIGITVAL(c)); + break; + default: + if (iscalnum(c)) { + NOTE(REG_UBSALNUM); + NOTE(REG_UUNSPEC); + } + RETV(PLAIN, c); + break; + } + + assert(NOTREACHED); +} + +/* + - skip - skip white space and comments in expanded form + ^ static void skip(struct vars *); + */ +static void +skip( + struct vars *v) +{ + const chr *start = v->now; + + assert(v->cflags&REG_EXPANDED); + + for (;;) { + while (!ATEOS() && iscspace(*v->now)) { + v->now++; + } + if (ATEOS() || *v->now != CHR('#')) { + break; /* NOTE BREAK OUT */ + } + assert(NEXT1('#')); + while (!ATEOS() && *v->now != CHR('\n')) { + v->now++; + } + + /* + * Leave the newline to be picked up by the iscspace loop. + */ + } + + if (v->now != start) { + NOTE(REG_UNONPOSIX); + } +} + +/* + - newline - return the chr for a newline + * This helps confine use of CHR to this source file. 
+ ^ static chr newline(NOPARMS); + */ +static chr +newline(void) +{ + return CHR('\n'); +} + +/* + - ch - return the chr sequence for regc_locale.c's fake collating element ch + * This helps confine use of CHR to this source file. Beware that the caller + * knows how long the sequence is. + ^ #ifdef REG_DEBUG + ^ static const chr *ch(NOPARMS); + ^ #endif + */ +#ifdef REG_DEBUG +static const chr * +ch(void) +{ + static chr chstr[] = { CHR('c'), CHR('h'), CHR('\0') }; + + return chstr; +} +#endif + +/* + - chrnamed - return the chr known by a given (chr string) name + * The code is a bit clumsy, but this routine gets only such specialized + * use that it hardly matters. + ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr); + */ +static chr +chrnamed( + struct vars *v, + const chr *startp, /* start of name */ + const chr *endp, /* just past end of name */ + pchr lastresort) /* what to return if name lookup fails */ +{ + celt c; + int errsave; + int e; + struct cvec *cv; + + errsave = v->err; + v->err = 0; + c = element(v, startp, endp); + e = v->err; + v->err = errsave; + + if (e != 0) { + return (chr)lastresort; + } + + cv = range(v, c, c, 0); + if (cv->nchrs == 0) { + return (chr)lastresort; + } + return cv->chrs[0]; +} + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ diff --git a/contrib/hsrex/regc_locale.c b/contrib/hsrex/regc_locale.c new file mode 100644 index 0000000..a6bc3af --- /dev/null +++ b/contrib/hsrex/regc_locale.c @@ -0,0 +1,1163 @@ +/* + * regc_locale.c -- + * + * This file contains the Unicode locale specific regexp routines. + * This file is #included by regcomp.c. + * + * Copyright (c) 1998 by Scriptics Corporation. + * + * See the file "license.terms" for information on usage and redistribution of + * this file, and for a DISCLAIMER OF ALL WARRANTIES. 
+ * + * RCS: @(#) $Id: regc_locale.c,v 1.20 2007/12/13 15:23:14 dgp Exp $ + */ + +/* ASCII character-name table */ + +static const struct cname { + const char *name; + const char code; +} cnames[] = { + {"NUL", '\0'}, + {"SOH", '\001'}, + {"STX", '\002'}, + {"ETX", '\003'}, + {"EOT", '\004'}, + {"ENQ", '\005'}, + {"ACK", '\006'}, + {"BEL", '\007'}, + {"alert", '\007'}, + {"BS", '\010'}, + {"backspace", '\b'}, + {"HT", '\011'}, + {"tab", '\t'}, + {"LF", '\012'}, + {"newline", '\n'}, + {"VT", '\013'}, + {"vertical-tab", '\v'}, + {"FF", '\014'}, + {"form-feed", '\f'}, + {"CR", '\015'}, + {"carriage-return", '\r'}, + {"SO", '\016'}, + {"SI", '\017'}, + {"DLE", '\020'}, + {"DC1", '\021'}, + {"DC2", '\022'}, + {"DC3", '\023'}, + {"DC4", '\024'}, + {"NAK", '\025'}, + {"SYN", '\026'}, + {"ETB", '\027'}, + {"CAN", '\030'}, + {"EM", '\031'}, + {"SUB", '\032'}, + {"ESC", '\033'}, + {"IS4", '\034'}, + {"FS", '\034'}, + {"IS3", '\035'}, + {"GS", '\035'}, + {"IS2", '\036'}, + {"RS", '\036'}, + {"IS1", '\037'}, + {"US", '\037'}, + {"space", ' '}, + {"exclamation-mark",'!'}, + {"quotation-mark", '"'}, + {"number-sign", '#'}, + {"dollar-sign", '$'}, + {"percent-sign", '%'}, + {"ampersand", '&'}, + {"apostrophe", '\''}, + {"left-parenthesis",'('}, + {"right-parenthesis", ')'}, + {"asterisk", '*'}, + {"plus-sign", '+'}, + {"comma", ','}, + {"hyphen", '-'}, + {"hyphen-minus", '-'}, + {"period", '.'}, + {"full-stop", '.'}, + {"slash", '/'}, + {"solidus", '/'}, + {"zero", '0'}, + {"one", '1'}, + {"two", '2'}, + {"three", '3'}, + {"four", '4'}, + {"five", '5'}, + {"six", '6'}, + {"seven", '7'}, + {"eight", '8'}, + {"nine", '9'}, + {"colon", ':'}, + {"semicolon", ';'}, + {"less-than-sign", '<'}, + {"equals-sign", '='}, + {"greater-than-sign", '>'}, + {"question-mark", '?'}, + {"commercial-at", '@'}, + {"left-square-bracket", '['}, + {"backslash", '\\'}, + {"reverse-solidus", '\\'}, + {"right-square-bracket", ']'}, + {"circumflex", '^'}, + {"circumflex-accent", '^'}, + {"underscore", '_'}, 
+ {"low-line", '_'}, + {"grave-accent", '`'}, + {"left-brace", '{'}, + {"left-curly-bracket", '{'}, + {"vertical-line", '|'}, + {"right-brace", '}'}, + {"right-curly-bracket", '}'}, + {"tilde", '~'}, + {"DEL", '\177'}, + {NULL, 0} +}; + +/* + * Unicode character-class tables. + */ + +typedef struct crange { + chr start; + chr end; +} crange; + +#if defined(REGEX_STANDALONE) && ! defined(REGEX_WCHAR) + +static const crange alphaRangeTable[] = { + {0x41, 0x5a}, {0x61, 0x7a} +}; + +#define NUM_ALPHA_RANGE (sizeof(alphaRangeTable)/sizeof(crange)) + +static const chr alphaCharTable[] = { +}; + +#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(chr)) + +static const crange digitRangeTable[] = { + {0x30, 0x39} +}; + +#define NUM_DIGIT_RANGE (sizeof(digitRangeTable)/sizeof(crange)) + +static const crange punctRangeTable[] = { + {0x21, 0x23}, {0x25, 0x2a}, {0x2c, 0x2f}, {0x5b, 0x5d}, +}; + +#define NUM_PUNCT_RANGE (sizeof(punctRangeTable)/sizeof(crange)) + +static const chr punctCharTable[] = { + 0x3a, 0x3b, 0x3f, 0x40, 0x5f, 0x7b, 0x7d +}; + +#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(chr)) + +static const crange spaceRangeTable[] = { + {0x09, 0x0d} +}; + +#define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange)) + +static const chr spaceCharTable[] = { + 0x20 +}; + +#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr)) + +static const crange lowerRangeTable[] = { + {0x61, 0x7a} +}; + +#define NUM_LOWER_RANGE (sizeof(lowerRangeTable)/sizeof(crange)) + +static const chr lowerCharTable[] = { +}; + +#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(chr)) + +static const crange upperRangeTable[] = { + {0x41, 0x5a} +}; + +#define NUM_UPPER_RANGE (sizeof(upperRangeTable)/sizeof(crange)) + +static const chr upperCharTable[] = { +}; + +#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(chr)) + +static const crange graphRangeTable[] = { + {0x21, 0x7e} +}; + +#define NUM_GRAPH_RANGE (sizeof(graphRangeTable)/sizeof(crange)) + +static const chr 
graphCharTable[] = { +}; + +#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(chr)) + +static const crange printRangeTable[] = { + {0x20, 0x7E} +}; + +#define NUM_PRINT_RANGE (sizeof(printRangeTable)/sizeof(crange)) + +static const chr printCharTable[] = { +}; + +#define NUM_PRINT_CHAR (sizeof(printCharTable)/sizeof(chr)) +#else + +/* + * Declarations of Unicode character ranges. This code + * is automatically generated by the tools/uniClass.tcl script + * and used in generic/regc_locale.c. Do not modify by hand. + */ + +/* Unicode: alphabetic characters */ + +static const crange alphaRangeTable[] = { + {0x0041, 0x005a}, {0x0061, 0x007a}, {0x00c0, 0x00d6}, {0x00d8, 0x00f6}, + {0x00f8, 0x021f}, {0x0222, 0x0233}, {0x0250, 0x02ad}, {0x02b0, 0x02b8}, + {0x02bb, 0x02c1}, {0x02e0, 0x02e4}, {0x0388, 0x038a}, {0x038e, 0x03a1}, + {0x03a3, 0x03ce}, {0x03d0, 0x03d7}, {0x03da, 0x03f5}, {0x0400, 0x0481}, + {0x048c, 0x04c4}, {0x04d0, 0x04f5}, {0x0531, 0x0556}, {0x0561, 0x0587}, + {0x05d0, 0x05ea}, {0x05f0, 0x05f2}, {0x0621, 0x063a}, {0x0640, 0x064a}, + {0x0671, 0x06d3}, {0x06fa, 0x06fc}, {0x0712, 0x072c}, {0x0780, 0x07a5}, + {0x0905, 0x0939}, {0x0958, 0x0961}, {0x0985, 0x098c}, {0x0993, 0x09a8}, + {0x09aa, 0x09b0}, {0x09b6, 0x09b9}, {0x09df, 0x09e1}, {0x0a05, 0x0a0a}, + {0x0a13, 0x0a28}, {0x0a2a, 0x0a30}, {0x0a59, 0x0a5c}, {0x0a72, 0x0a74}, + {0x0a85, 0x0a8b}, {0x0a8f, 0x0a91}, {0x0a93, 0x0aa8}, {0x0aaa, 0x0ab0}, + {0x0ab5, 0x0ab9}, {0x0b05, 0x0b0c}, {0x0b13, 0x0b28}, {0x0b2a, 0x0b30}, + {0x0b36, 0x0b39}, {0x0b5f, 0x0b61}, {0x0b85, 0x0b8a}, {0x0b8e, 0x0b90}, + {0x0b92, 0x0b95}, {0x0ba8, 0x0baa}, {0x0bae, 0x0bb5}, {0x0bb7, 0x0bb9}, + {0x0c05, 0x0c0c}, {0x0c0e, 0x0c10}, {0x0c12, 0x0c28}, {0x0c2a, 0x0c33}, + {0x0c35, 0x0c39}, {0x0c85, 0x0c8c}, {0x0c8e, 0x0c90}, {0x0c92, 0x0ca8}, + {0x0caa, 0x0cb3}, {0x0cb5, 0x0cb9}, {0x0d05, 0x0d0c}, {0x0d0e, 0x0d10}, + {0x0d12, 0x0d28}, {0x0d2a, 0x0d39}, {0x0d85, 0x0d96}, {0x0d9a, 0x0db1}, + {0x0db3, 0x0dbb}, {0x0dc0, 0x0dc6}, {0x0e01, 
0x0e30}, {0x0e40, 0x0e46}, + {0x0e94, 0x0e97}, {0x0e99, 0x0e9f}, {0x0ea1, 0x0ea3}, {0x0ead, 0x0eb0}, + {0x0ec0, 0x0ec4}, {0x0f40, 0x0f47}, {0x0f49, 0x0f6a}, {0x0f88, 0x0f8b}, + {0x1000, 0x1021}, {0x1023, 0x1027}, {0x1050, 0x1055}, {0x10a0, 0x10c5}, + {0x10d0, 0x10f6}, {0x1100, 0x1159}, {0x115f, 0x11a2}, {0x11a8, 0x11f9}, + {0x1200, 0x1206}, {0x1208, 0x1246}, {0x124a, 0x124d}, {0x1250, 0x1256}, + {0x125a, 0x125d}, {0x1260, 0x1286}, {0x128a, 0x128d}, {0x1290, 0x12ae}, + {0x12b2, 0x12b5}, {0x12b8, 0x12be}, {0x12c2, 0x12c5}, {0x12c8, 0x12ce}, + {0x12d0, 0x12d6}, {0x12d8, 0x12ee}, {0x12f0, 0x130e}, {0x1312, 0x1315}, + {0x1318, 0x131e}, {0x1320, 0x1346}, {0x1348, 0x135a}, {0x13a0, 0x13f4}, + {0x1401, 0x166c}, {0x166f, 0x1676}, {0x1681, 0x169a}, {0x16a0, 0x16ea}, + {0x1780, 0x17b3}, {0x1820, 0x1877}, {0x1880, 0x18a8}, {0x1e00, 0x1e9b}, + {0x1ea0, 0x1ef9}, {0x1f00, 0x1f15}, {0x1f18, 0x1f1d}, {0x1f20, 0x1f45}, + {0x1f48, 0x1f4d}, {0x1f50, 0x1f57}, {0x1f5f, 0x1f7d}, {0x1f80, 0x1fb4}, + {0x1fb6, 0x1fbc}, {0x1fc2, 0x1fc4}, {0x1fc6, 0x1fcc}, {0x1fd0, 0x1fd3}, + {0x1fd6, 0x1fdb}, {0x1fe0, 0x1fec}, {0x1ff2, 0x1ff4}, {0x1ff6, 0x1ffc}, + {0x210a, 0x2113}, {0x2119, 0x211d}, {0x212a, 0x212d}, {0x212f, 0x2131}, + {0x2133, 0x2139}, {0x3031, 0x3035}, {0x3041, 0x3094}, {0x30a1, 0x30fa}, + {0x30fc, 0x30fe}, {0x3105, 0x312c}, {0x3131, 0x318e}, {0x31a0, 0x31b7}, + {0x3400, 0x4db5}, {0x4e00, 0x9fa5}, {0xa000, 0xa48c}, {0xac00, 0xd7a3}, + {0xf900, 0xfa2d}, {0xfb00, 0xfb06}, {0xfb13, 0xfb17}, {0xfb1f, 0xfb28}, + {0xfb2a, 0xfb36}, {0xfb38, 0xfb3c}, {0xfb46, 0xfbb1}, {0xfbd3, 0xfd3d}, + {0xfd50, 0xfd8f}, {0xfd92, 0xfdc7}, {0xfdf0, 0xfdfb}, {0xfe70, 0xfe72}, + {0xfe76, 0xfefc}, {0xff21, 0xff3a}, {0xff41, 0xff5a}, {0xff66, 0xffbe}, + {0xffc2, 0xffc7}, {0xffca, 0xffcf}, {0xffd2, 0xffd7}, {0xffda, 0xffdc} +}; + +#define NUM_ALPHA_RANGE (sizeof(alphaRangeTable)/sizeof(crange)) + +static const chr alphaCharTable[] = { + 0x00aa, 0x00b5, 0x00ba, 0x02d0, 0x02d1, 0x02ee, 0x037a, 0x0386, 0x038c, + 0x04c7, 
0x04c8, 0x04cb, 0x04cc, 0x04f8, 0x04f9, 0x0559, 0x06d5, 0x06e5, + 0x06e6, 0x0710, 0x093d, 0x0950, 0x098f, 0x0990, 0x09b2, 0x09dc, 0x09dd, + 0x09f0, 0x09f1, 0x0a0f, 0x0a10, 0x0a32, 0x0a33, 0x0a35, 0x0a36, 0x0a38, + 0x0a39, 0x0a5e, 0x0a8d, 0x0ab2, 0x0ab3, 0x0abd, 0x0ad0, 0x0ae0, 0x0b0f, + 0x0b10, 0x0b32, 0x0b33, 0x0b3d, 0x0b5c, 0x0b5d, 0x0b99, 0x0b9a, 0x0b9c, + 0x0b9e, 0x0b9f, 0x0ba3, 0x0ba4, 0x0c60, 0x0c61, 0x0cde, 0x0ce0, 0x0ce1, + 0x0d60, 0x0d61, 0x0dbd, 0x0e32, 0x0e33, 0x0e81, 0x0e82, 0x0e84, 0x0e87, + 0x0e88, 0x0e8a, 0x0e8d, 0x0ea5, 0x0ea7, 0x0eaa, 0x0eab, 0x0eb2, 0x0eb3, + 0x0ebd, 0x0ec6, 0x0edc, 0x0edd, 0x0f00, 0x1029, 0x102a, 0x1248, 0x1258, + 0x1288, 0x12b0, 0x12c0, 0x1310, 0x1f59, 0x1f5b, 0x1f5d, 0x1fbe, 0x207f, + 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x3005, 0x3006, 0x309d, + 0x309e, 0xfb1d, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44, 0xfe74, 0xfffe +}; + +#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(chr)) + +/* + * Unicode: decimal digit characters + */ + +static const crange digitRangeTable[] = { + {0x0030, 0x0039}, {0x0660, 0x0669}, {0x06f0, 0x06f9}, {0x0966, 0x096f}, + {0x09e6, 0x09ef}, {0x0a66, 0x0a6f}, {0x0ae6, 0x0aef}, {0x0b66, 0x0b6f}, + {0x0be7, 0x0bef}, {0x0c66, 0x0c6f}, {0x0ce6, 0x0cef}, {0x0d66, 0x0d6f}, + {0x0e50, 0x0e59}, {0x0ed0, 0x0ed9}, {0x0f20, 0x0f29}, {0x1040, 0x1049}, + {0x1369, 0x1371}, {0x17e0, 0x17e9}, {0x1810, 0x1819}, {0xff10, 0xff19} +}; + +#define NUM_DIGIT_RANGE (sizeof(digitRangeTable)/sizeof(crange)) + +/* + * no singletons of digit characters. + */ + +/* + * Unicode: punctuation characters. 
+ */ + +static const crange punctRangeTable[] = { + {0x0021, 0x0023}, {0x0025, 0x002a}, {0x002c, 0x002f}, {0x005b, 0x005d}, + {0x055a, 0x055f}, {0x066a, 0x066d}, {0x0700, 0x070d}, {0x0f04, 0x0f12}, + {0x0f3a, 0x0f3d}, {0x104a, 0x104f}, {0x1361, 0x1368}, {0x16eb, 0x16ed}, + {0x17d4, 0x17da}, {0x1800, 0x180a}, {0x2010, 0x2027}, {0x2030, 0x2043}, + {0x2048, 0x204d}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301f}, + {0xfe30, 0xfe44}, {0xfe49, 0xfe52}, {0xfe54, 0xfe61}, {0xff01, 0xff03}, + {0xff05, 0xff0a}, {0xff0c, 0xff0f}, {0xff3b, 0xff3d}, {0xff61, 0xff65} +}; + +#define NUM_PUNCT_RANGE (sizeof(punctRangeTable)/sizeof(crange)) + +static const chr punctCharTable[] = { + 0x003a, 0x003b, 0x003f, 0x0040, 0x005f, 0x007b, 0x007d, 0x00a1, 0x00ab, + 0x00ad, 0x00b7, 0x00bb, 0x00bf, 0x037e, 0x0387, 0x0589, 0x058a, 0x05be, + 0x05c0, 0x05c3, 0x05f3, 0x05f4, 0x060c, 0x061b, 0x061f, 0x06d4, 0x0964, + 0x0965, 0x0970, 0x0df4, 0x0e4f, 0x0e5a, 0x0e5b, 0x0f85, 0x10fb, 0x166d, + 0x166e, 0x169b, 0x169c, 0x17dc, 0x2045, 0x2046, 0x207d, 0x207e, 0x208d, + 0x208e, 0x2329, 0x232a, 0x3030, 0x30fb, 0xfd3e, 0xfd3f, 0xfe63, 0xfe68, + 0xfe6a, 0xfe6b, 0xff1a, 0xff1b, 0xff1f, 0xff20, 0xff3f, 0xff5b, 0xff5d +}; + +#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(chr)) + +/* + * Unicode: white space characters. 
+ */ + +static const crange spaceRangeTable[] = { + {0x0009, 0x000d}, {0x2000, 0x200b} +}; + +#define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange)) + +static const chr spaceCharTable[] = { + 0x0020, 0x00a0, 0x1680, 0x2028, 0x2029, 0x202f, 0x3000 +}; + +#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr)) + +/* + * Unicode: lowercase characters + */ + +static const crange lowerRangeTable[] = { + {0x0061, 0x007a}, {0x00df, 0x00f6}, {0x00f8, 0x00ff}, {0x017e, 0x0180}, + {0x0199, 0x019b}, {0x01bd, 0x01bf}, {0x0250, 0x02ad}, {0x03ac, 0x03ce}, + {0x03d5, 0x03d7}, {0x03ef, 0x03f3}, {0x0430, 0x045f}, {0x0561, 0x0587}, + {0x1e95, 0x1e9b}, {0x1f00, 0x1f07}, {0x1f10, 0x1f15}, {0x1f20, 0x1f27}, + {0x1f30, 0x1f37}, {0x1f40, 0x1f45}, {0x1f50, 0x1f57}, {0x1f60, 0x1f67}, + {0x1f70, 0x1f7d}, {0x1f80, 0x1f87}, {0x1f90, 0x1f97}, {0x1fa0, 0x1fa7}, + {0x1fb0, 0x1fb4}, {0x1fc2, 0x1fc4}, {0x1fd0, 0x1fd3}, {0x1fe0, 0x1fe7}, + {0x1ff2, 0x1ff4}, {0xfb00, 0xfb06}, {0xfb13, 0xfb17}, {0xff41, 0xff5a} +}; + +#define NUM_LOWER_RANGE (sizeof(lowerRangeTable)/sizeof(crange)) + +static const chr lowerCharTable[] = { + 0x00aa, 0x00b5, 0x00ba, 0x0101, 0x0103, 0x0105, 0x0107, 0x0109, 0x010b, + 0x010d, 0x010f, 0x0111, 0x0113, 0x0115, 0x0117, 0x0119, 0x011b, 0x011d, + 0x011f, 0x0121, 0x0123, 0x0125, 0x0127, 0x0129, 0x012b, 0x012d, 0x012f, + 0x0131, 0x0133, 0x0135, 0x0137, 0x0138, 0x013a, 0x013c, 0x013e, 0x0140, + 0x0142, 0x0144, 0x0146, 0x0148, 0x0149, 0x014b, 0x014d, 0x014f, 0x0151, + 0x0153, 0x0155, 0x0157, 0x0159, 0x015b, 0x015d, 0x015f, 0x0161, 0x0163, + 0x0165, 0x0167, 0x0169, 0x016b, 0x016d, 0x016f, 0x0171, 0x0173, 0x0175, + 0x0177, 0x017a, 0x017c, 0x0183, 0x0185, 0x0188, 0x018c, 0x018d, 0x0192, + 0x0195, 0x019e, 0x01a1, 0x01a3, 0x01a5, 0x01a8, 0x01aa, 0x01ab, 0x01ad, + 0x01b0, 0x01b4, 0x01b6, 0x01b9, 0x01ba, 0x01c6, 0x01c9, 0x01cc, 0x01ce, + 0x01d0, 0x01d2, 0x01d4, 0x01d6, 0x01d8, 0x01da, 0x01dc, 0x01dd, 0x01df, + 0x01e1, 0x01e3, 0x01e5, 0x01e7, 0x01e9, 0x01eb, 0x01ed, 0x01ef, 
0x01f0, + 0x01f3, 0x01f5, 0x01f9, 0x01fb, 0x01fd, 0x01ff, 0x0201, 0x0203, 0x0205, + 0x0207, 0x0209, 0x020b, 0x020d, 0x020f, 0x0211, 0x0213, 0x0215, 0x0217, + 0x0219, 0x021b, 0x021d, 0x021f, 0x0223, 0x0225, 0x0227, 0x0229, 0x022b, + 0x022d, 0x022f, 0x0231, 0x0233, 0x0390, 0x03d0, 0x03d1, 0x03db, 0x03dd, + 0x03df, 0x03e1, 0x03e3, 0x03e5, 0x03e7, 0x03e9, 0x03eb, 0x03ed, 0x03f5, + 0x0461, 0x0463, 0x0465, 0x0467, 0x0469, 0x046b, 0x046d, 0x046f, 0x0471, + 0x0473, 0x0475, 0x0477, 0x0479, 0x047b, 0x047d, 0x047f, 0x0481, 0x048d, + 0x048f, 0x0491, 0x0493, 0x0495, 0x0497, 0x0499, 0x049b, 0x049d, 0x049f, + 0x04a1, 0x04a3, 0x04a5, 0x04a7, 0x04a9, 0x04ab, 0x04ad, 0x04af, 0x04b1, + 0x04b3, 0x04b5, 0x04b7, 0x04b9, 0x04bb, 0x04bd, 0x04bf, 0x04c2, 0x04c4, + 0x04c8, 0x04cc, 0x04d1, 0x04d3, 0x04d5, 0x04d7, 0x04d9, 0x04db, 0x04dd, + 0x04df, 0x04e1, 0x04e3, 0x04e5, 0x04e7, 0x04e9, 0x04eb, 0x04ed, 0x04ef, + 0x04f1, 0x04f3, 0x04f5, 0x04f9, 0x1e01, 0x1e03, 0x1e05, 0x1e07, 0x1e09, + 0x1e0b, 0x1e0d, 0x1e0f, 0x1e11, 0x1e13, 0x1e15, 0x1e17, 0x1e19, 0x1e1b, + 0x1e1d, 0x1e1f, 0x1e21, 0x1e23, 0x1e25, 0x1e27, 0x1e29, 0x1e2b, 0x1e2d, + 0x1e2f, 0x1e31, 0x1e33, 0x1e35, 0x1e37, 0x1e39, 0x1e3b, 0x1e3d, 0x1e3f, + 0x1e41, 0x1e43, 0x1e45, 0x1e47, 0x1e49, 0x1e4b, 0x1e4d, 0x1e4f, 0x1e51, + 0x1e53, 0x1e55, 0x1e57, 0x1e59, 0x1e5b, 0x1e5d, 0x1e5f, 0x1e61, 0x1e63, + 0x1e65, 0x1e67, 0x1e69, 0x1e6b, 0x1e6d, 0x1e6f, 0x1e71, 0x1e73, 0x1e75, + 0x1e77, 0x1e79, 0x1e7b, 0x1e7d, 0x1e7f, 0x1e81, 0x1e83, 0x1e85, 0x1e87, + 0x1e89, 0x1e8b, 0x1e8d, 0x1e8f, 0x1e91, 0x1e93, 0x1ea1, 0x1ea3, 0x1ea5, + 0x1ea7, 0x1ea9, 0x1eab, 0x1ead, 0x1eaf, 0x1eb1, 0x1eb3, 0x1eb5, 0x1eb7, + 0x1eb9, 0x1ebb, 0x1ebd, 0x1ebf, 0x1ec1, 0x1ec3, 0x1ec5, 0x1ec7, 0x1ec9, + 0x1ecb, 0x1ecd, 0x1ecf, 0x1ed1, 0x1ed3, 0x1ed5, 0x1ed7, 0x1ed9, 0x1edb, + 0x1edd, 0x1edf, 0x1ee1, 0x1ee3, 0x1ee5, 0x1ee7, 0x1ee9, 0x1eeb, 0x1eed, + 0x1eef, 0x1ef1, 0x1ef3, 0x1ef5, 0x1ef7, 0x1ef9, 0x1fb6, 0x1fb7, 0x1fbe, + 0x1fc6, 0x1fc7, 0x1fd6, 0x1fd7, 0x1ff6, 0x1ff7, 0x207f, 0x210a, 
0x210e, + 0x210f, 0x2113, 0x212f, 0x2134, 0x2139 +}; + +#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(chr)) + +/* + * Unicode: uppercase characters. + */ + +static const crange upperRangeTable[] = { + {0x0041, 0x005a}, {0x00c0, 0x00d6}, {0x00d8, 0x00de}, {0x0189, 0x018b}, + {0x018e, 0x0191}, {0x0196, 0x0198}, {0x01b1, 0x01b3}, {0x01f6, 0x01f8}, + {0x0388, 0x038a}, {0x0391, 0x03a1}, {0x03a3, 0x03ab}, {0x03d2, 0x03d4}, + {0x0400, 0x042f}, {0x0531, 0x0556}, {0x10a0, 0x10c5}, {0x1f08, 0x1f0f}, + {0x1f18, 0x1f1d}, {0x1f28, 0x1f2f}, {0x1f38, 0x1f3f}, {0x1f48, 0x1f4d}, + {0x1f68, 0x1f6f}, {0x1fb8, 0x1fbb}, {0x1fc8, 0x1fcb}, {0x1fd8, 0x1fdb}, + {0x1fe8, 0x1fec}, {0x1ff8, 0x1ffb}, {0x210b, 0x210d}, {0x2110, 0x2112}, + {0x2119, 0x211d}, {0x212a, 0x212d}, {0xff21, 0xff3a} +}; + +#define NUM_UPPER_RANGE (sizeof(upperRangeTable)/sizeof(crange)) + +static const chr upperCharTable[] = { + 0x0100, 0x0102, 0x0104, 0x0106, 0x0108, 0x010a, 0x010c, 0x010e, 0x0110, + 0x0112, 0x0114, 0x0116, 0x0118, 0x011a, 0x011c, 0x011e, 0x0120, 0x0122, + 0x0124, 0x0126, 0x0128, 0x012a, 0x012c, 0x012e, 0x0130, 0x0132, 0x0134, + 0x0136, 0x0139, 0x013b, 0x013d, 0x013f, 0x0141, 0x0143, 0x0145, 0x0147, + 0x014a, 0x014c, 0x014e, 0x0150, 0x0152, 0x0154, 0x0156, 0x0158, 0x015a, + 0x015c, 0x015e, 0x0160, 0x0162, 0x0164, 0x0166, 0x0168, 0x016a, 0x016c, + 0x016e, 0x0170, 0x0172, 0x0174, 0x0176, 0x0178, 0x0179, 0x017b, 0x017d, + 0x0181, 0x0182, 0x0184, 0x0186, 0x0187, 0x0193, 0x0194, 0x019c, 0x019d, + 0x019f, 0x01a0, 0x01a2, 0x01a4, 0x01a6, 0x01a7, 0x01a9, 0x01ac, 0x01ae, + 0x01af, 0x01b5, 0x01b7, 0x01b8, 0x01bc, 0x01c4, 0x01c7, 0x01ca, 0x01cd, + 0x01cf, 0x01d1, 0x01d3, 0x01d5, 0x01d7, 0x01d9, 0x01db, 0x01de, 0x01e0, + 0x01e2, 0x01e4, 0x01e6, 0x01e8, 0x01ea, 0x01ec, 0x01ee, 0x01f1, 0x01f4, + 0x01fa, 0x01fc, 0x01fe, 0x0200, 0x0202, 0x0204, 0x0206, 0x0208, 0x020a, + 0x020c, 0x020e, 0x0210, 0x0212, 0x0214, 0x0216, 0x0218, 0x021a, 0x021c, + 0x021e, 0x0222, 0x0224, 0x0226, 0x0228, 0x022a, 0x022c, 0x022e, 
0x0230, + 0x0232, 0x0386, 0x038c, 0x038e, 0x038f, 0x03da, 0x03dc, 0x03de, 0x03e0, + 0x03e2, 0x03e4, 0x03e6, 0x03e8, 0x03ea, 0x03ec, 0x03ee, 0x03f4, 0x0460, + 0x0462, 0x0464, 0x0466, 0x0468, 0x046a, 0x046c, 0x046e, 0x0470, 0x0472, + 0x0474, 0x0476, 0x0478, 0x047a, 0x047c, 0x047e, 0x0480, 0x048c, 0x048e, + 0x0490, 0x0492, 0x0494, 0x0496, 0x0498, 0x049a, 0x049c, 0x049e, 0x04a0, + 0x04a2, 0x04a4, 0x04a6, 0x04a8, 0x04aa, 0x04ac, 0x04ae, 0x04b0, 0x04b2, + 0x04b4, 0x04b6, 0x04b8, 0x04ba, 0x04bc, 0x04be, 0x04c0, 0x04c1, 0x04c3, + 0x04c7, 0x04cb, 0x04d0, 0x04d2, 0x04d4, 0x04d6, 0x04d8, 0x04da, 0x04dc, + 0x04de, 0x04e0, 0x04e2, 0x04e4, 0x04e6, 0x04e8, 0x04ea, 0x04ec, 0x04ee, + 0x04f0, 0x04f2, 0x04f4, 0x04f8, 0x1e00, 0x1e02, 0x1e04, 0x1e06, 0x1e08, + 0x1e0a, 0x1e0c, 0x1e0e, 0x1e10, 0x1e12, 0x1e14, 0x1e16, 0x1e18, 0x1e1a, + 0x1e1c, 0x1e1e, 0x1e20, 0x1e22, 0x1e24, 0x1e26, 0x1e28, 0x1e2a, 0x1e2c, + 0x1e2e, 0x1e30, 0x1e32, 0x1e34, 0x1e36, 0x1e38, 0x1e3a, 0x1e3c, 0x1e3e, + 0x1e40, 0x1e42, 0x1e44, 0x1e46, 0x1e48, 0x1e4a, 0x1e4c, 0x1e4e, 0x1e50, + 0x1e52, 0x1e54, 0x1e56, 0x1e58, 0x1e5a, 0x1e5c, 0x1e5e, 0x1e60, 0x1e62, + 0x1e64, 0x1e66, 0x1e68, 0x1e6a, 0x1e6c, 0x1e6e, 0x1e70, 0x1e72, 0x1e74, + 0x1e76, 0x1e78, 0x1e7a, 0x1e7c, 0x1e7e, 0x1e80, 0x1e82, 0x1e84, 0x1e86, + 0x1e88, 0x1e8a, 0x1e8c, 0x1e8e, 0x1e90, 0x1e92, 0x1e94, 0x1ea0, 0x1ea2, + 0x1ea4, 0x1ea6, 0x1ea8, 0x1eaa, 0x1eac, 0x1eae, 0x1eb0, 0x1eb2, 0x1eb4, + 0x1eb6, 0x1eb8, 0x1eba, 0x1ebc, 0x1ebe, 0x1ec0, 0x1ec2, 0x1ec4, 0x1ec6, + 0x1ec8, 0x1eca, 0x1ecc, 0x1ece, 0x1ed0, 0x1ed2, 0x1ed4, 0x1ed6, 0x1ed8, + 0x1eda, 0x1edc, 0x1ede, 0x1ee0, 0x1ee2, 0x1ee4, 0x1ee6, 0x1ee8, 0x1eea, + 0x1eec, 0x1eee, 0x1ef0, 0x1ef2, 0x1ef4, 0x1ef6, 0x1ef8, 0x1f59, 0x1f5b, + 0x1f5d, 0x1f5f, 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x2130, + 0x2131, 0x2133 +}; + +#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(chr)) + +/* + * Unicode: unicode print characters excluding space. 
+ */ + +static const crange graphRangeTable[] = { + {0x0021, 0x007e}, {0x00a0, 0x011f}, {0x0121, 0x021f}, {0x0222, 0x0233}, + {0x0250, 0x02ad}, {0x02b0, 0x02ee}, {0x0300, 0x031f}, {0x0321, 0x034e}, + {0x0360, 0x0362}, {0x0384, 0x038a}, {0x038e, 0x03a1}, {0x03a3, 0x03ce}, + {0x03d0, 0x03d7}, {0x03da, 0x03f5}, {0x0400, 0x041f}, {0x0421, 0x0486}, + {0x048c, 0x04c4}, {0x04d0, 0x04f5}, {0x0531, 0x0556}, {0x0559, 0x055f}, + {0x0561, 0x0587}, {0x0591, 0x05a1}, {0x05a3, 0x05b9}, {0x05bb, 0x05c4}, + {0x05d0, 0x05ea}, {0x05f0, 0x05f4}, {0x0621, 0x063a}, {0x0640, 0x0655}, + {0x0660, 0x066d}, {0x0670, 0x06ed}, {0x06f0, 0x06fe}, {0x0700, 0x070d}, + {0x0710, 0x071f}, {0x0721, 0x072c}, {0x0730, 0x074a}, {0x0780, 0x07b0}, + {0x0901, 0x0903}, {0x0905, 0x091f}, {0x0921, 0x0939}, {0x093c, 0x094d}, + {0x0950, 0x0954}, {0x0958, 0x0970}, {0x0981, 0x0983}, {0x0985, 0x098c}, + {0x0993, 0x09a8}, {0x09aa, 0x09b0}, {0x09b6, 0x09b9}, {0x09be, 0x09c4}, + {0x09cb, 0x09cd}, {0x09df, 0x09e3}, {0x09e6, 0x09fa}, {0x0a05, 0x0a0a}, + {0x0a13, 0x0a1f}, {0x0a21, 0x0a28}, {0x0a2a, 0x0a30}, {0x0a3e, 0x0a42}, + {0x0a4b, 0x0a4d}, {0x0a59, 0x0a5c}, {0x0a66, 0x0a74}, {0x0a81, 0x0a83}, + {0x0a85, 0x0a8b}, {0x0a8f, 0x0a91}, {0x0a93, 0x0aa8}, {0x0aaa, 0x0ab0}, + {0x0ab5, 0x0ab9}, {0x0abc, 0x0ac5}, {0x0ac7, 0x0ac9}, {0x0acb, 0x0acd}, + {0x0ae6, 0x0aef}, {0x0b01, 0x0b03}, {0x0b05, 0x0b0c}, {0x0b13, 0x0b1f}, + {0x0b21, 0x0b28}, {0x0b2a, 0x0b30}, {0x0b36, 0x0b39}, {0x0b3c, 0x0b43}, + {0x0b4b, 0x0b4d}, {0x0b5f, 0x0b61}, {0x0b66, 0x0b70}, {0x0b85, 0x0b8a}, + {0x0b8e, 0x0b90}, {0x0b92, 0x0b95}, {0x0ba8, 0x0baa}, {0x0bae, 0x0bb5}, + {0x0bb7, 0x0bb9}, {0x0bbe, 0x0bc2}, {0x0bc6, 0x0bc8}, {0x0bca, 0x0bcd}, + {0x0be7, 0x0bf2}, {0x0c01, 0x0c03}, {0x0c05, 0x0c0c}, {0x0c0e, 0x0c10}, + {0x0c12, 0x0c1f}, {0x0c21, 0x0c28}, {0x0c2a, 0x0c33}, {0x0c35, 0x0c39}, + {0x0c3e, 0x0c44}, {0x0c46, 0x0c48}, {0x0c4a, 0x0c4d}, {0x0c66, 0x0c6f}, + {0x0c85, 0x0c8c}, {0x0c8e, 0x0c90}, {0x0c92, 0x0ca8}, {0x0caa, 0x0cb3}, + {0x0cb5, 0x0cb9}, 
{0x0cbe, 0x0cc4}, {0x0cc6, 0x0cc8}, {0x0cca, 0x0ccd}, + {0x0ce6, 0x0cef}, {0x0d05, 0x0d0c}, {0x0d0e, 0x0d10}, {0x0d12, 0x0d1f}, + {0x0d21, 0x0d28}, {0x0d2a, 0x0d39}, {0x0d3e, 0x0d43}, {0x0d46, 0x0d48}, + {0x0d4a, 0x0d4d}, {0x0d66, 0x0d6f}, {0x0d85, 0x0d96}, {0x0d9a, 0x0db1}, + {0x0db3, 0x0dbb}, {0x0dc0, 0x0dc6}, {0x0dcf, 0x0dd4}, {0x0dd8, 0x0ddf}, + {0x0df2, 0x0df4}, {0x0e01, 0x0e1f}, {0x0e21, 0x0e3a}, {0x0e3f, 0x0e5b}, + {0x0e94, 0x0e97}, {0x0e99, 0x0e9f}, {0x0ea1, 0x0ea3}, {0x0ead, 0x0eb9}, + {0x0ebb, 0x0ebd}, {0x0ec0, 0x0ec4}, {0x0ec8, 0x0ecd}, {0x0ed0, 0x0ed9}, + {0x0f00, 0x0f1f}, {0x0f21, 0x0f47}, {0x0f49, 0x0f6a}, {0x0f71, 0x0f8b}, + {0x0f90, 0x0f97}, {0x0f99, 0x0fbc}, {0x0fbe, 0x0fcc}, {0x1000, 0x101f}, + {0x1023, 0x1027}, {0x102c, 0x1032}, {0x1036, 0x1039}, {0x1040, 0x1059}, + {0x10a0, 0x10c5}, {0x10d0, 0x10f6}, {0x1100, 0x111f}, {0x1121, 0x1159}, + {0x115f, 0x11a2}, {0x11a8, 0x11f9}, {0x1200, 0x1206}, {0x1208, 0x121f}, + {0x1221, 0x1246}, {0x124a, 0x124d}, {0x1250, 0x1256}, {0x125a, 0x125d}, + {0x1260, 0x1286}, {0x128a, 0x128d}, {0x1290, 0x12ae}, {0x12b2, 0x12b5}, + {0x12b8, 0x12be}, {0x12c2, 0x12c5}, {0x12c8, 0x12ce}, {0x12d0, 0x12d6}, + {0x12d8, 0x12ee}, {0x12f0, 0x130e}, {0x1312, 0x1315}, {0x1318, 0x131e}, + {0x1321, 0x1346}, {0x1348, 0x135a}, {0x1361, 0x137c}, {0x13a0, 0x13f4}, + {0x1401, 0x141f}, {0x1421, 0x151f}, {0x1521, 0x161f}, {0x1621, 0x1676}, + {0x1680, 0x169c}, {0x16a0, 0x16f0}, {0x1780, 0x17dc}, {0x17e0, 0x17e9}, + {0x1800, 0x180a}, {0x1810, 0x1819}, {0x1821, 0x1877}, {0x1880, 0x18a9}, + {0x1e00, 0x1e1f}, {0x1e21, 0x1e9b}, {0x1ea0, 0x1ef9}, {0x1f00, 0x1f15}, + {0x1f18, 0x1f1d}, {0x1f21, 0x1f45}, {0x1f48, 0x1f4d}, {0x1f50, 0x1f57}, + {0x1f5f, 0x1f7d}, {0x1f80, 0x1fb4}, {0x1fb6, 0x1fc4}, {0x1fc6, 0x1fd3}, + {0x1fd6, 0x1fdb}, {0x1fdd, 0x1fef}, {0x1ff2, 0x1ff4}, {0x1ff6, 0x1ffe}, + {0x2000, 0x200b}, {0x2010, 0x201f}, {0x2021, 0x2029}, {0x202f, 0x2046}, + {0x2048, 0x204d}, {0x2074, 0x208e}, {0x20a0, 0x20af}, {0x20d0, 0x20e3}, + {0x2100, 0x211f}, 
{0x2121, 0x213a}, {0x2153, 0x2183}, {0x2190, 0x21f3}, + {0x2200, 0x221f}, {0x2221, 0x22f1}, {0x2300, 0x231f}, {0x2321, 0x237b}, + {0x237d, 0x239a}, {0x2400, 0x241f}, {0x2421, 0x2426}, {0x2440, 0x244a}, + {0x2460, 0x24ea}, {0x2500, 0x251f}, {0x2521, 0x2595}, {0x25a0, 0x25f7}, + {0x2600, 0x2613}, {0x2619, 0x261f}, {0x2621, 0x2671}, {0x2701, 0x2704}, + {0x2706, 0x2709}, {0x270c, 0x271f}, {0x2721, 0x2727}, {0x2729, 0x274b}, + {0x274f, 0x2752}, {0x2758, 0x275e}, {0x2761, 0x2767}, {0x2776, 0x2794}, + {0x2798, 0x27af}, {0x27b1, 0x27be}, {0x2800, 0x281f}, {0x2821, 0x28ff}, + {0x2e80, 0x2e99}, {0x2e9b, 0x2ef3}, {0x2f00, 0x2f1f}, {0x2f21, 0x2fd5}, + {0x2ff0, 0x2ffb}, {0x3000, 0x301f}, {0x3021, 0x303a}, {0x3041, 0x3094}, + {0x3099, 0x309e}, {0x30a1, 0x30fe}, {0x3105, 0x311f}, {0x3121, 0x312c}, + {0x3131, 0x318e}, {0x3190, 0x31b7}, {0x3200, 0x321c}, {0x3221, 0x3243}, + {0x3260, 0x327b}, {0x327f, 0x32b0}, {0x32c0, 0x32cb}, {0x32d0, 0x32fe}, + {0x3300, 0x331f}, {0x3321, 0x3376}, {0x337b, 0x33dd}, {0x33e0, 0x33fe}, + {0x3400, 0x341f}, {0x3421, 0x351f}, {0x3521, 0x361f}, {0x3621, 0x371f}, + {0x3721, 0x381f}, {0x3821, 0x391f}, {0x3921, 0x3a1f}, {0x3a21, 0x3b1f}, + {0x3b21, 0x3c1f}, {0x3c21, 0x3d1f}, {0x3d21, 0x3e1f}, {0x3e21, 0x3f1f}, + {0x3f21, 0x401f}, {0x4021, 0x411f}, {0x4121, 0x421f}, {0x4221, 0x431f}, + {0x4321, 0x441f}, {0x4421, 0x451f}, {0x4521, 0x461f}, {0x4621, 0x471f}, + {0x4721, 0x481f}, {0x4821, 0x491f}, {0x4921, 0x4a1f}, {0x4a21, 0x4b1f}, + {0x4b21, 0x4c1f}, {0x4c21, 0x4d1f}, {0x4d21, 0x4db5}, {0x4e00, 0x4e1f}, + {0x4e21, 0x4f1f}, {0x4f21, 0x501f}, {0x5021, 0x511f}, {0x5121, 0x521f}, + {0x5221, 0x531f}, {0x5321, 0x541f}, {0x5421, 0x551f}, {0x5521, 0x561f}, + {0x5621, 0x571f}, {0x5721, 0x581f}, {0x5821, 0x591f}, {0x5921, 0x5a1f}, + {0x5a21, 0x5b1f}, {0x5b21, 0x5c1f}, {0x5c21, 0x5d1f}, {0x5d21, 0x5e1f}, + {0x5e21, 0x5f1f}, {0x5f21, 0x601f}, {0x6021, 0x611f}, {0x6121, 0x621f}, + {0x6221, 0x631f}, {0x6321, 0x641f}, {0x6421, 0x651f}, {0x6521, 0x661f}, + {0x6621, 0x671f}, 
{0x6721, 0x681f}, {0x6821, 0x691f}, {0x6921, 0x6a1f}, + {0x6a21, 0x6b1f}, {0x6b21, 0x6c1f}, {0x6c21, 0x6d1f}, {0x6d21, 0x6e1f}, + {0x6e21, 0x6f1f}, {0x6f21, 0x701f}, {0x7021, 0x711f}, {0x7121, 0x721f}, + {0x7221, 0x731f}, {0x7321, 0x741f}, {0x7421, 0x751f}, {0x7521, 0x761f}, + {0x7621, 0x771f}, {0x7721, 0x781f}, {0x7821, 0x791f}, {0x7921, 0x7a1f}, + {0x7a21, 0x7b1f}, {0x7b21, 0x7c1f}, {0x7c21, 0x7d1f}, {0x7d21, 0x7e1f}, + {0x7e21, 0x7f1f}, {0x7f21, 0x801f}, {0x8021, 0x811f}, {0x8121, 0x821f}, + {0x8221, 0x831f}, {0x8321, 0x841f}, {0x8421, 0x851f}, {0x8521, 0x861f}, + {0x8621, 0x871f}, {0x8721, 0x881f}, {0x8821, 0x891f}, {0x8921, 0x8a1f}, + {0x8a21, 0x8b1f}, {0x8b21, 0x8c1f}, {0x8c21, 0x8d1f}, {0x8d21, 0x8e1f}, + {0x8e21, 0x8f1f}, {0x8f21, 0x901f}, {0x9021, 0x911f}, {0x9121, 0x921f}, + {0x9221, 0x931f}, {0x9321, 0x941f}, {0x9421, 0x951f}, {0x9521, 0x961f}, + {0x9621, 0x971f}, {0x9721, 0x981f}, {0x9821, 0x991f}, {0x9921, 0x9a1f}, + {0x9a21, 0x9b1f}, {0x9b21, 0x9c1f}, {0x9c21, 0x9d1f}, {0x9d21, 0x9e1f}, + {0x9e21, 0x9f1f}, {0x9f21, 0x9fa5}, {0xa000, 0xa01f}, {0xa021, 0xa11f}, + {0xa121, 0xa21f}, {0xa221, 0xa31f}, {0xa321, 0xa41f}, {0xa421, 0xa48c}, + {0xa490, 0xa4a1}, {0xa4a4, 0xa4b3}, {0xa4b5, 0xa4c0}, {0xa4c2, 0xa4c4}, + {0xac00, 0xac1f}, {0xac21, 0xad1f}, {0xad21, 0xae1f}, {0xae21, 0xaf1f}, + {0xaf21, 0xb01f}, {0xb021, 0xb11f}, {0xb121, 0xb21f}, {0xb221, 0xb31f}, + {0xb321, 0xb41f}, {0xb421, 0xb51f}, {0xb521, 0xb61f}, {0xb621, 0xb71f}, + {0xb721, 0xb81f}, {0xb821, 0xb91f}, {0xb921, 0xba1f}, {0xba21, 0xbb1f}, + {0xbb21, 0xbc1f}, {0xbc21, 0xbd1f}, {0xbd21, 0xbe1f}, {0xbe21, 0xbf1f}, + {0xbf21, 0xc01f}, {0xc021, 0xc11f}, {0xc121, 0xc21f}, {0xc221, 0xc31f}, + {0xc321, 0xc41f}, {0xc421, 0xc51f}, {0xc521, 0xc61f}, {0xc621, 0xc71f}, + {0xc721, 0xc81f}, {0xc821, 0xc91f}, {0xc921, 0xca1f}, {0xca21, 0xcb1f}, + {0xcb21, 0xcc1f}, {0xcc21, 0xcd1f}, {0xcd21, 0xce1f}, {0xce21, 0xcf1f}, + {0xcf21, 0xd01f}, {0xd021, 0xd11f}, {0xd121, 0xd21f}, {0xd221, 0xd31f}, + {0xd321, 0xd41f}, 
{0xd421, 0xd51f}, {0xd521, 0xd61f}, {0xd621, 0xd71f}, + {0xd721, 0xd7a3}, {0xf900, 0xf91f}, {0xf921, 0xfa1f}, {0xfa21, 0xfa2d}, + {0xfb00, 0xfb06}, {0xfb13, 0xfb17}, {0xfb1d, 0xfb1f}, {0xfb21, 0xfb36}, + {0xfb38, 0xfb3c}, {0xfb46, 0xfbb1}, {0xfbd3, 0xfc1f}, {0xfc21, 0xfd1f}, + {0xfd21, 0xfd3f}, {0xfd50, 0xfd8f}, {0xfd92, 0xfdc7}, {0xfdf0, 0xfdfb}, + {0xfe21, 0xfe23}, {0xfe30, 0xfe44}, {0xfe49, 0xfe52}, {0xfe54, 0xfe66}, + {0xfe68, 0xfe6b}, {0xfe70, 0xfe72}, {0xfe76, 0xfefc}, {0xff01, 0xff1f}, + {0xff21, 0xff5e}, {0xff61, 0xffbe}, {0xffc2, 0xffc7}, {0xffca, 0xffcf}, + {0xffd2, 0xffd7}, {0xffda, 0xffdc}, {0xffe0, 0xffe6}, {0xffe8, 0xffee}, + {0xfffc, 0xffff} +}; + +#define NUM_GRAPH_RANGE (sizeof(graphRangeTable)/sizeof(crange)) + +static const chr graphCharTable[] = { + 0x0374, 0x0375, 0x037a, 0x037e, 0x038c, 0x0488, 0x0489, 0x04c7, 0x04c8, + 0x04cb, 0x04cc, 0x04f8, 0x04f9, 0x0589, 0x058a, 0x060c, 0x061b, 0x061f, + 0x098f, 0x0990, 0x09b2, 0x09bc, 0x09c7, 0x09c8, 0x09d7, 0x09dc, 0x09dd, + 0x0a02, 0x0a0f, 0x0a10, 0x0a32, 0x0a33, 0x0a35, 0x0a36, 0x0a38, 0x0a39, + 0x0a3c, 0x0a47, 0x0a48, 0x0a5e, 0x0a8d, 0x0ab2, 0x0ab3, 0x0ad0, 0x0ae0, + 0x0b0f, 0x0b10, 0x0b32, 0x0b33, 0x0b47, 0x0b48, 0x0b56, 0x0b57, 0x0b5c, + 0x0b5d, 0x0b82, 0x0b83, 0x0b99, 0x0b9a, 0x0b9c, 0x0b9e, 0x0b9f, 0x0ba3, + 0x0ba4, 0x0bd7, 0x0c55, 0x0c56, 0x0c60, 0x0c61, 0x0c82, 0x0c83, 0x0cd5, + 0x0cd6, 0x0cde, 0x0ce0, 0x0ce1, 0x0d02, 0x0d03, 0x0d57, 0x0d60, 0x0d61, + 0x0d82, 0x0d83, 0x0dbd, 0x0dca, 0x0dd6, 0x0e81, 0x0e82, 0x0e84, 0x0e87, + 0x0e88, 0x0e8a, 0x0e8d, 0x0ea5, 0x0ea7, 0x0eaa, 0x0eab, 0x0ec6, 0x0edc, + 0x0edd, 0x0fcf, 0x1021, 0x1029, 0x102a, 0x10fb, 0x1248, 0x1258, 0x1288, + 0x12b0, 0x12c0, 0x1310, 0x1f59, 0x1f5b, 0x1f5d, 0x2070, 0x274d, 0x2756, + 0x303e, 0x303f, 0xa4c6, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44, 0xfe74 +}; + +#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(chr)) + +/* + * Unicode: unicode print characters including space, i.e. 
all Letters (class + * L*), Numbers (N*), Punctuation (P*), Symbols (S*) and Spaces (Zs). + */ + +static const crange printRangeTable[] = { + {0x0020, 0x007E}, {0x00A0, 0x01F5}, {0x01FA, 0x0217}, {0x0250, 0x02A8}, + {0x02B0, 0x02DE}, {0x02E0, 0x02E9}, {0x0374, 0x0375}, {0x0384, 0x038A}, + {0x038E, 0x03A1}, {0x03A3, 0x03CE}, {0x03D0, 0x03D6}, {0x03E2, 0x03F3}, + {0x0401, 0x040C}, {0x040E, 0x044F}, {0x0451, 0x045C}, {0x045E, 0x0482}, + {0x0490, 0x04C4}, {0x04C7, 0x04C8}, {0x04CB, 0x04CC}, {0x04D0, 0x04EB}, + {0x04EE, 0x04F5}, {0x04F8, 0x04F9}, {0x0531, 0x0556}, {0x0559, 0x055F}, + {0x0561, 0x0587}, {0x05D0, 0x05EA}, {0x05F0, 0x05F4}, {0x0621, 0x063A}, + {0x0640, 0x064A}, {0x0660, 0x066D}, {0x0671, 0x06B7}, {0x06BA, 0x06BE}, + {0x06C0, 0x06CE}, {0x06D0, 0x06D5}, {0x06E5, 0x06E6}, {0x06F0, 0x06F9}, + {0x0905, 0x0939}, {0x0958, 0x0961}, {0x0964, 0x0970}, {0x0985, 0x098C}, + {0x098F, 0x0990}, {0x0993, 0x09A8}, {0x09AA, 0x09B0}, {0x09B6, 0x09B9}, + {0x09DC, 0x09DD}, {0x09DF, 0x09E1}, {0x09E6, 0x09FA}, {0x0A05, 0x0A0A}, + {0x0A0F, 0x0A10}, {0x0A13, 0x0A28}, {0x0A2A, 0x0A30}, {0x0A32, 0x0A33}, + {0x0A35, 0x0A36}, {0x0A38, 0x0A39}, {0x0A59, 0x0A5C}, {0x0A66, 0x0A6F}, + {0x0A72, 0x0A74}, {0x0A85, 0x0A8B}, {0x0A8F, 0x0A91}, {0x0A93, 0x0AA8}, + {0x0AAA, 0x0AB0}, {0x0AB2, 0x0AB3}, {0x0AB5, 0x0AB9}, {0x0AE6, 0x0AEF}, + {0x0B05, 0x0B0C}, {0x0B0F, 0x0B10}, {0x0B13, 0x0B28}, {0x0B2A, 0x0B30}, + {0x0B32, 0x0B33}, {0x0B36, 0x0B39}, {0x0B5C, 0x0B5D}, {0x0B5F, 0x0B61}, + {0x0B66, 0x0B70}, {0x0B85, 0x0B8A}, {0x0B8E, 0x0B90}, {0x0B92, 0x0B95}, + {0x0B99, 0x0B9A}, {0x0B9E, 0x0B9F}, {0x0BA3, 0x0BA4}, {0x0BA8, 0x0BAA}, + {0x0BAE, 0x0BB5}, {0x0BB7, 0x0BB9}, {0x0BE7, 0x0BF2}, {0x0C05, 0x0C0C}, + {0x0C0E, 0x0C10}, {0x0C12, 0x0C28}, {0x0C2A, 0x0C33}, {0x0C35, 0x0C39}, + {0x0C60, 0x0C61}, {0x0C66, 0x0C6F}, {0x0C85, 0x0C8C}, {0x0C8E, 0x0C90}, + {0x0C92, 0x0CA8}, {0x0CAA, 0x0CB3}, {0x0CB5, 0x0CB9}, {0x0CE0, 0x0CE1}, + {0x0CE6, 0x0CEF}, {0x0D05, 0x0D0C}, {0x0D0E, 0x0D10}, {0x0D12, 0x0D28}, + 
{0x0D2A, 0x0D39}, {0x0D60, 0x0D61}, {0x0D66, 0x0D6F}, {0x0E3F, 0x0E46}, + {0x0E4F, 0x0E5B}, {0x0E99, 0x0E9F}, {0x0EA1, 0x0EA3}, {0x0EAA, 0x0EAB}, + {0x0EAD, 0x0EB0}, {0x0EB2, 0x0EB3}, {0x0EC0, 0x0EC4}, {0x0ED0, 0x0ED9}, + {0x0EDC, 0x0EDD}, {0x0F00, 0x0F17}, {0x0F1A, 0x0F34}, {0x0F3A, 0x0F3D}, + {0x0F40, 0x0F47}, {0x0F49, 0x0F69}, {0x0F88, 0x0F8B}, {0x10A0, 0x10C5}, + {0x10D0, 0x10F6}, {0x1100, 0x1159}, {0x115F, 0x11A2}, {0x11A8, 0x11F9}, + {0x1E00, 0x1E9B}, {0x1EA0, 0x1EF9}, {0x1F00, 0x1F15}, {0x1F18, 0x1F1D}, + {0x1F20, 0x1F45}, {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F5F, 0x1F7D}, + {0x1F80, 0x1FB4}, {0x1FB6, 0x1FC4}, {0x1FC6, 0x1FD3}, {0x1FD6, 0x1FDB}, + {0x1FDD, 0x1FEF}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFE}, {0x2000, 0x200B}, + {0x2010, 0x2027}, {0x2030, 0x2046}, {0x2074, 0x208E}, {0x20A0, 0x20AC}, + {0x2100, 0x2138}, {0x2153, 0x2182}, {0x2190, 0x21EA}, {0x2200, 0x22F1}, + {0x2302, 0x237A}, {0x2400, 0x2424}, {0x2440, 0x244A}, {0x2460, 0x24EA}, + {0x2500, 0x2595}, {0x25A0, 0x25EF}, {0x2600, 0x2613}, {0x261A, 0x266F}, + {0x2701, 0x2704}, {0x2706, 0x2709}, {0x270C, 0x2727}, {0x2729, 0x274B}, + {0x274F, 0x2752}, {0x2758, 0x275E}, {0x2761, 0x2767}, {0x2776, 0x2794}, + {0x2798, 0x27AF}, {0x27B1, 0x27BE}, {0x3000, 0x3029}, {0x3030, 0x3037}, + {0x3041, 0x3094}, {0x309B, 0x309E}, {0x30A1, 0x30FE}, {0x3105, 0x312C}, + {0x3131, 0x318E}, {0x3190, 0x319F}, {0x3200, 0x321C}, {0x3220, 0x3243}, + {0x3260, 0x327B}, {0x327F, 0x32B0}, {0x32C0, 0x32CB}, {0x32D0, 0x32FE}, + {0x3300, 0x3376}, {0x337B, 0x33DD}, {0x33E0, 0x33FE}, {0x4E00, 0x9FA5}, + {0xAC00, 0xD7A3}, {0xF900, 0xFA2D}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, + {0xFB1F, 0xFB36}, {0xFB38, 0xFB3C}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44}, + {0xFB46, 0xFBB1}, {0xFBD3, 0xFD3F}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, + {0xFDF0, 0xFDFB}, {0xFE30, 0xFE44}, {0xFE49, 0xFE52}, {0xFE54, 0xFE66}, + {0xFE68, 0xFE6B}, {0xFE70, 0xFE72}, {0xFE76, 0xFEFC}, {0xFF01, 0xFF5E}, + {0xFF61, 0xFFBE}, {0xFFC2, 0xFFC7}, {0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, + 
{0xFFDA, 0xFFDC}, {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD} +}; + +#define NUM_PRINT_RANGE (sizeof(printRangeTable)/sizeof(crange)) + +static const chr printCharTable[] = { + 0x037A, 0x037E, 0x038C, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x0589, 0x05BE, + 0x05C0, 0x05C3, 0x060C, 0x061B, 0x061F, 0x06E9, 0x093D, 0x0950, 0x09B2, + 0x0A5E, 0x0A8D, 0x0ABD, 0x0AD0, 0x0AE0, 0x0B3D, 0x0B9C, 0x0CDE, 0x0E01, + 0x0E32, 0x0E81, 0x0E84, 0x0E87, 0x0E8A, 0x0E8D, 0x0E94, 0x0EA5, 0x0EA7, + 0x0EBD, 0x0EC6, 0x0F36, 0x0F38, 0x0F85, 0x10FB, 0x1F59, 0x1F5B, 0x1F5D, + 0x2070, 0x2300, 0x274D, 0x2756, 0x303F, 0xFB3E, 0xFE74 +}; + +#define NUM_PRINT_CHAR (sizeof(printCharTable)/sizeof(chr)) +#endif + +/* + * End of auto-generated Unicode character ranges declarations. + */ + +#define CH NOCELT + +/* + - element - map collating-element name to celt + ^ static celt element(struct vars *, const chr *, const chr *); + */ +static celt +element( + struct vars *v, /* context */ + const chr *startp, /* points to start of name */ + const chr *endp) /* points just past end of name */ +{ + const struct cname *cn; + size_t len; + Tcl_DString ds; + const char *np; + + /* + * Generic: one-chr names stand for themselves. + */ + + assert(startp < endp); + len = endp - startp; + if (len == 1) { + return *startp; + } + + NOTE(REG_ULOCALE); + + /* + * Search table. + */ + + Tcl_DStringInit(&ds); + np = Tcl_UniCharToUtfDString(startp, (int)len, &ds); + for (cn=cnames; cn->name!=NULL; cn++) { + if (strlen(cn->name)==len && strncmp(cn->name, np, len)==0) { + break; /* NOTE BREAK OUT */ + } + } + Tcl_DStringFree(&ds); + if (cn->name != NULL) { + return CHR(cn->code); + } + + /* + * Couldn't find it. 
+ */ + + ERR(REG_ECOLLATE); + return 0; +} + +/* + - range - supply cvec for a range, including legality check + ^ static struct cvec *range(struct vars *, celt, celt, int); + */ +static struct cvec * +range( + struct vars *v, /* context */ + celt a, /* range start */ + celt b, /* range end, might equal a */ + int cases) /* case-independent? */ +{ + int nchrs; + struct cvec *cv; + celt c, lc, uc, tc; + + if (a != b && !before(a, b)) { + ERR(REG_ERANGE); + return NULL; + } + + if (!cases) { /* easy version */ + cv = getcvec(v, 0, 1); + NOERRN(); + addrange(cv, a, b); + return cv; + } + + /* + * When case-independent, it's hard to decide when cvec ranges are usable, + * so for now at least, we won't try. We allocate enough space for two + * case variants plus a little extra for the two title case variants. + */ + + nchrs = (b - a + 1)*2 + 4; + + cv = getcvec(v, nchrs, 0); + NOERRN(); + + for (c=a; c<=b; c++) { + addchr(cv, c); + lc = Tcl_UniCharToLower((chr)c); + uc = Tcl_UniCharToUpper((chr)c); + tc = Tcl_UniCharToTitle((chr)c); + if (c != lc) { + addchr(cv, lc); + } + if (c != uc) { + addchr(cv, uc); + } + if (c != tc && tc != uc) { + addchr(cv, tc); + } + } + + return cv; +} + +/* + - before - is celt x before celt y, for purposes of range legality? + ^ static int before(celt, celt); + */ +static int /* predicate */ +before( + celt x, celt y) /* collating elements */ +{ + if (x < y) { + return 1; + } + return 0; +} + +/* + - eclass - supply cvec for an equivalence class + * Must include case counterparts on request. + ^ static struct cvec *eclass(struct vars *, celt, int); + */ +static struct cvec * +eclass( + struct vars *v, /* context */ + celt c, /* Collating element representing the + * equivalence class. */ + int cases) /* all cases? */ +{ + struct cvec *cv; + + /* + * Crude fake equivalence class for testing. 
+ */ + + if ((v->cflags&REG_FAKE) && c == 'x') { /* REG_FAKE test mode: the class of 'x' is x, y (plus X, Y when cases) */ + cv = getcvec(v, 4, 0); + addchr(cv, (chr)'x'); + addchr(cv, (chr)'y'); + if (cases) { + addchr(cv, (chr)'X'); + addchr(cv, (chr)'Y'); + } + return cv; + } + + /* + * Otherwise, none. + */ + + if (cases) { + return allcases(v, c); + } + cv = getcvec(v, 1, 0); + assert(cv != NULL); + addchr(cv, (chr)c); + return cv; +} + +/* + - cclass - supply cvec for a character class + * Must include case counterparts on request. + ^ static struct cvec *cclass(struct vars *, const chr *, const chr *, int); + */ +static struct cvec * +cclass( + struct vars *v, /* context */ + const chr *startp, /* where the name starts */ + const chr *endp, /* just past the end of the name */ + int cases) /* case-independent? */ +{ + size_t len; + struct cvec *cv = NULL; + Tcl_DString ds; + const char *np; + const char **namePtr; + int i, index; + + /* + * The following arrays define the valid character class names. + */ + + static const char *classNames[] = { + "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", + "lower", "print", "punct", "space", "upper", "xdigit", NULL + }; + + enum classes { + CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, + CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT + }; + + + /* + * Extract the class name + */ + + len = endp - startp; + Tcl_DStringInit(&ds); + np = Tcl_UniCharToUtfDString(startp, (int)len, &ds); + + /* + * Remap lower and upper to alpha if the match is case insensitive. + */ + + if (cases && len == 5 && (strncmp("lower", np, 5) == 0 + || strncmp("upper", np, 5) == 0)) { + np = "alpha"; + } + + /* + * Map the name to the corresponding enumerated value.
+ */ + + index = -1; + for (namePtr=classNames,i=0 ; *namePtr!=NULL ; namePtr++,i++) { + if ((strlen(*namePtr) == len) && (strncmp(*namePtr, np, len) == 0)) { + index = i; + break; + } + } + Tcl_DStringFree(&ds); + if (index == -1) { + ERR(REG_ECTYPE); + return NULL; + } + + /* + * Now compute the character class contents. Every case guards against a + * NULL cvec; a NULL result is reported as REG_ESPACE after the switch. + */ + + switch((enum classes) index) { + case CC_PRINT: + cv = getcvec(v, NUM_PRINT_CHAR, NUM_PRINT_RANGE); + if (cv) { + for (i=0 ; (size_t)i<NUM_PRINT_CHAR ; i++) { + addchr(cv, printCharTable[i]); + } + for (i=0 ; (size_t)i<NUM_PRINT_RANGE ; i++) { + addrange(cv, printRangeTable[i].start, + printRangeTable[i].end); + } + } + break; + case CC_ALNUM: + cv = getcvec(v, NUM_ALPHA_CHAR, NUM_DIGIT_RANGE + NUM_ALPHA_RANGE); + if (cv) { + for (i=0 ; (size_t)i<NUM_ALPHA_CHAR ; i++) { + addchr(cv, alphaCharTable[i]); + } + for (i=0 ; (size_t)i<NUM_ALPHA_RANGE ; i++) { + addrange(cv, alphaRangeTable[i].start, + alphaRangeTable[i].end); + } + for (i=0 ; (size_t)i<NUM_DIGIT_RANGE ; i++) { + addrange(cv, digitRangeTable[i].start, + digitRangeTable[i].end); + } + } + break; + case CC_ALPHA: + cv = getcvec(v, NUM_ALPHA_CHAR, NUM_ALPHA_RANGE); + if (cv) { + for (i=0 ; (size_t)i<NUM_ALPHA_RANGE ; i++) { + addrange(cv, alphaRangeTable[i].start, + alphaRangeTable[i].end); + } + for (i=0 ; (size_t)i<NUM_ALPHA_CHAR ; i++) { + addchr(cv, alphaCharTable[i]); + } + } + break; + case CC_ASCII: + cv = getcvec(v, 0, 1); + if (cv) { + addrange(cv, 0, 0x7f); + } + break; + case CC_BLANK: + cv = getcvec(v, 2, 0); + if (cv) { + addchr(cv, '\t'); + addchr(cv, ' '); + } + break; + case CC_CNTRL: + cv = getcvec(v, 0, 2); + if (cv) { + addrange(cv, 0x0, 0x1f); + addrange(cv, 0x7f, 0x9f); + } + break; + case CC_DIGIT: + cv = getcvec(v, 0, NUM_DIGIT_RANGE); + if (cv) { + for (i=0 ; (size_t)i<NUM_DIGIT_RANGE ; i++) { + addrange(cv, digitRangeTable[i].start, + digitRangeTable[i].end); + } + } + break; + case CC_PUNCT: + cv = getcvec(v, NUM_PUNCT_CHAR, NUM_PUNCT_RANGE); + if (cv) { + for (i=0 ; 
(size_t)i<NUM_PUNCT_RANGE ; i++) { + addrange(cv, punctRangeTable[i].start, + punctRangeTable[i].end); + } + for (i=0 ; (size_t)i<NUM_PUNCT_CHAR ; i++) { + addchr(cv, punctCharTable[i]); + } + } + break; + case CC_XDIGIT: + /* + * This is a 3 instead of (NUM_DIGIT_RANGE+2) because I've no idea how + * to define the digits 'a' through 'f' in non-western locales. The + * concept is quite possibly non-portable, or only used in contexts + * where the characters used would be the western ones anyway! + * Whatever is actually the case, the number of ranges is fixed (until + * someone comes up with a better arrangement!) + */ + + cv = getcvec(v, 0, 3); + if (cv) { + addrange(cv, '0', '9'); + addrange(cv, 'a', 'f'); + addrange(cv, 'A', 'F'); + } + break; + case CC_SPACE: + cv = getcvec(v, NUM_SPACE_CHAR, NUM_SPACE_RANGE); + if (cv) { + for (i=0 ; (size_t)i<NUM_SPACE_RANGE ; i++) { + addrange(cv, spaceRangeTable[i].start, + spaceRangeTable[i].end); + } + for (i=0 ; (size_t)i<NUM_SPACE_CHAR ; i++) { + addchr(cv, spaceCharTable[i]); + } + } + break; + case CC_LOWER: + cv = getcvec(v, NUM_LOWER_CHAR, NUM_LOWER_RANGE); + if (cv) { + for (i=0 ; (size_t)i<NUM_LOWER_RANGE ; i++) { + addrange(cv, lowerRangeTable[i].start, + lowerRangeTable[i].end); + } + for (i=0 ; (size_t)i<NUM_LOWER_CHAR ; i++) { + addchr(cv, lowerCharTable[i]); + } + } + break; + case CC_UPPER: + cv = getcvec(v, NUM_UPPER_CHAR, NUM_UPPER_RANGE); + if (cv) { + for (i=0 ; (size_t)i<NUM_UPPER_RANGE ; i++) { + addrange(cv, upperRangeTable[i].start, + upperRangeTable[i].end); + } + for (i=0 ; (size_t)i<NUM_UPPER_CHAR ; i++) { + addchr(cv, upperCharTable[i]); + } + } + break; + case CC_GRAPH: + cv = getcvec(v, NUM_GRAPH_CHAR, NUM_GRAPH_RANGE); + if (cv) { + for (i=0 ; (size_t)i<NUM_GRAPH_RANGE ; i++) { + addrange(cv, graphRangeTable[i].start, + graphRangeTable[i].end); + } + for (i=0 ; (size_t)i<NUM_GRAPH_CHAR ; i++) { + addchr(cv, graphCharTable[i]); + } + } + break; + } + if (cv == NULL) { + ERR(REG_ESPACE); + } + 
return cv; +} + +/* + - allcases - supply cvec for all case counterparts of a chr (including itself) + * This is a shortcut, preferably an efficient one, for simple characters; + * messy cases are done via range(). + ^ static struct cvec *allcases(struct vars *, pchr); + */ +static struct cvec * +allcases( + struct vars *v, /* context */ + pchr pc) /* character to get case equivs of */ +{ + struct cvec *cv; + chr c = (chr)pc; + chr lc, uc, tc; + + lc = Tcl_UniCharToLower((chr)c); + uc = Tcl_UniCharToUpper((chr)c); + tc = Tcl_UniCharToTitle((chr)c); + + if (tc != uc) { + cv = getcvec(v, 3, 0); + addchr(cv, tc); + } else { + cv = getcvec(v, 2, 0); + } + addchr(cv, lc); + if (lc != uc) { + addchr(cv, uc); + } + return cv; +} + +/* + - cmp - chr-substring compare + * Backrefs need this. It should preferably be efficient. + * Note that it does not need to report anything except equal/unequal. + * Note also that the length is exact, and the comparison should not + * stop at embedded NULs! + ^ static int cmp(const chr *, const chr *, size_t); + */ +static int /* 0 for equal, nonzero for unequal */ +cmp( + const chr *x, const chr *y, /* strings to compare */ + size_t len) /* exact length of comparison */ +{ + return memcmp(VS(x), VS(y), len*sizeof(chr)); +} + +/* + - casecmp - case-independent chr-substring compare + * REG_ICASE backrefs need this. It should preferably be efficient. + * Note that it does not need to report anything except equal/unequal. + * Note also that the length is exact, and the comparison should not + * stop at embedded NULs! 
+ ^ static int casecmp(const chr *, const chr *, size_t); + */ +static int /* 0 for equal, nonzero for unequal */ +casecmp( + const chr *x, const chr *y, /* strings to compare */ + size_t len) /* exact length of comparison */ +{ + for (; len > 0; len--, x++, y++) { + if ((*x!=*y) && (Tcl_UniCharToLower(*x) != Tcl_UniCharToLower(*y))) { + return 1; + } + } + return 0; +} + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ diff --git a/contrib/hsrex/regc_nfa.c b/contrib/hsrex/regc_nfa.c new file mode 100644 index 0000000..04d2f46 --- /dev/null +++ b/contrib/hsrex/regc_nfa.c @@ -0,0 +1,1873 @@ +/* + * NFA utilities. + * This file is #included by regcomp.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation of + * software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * One or two things that technically ought to be in here are actually in + * color.c, thanks to some incestuous relationships in the color chains. + */ + +#define NISERR() VISERR(nfa->v) +#define NERR(e) VERR(nfa->v, (e)) + +/* + - newnfa - set up an NFA + ^ static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *); + */ +static struct nfa * /* the NFA, or NULL */ +newnfa( + struct vars *v, + struct colormap *cm, + struct nfa *parent) /* NULL if primary NFA */ +{ + struct nfa *nfa; + + nfa = (struct nfa *) MALLOC(sizeof(struct nfa)); + if (nfa == NULL) { + return NULL; + } + + nfa->states = NULL; + nfa->slast = NULL; + nfa->free = NULL; + nfa->nstates = 0; + nfa->cm = cm; + nfa->v = v; + nfa->size = 0; + nfa->bos[0] = nfa->bos[1] = COLORLESS; + nfa->eos[0] = nfa->eos[1] = COLORLESS; + nfa->parent = parent; /* Precedes newfstate so parent is valid. */ + nfa->post = newfstate(nfa, '@'); /* number 0 */ + nfa->pre = newfstate(nfa, '>'); /* number 1 */ + + nfa->init = newstate(nfa); /* May become invalid later. 
*/ + nfa->final = newstate(nfa); + if (ISERR()) { + freenfa(nfa); + return NULL; + } + rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init); + newarc(nfa, '^', 1, nfa->pre, nfa->init); + newarc(nfa, '^', 0, nfa->pre, nfa->init); + rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post); + newarc(nfa, '$', 1, nfa->final, nfa->post); + newarc(nfa, '$', 0, nfa->final, nfa->post); + + if (ISERR()) { + freenfa(nfa); + return NULL; + } + return nfa; +} + +/* + - TooManyStates - check whether the tracked size of the outermost NFA + * exceeds the compile-time limit REG_MAX_STATES; returns 1 if so, else 0. + ^ static int TooManyStates(struct nfa *); + */ +static int +TooManyStates( + struct nfa *nfa) +{ + struct nfa *parent = nfa->parent; + size_t sz = nfa->size; + + /* Walk up to the outermost ancestor; its size is the one compared. */ + while (parent != NULL) { + sz = parent->size; + parent = parent->parent; + } + if (sz > REG_MAX_STATES) { + return 1; + } + return 0; +} + +/* + - IncrementSize - increases the tracked size of the NFA and its parents. + ^ static void IncrementSize(struct nfa *); + */ +static void +IncrementSize( + struct nfa *nfa) +{ + struct nfa *parent = nfa->parent; + + nfa->size++; + while (parent != NULL) { + parent->size++; + parent = parent->parent; + } +} + +/* + - DecrementSize - decreases the tracked size of the NFA and its parents. 
+ ^ static void DecrementSize(struct nfa *); + */ +static void +DecrementSize( + struct nfa *nfa) +{ + struct nfa *parent = nfa->parent; + + nfa->size--; + while (parent != NULL) { + parent->size--; + parent = parent->parent; + } +} + +/* + - freenfa - free an entire NFA + ^ static void freenfa(struct nfa *); + */ +static void +freenfa( + struct nfa *nfa) +{ + struct state *s; + + while ((s = nfa->states) != NULL) { + s->nins = s->nouts = 0; /* don't worry about arcs */ + freestate(nfa, s); + } + while ((s = nfa->free) != NULL) { + nfa->free = s->next; + destroystate(nfa, s); + } + + nfa->slast = NULL; + nfa->nstates = -1; + nfa->pre = NULL; + nfa->post = NULL; + FREE(nfa); +} + +/* + - newstate - allocate an NFA state, with zero flag value + ^ static struct state *newstate(struct nfa *); + */ +static struct state * /* NULL on error */ +newstate( + struct nfa *nfa) +{ + struct state *s; + + if (TooManyStates(nfa)) { + /* XXX: add specific error for this */ + NERR(REG_ETOOBIG); + return NULL; + } + if (nfa->free != NULL) { + s = nfa->free; + nfa->free = s->next; + } else { + s = (struct state *) MALLOC(sizeof(struct state)); + if (s == NULL) { + NERR(REG_ESPACE); + return NULL; + } + s->oas.next = NULL; + s->free = NULL; + s->noas = 0; + } + + assert(nfa->nstates >= 0); + s->no = nfa->nstates++; + s->flag = 0; + if (nfa->states == NULL) { + nfa->states = s; + } + s->nins = 0; + s->ins = NULL; + s->nouts = 0; + s->outs = NULL; + s->tmp = NULL; + s->next = NULL; + if (nfa->slast != NULL) { + assert(nfa->slast->next == NULL); + nfa->slast->next = s; + } + s->prev = nfa->slast; + nfa->slast = s; + + /* + * Track the current size and the parent size. 
+ */ + + IncrementSize(nfa); + return s; +} + +/* + - newfstate - allocate an NFA state with a specified flag value + ^ static struct state *newfstate(struct nfa *, int flag); + */ +static struct state * /* NULL on error */ +newfstate( + struct nfa *nfa, + int flag) +{ + struct state *s; + + s = newstate(nfa); + if (s != NULL) { + s->flag = (char) flag; + } + return s; +} + +/* + - dropstate - delete a state's inarcs and outarcs and free it + ^ static void dropstate(struct nfa *, struct state *); + */ +static void +dropstate( + struct nfa *nfa, + struct state *s) +{ + struct arc *a; + + while ((a = s->ins) != NULL) { + freearc(nfa, a); + } + while ((a = s->outs) != NULL) { + freearc(nfa, a); + } + freestate(nfa, s); +} + +/* + - freestate - free a state, which has no in-arcs or out-arcs + ^ static void freestate(struct nfa *, struct state *); + */ +static void +freestate( + struct nfa *nfa, + struct state *s) +{ + assert(s != NULL); + assert(s->nins == 0 && s->nouts == 0); + + s->no = FREESTATE; + s->flag = 0; + if (s->next != NULL) { + s->next->prev = s->prev; + } else { + assert(s == nfa->slast); + nfa->slast = s->prev; + } + if (s->prev != NULL) { + s->prev->next = s->next; + } else { + assert(s == nfa->states); + nfa->states = s->next; + } + s->prev = NULL; + s->next = nfa->free; /* don't delete it, put it on the free list */ + nfa->free = s; + DecrementSize(nfa); +} + +/* + - destroystate - really get rid of an already-freed state + ^ static void destroystate(struct nfa *, struct state *); + */ +static void +destroystate( + struct nfa *nfa, + struct state *s) +{ + struct arcbatch *ab; + struct arcbatch *abnext; + + assert(s->no == FREESTATE); + for (ab=s->oas.next ; ab!=NULL ; ab=abnext) { + abnext = ab->next; + FREE(ab); + } + s->ins = NULL; + s->outs = NULL; + s->next = NULL; + FREE(s); +} + +/* + - newarc - set up a new arc within an NFA + ^ static void newarc(struct nfa *, int, pcolor, struct state *, + ^ struct state *); + */ +static void +newarc( + 
struct nfa *nfa, + int t, + pcolor co, + struct state *from, + struct state *to) +{ + struct arc *a; + + assert(from != NULL && to != NULL); + + /* + * Check for duplicates. + */ + + for (a=from->outs ; a!=NULL ; a=a->outchain) { + if (a->to == to && a->co == co && a->type == t) { + return; + } + } + + a = allocarc(nfa, from); + if (NISERR()) { + return; + } + assert(a != NULL); + + a->type = t; + a->co = (color) co; + a->to = to; + a->from = from; + + /* + * Put the new arc on the beginning, not the end, of the chains. Not only + * is this easier, it has the very useful side effect that deleting the + * most-recently-added arc is the cheapest case rather than the most + * expensive one. + */ + + a->inchain = to->ins; + to->ins = a; + a->outchain = from->outs; + from->outs = a; + + from->nouts++; + to->nins++; + + if (COLORED(a) && nfa->parent == NULL) { + colorchain(nfa->cm, a); + } +} + +/* + - allocarc - allocate a new out-arc within a state + ^ static struct arc *allocarc(struct nfa *, struct state *); + */ +static struct arc * /* NULL for failure */ +allocarc( + struct nfa *nfa, + struct state *s) +{ + struct arc *a; + + /* + * Shortcut + */ + + if (s->free == NULL && s->noas < ABSIZE) { + a = &s->oas.a[s->noas]; + s->noas++; + return a; + } + + /* + * if none at hand, get more + */ + + if (s->free == NULL) { + struct arcbatch *newAb = (struct arcbatch *) + MALLOC(sizeof(struct arcbatch)); + int i; + + if (newAb == NULL) { + NERR(REG_ESPACE); + return NULL; + } + newAb->next = s->oas.next; + s->oas.next = newAb; + + for (i=0 ; i<ABSIZE ; i++) { + newAb->a[i].type = 0; + newAb->a[i].freechain = &newAb->a[i+1]; + } + newAb->a[ABSIZE-1].freechain = NULL; + s->free = &newAb->a[0]; + } + assert(s->free != NULL); + + a = s->free; + s->free = a->freechain; + return a; +} + +/* + - freearc - free an arc + ^ static void freearc(struct nfa *, struct arc *); + */ +static void +freearc( + struct nfa *nfa, + struct arc *victim) +{ + struct state *from = victim->from; + 
struct state *to = victim->to; + struct arc *a; + + assert(victim->type != 0); + + /* + * Take it off color chain if necessary. + */ + + if (COLORED(victim) && nfa->parent == NULL) { + uncolorchain(nfa->cm, victim); + } + + /* + * Take it off source's out-chain. + */ + + assert(from != NULL); + assert(from->outs != NULL); + a = from->outs; + if (a == victim) { /* simple case: first in chain */ + from->outs = victim->outchain; + } else { + for (; a!=NULL && a->outchain!=victim ; a=a->outchain) { + continue; + } + assert(a != NULL); + a->outchain = victim->outchain; + } + from->nouts--; + + /* + * Take it off target's in-chain. + */ + + assert(to != NULL); + assert(to->ins != NULL); + a = to->ins; + if (a == victim) { /* simple case: first in chain */ + to->ins = victim->inchain; + } else { + for (; a->inchain!=victim ; a=a->inchain) { + assert(a->inchain != NULL); + continue; + } + a->inchain = victim->inchain; + } + to->nins--; + + /* + * Clean up and place on free list. + */ + + victim->type = 0; + victim->from = NULL; /* precautions... */ + victim->to = NULL; + victim->inchain = NULL; + victim->outchain = NULL; + victim->freechain = from->free; + from->free = victim; +} + +/* + - findarc - find arc, if any, from given source with given type and color + * If there is more than one such arc, the result is random. 
+ ^ static struct arc *findarc(struct state *, int, pcolor); + */ +static struct arc * +findarc( + struct state *s, + int type, + pcolor co) +{ + struct arc *a; + + for (a=s->outs ; a!=NULL ; a=a->outchain) { + if (a->type == type && a->co == co) { + return a; + } + } + return NULL; +} + +/* + - cparc - allocate a new arc within an NFA, copying details from old one + ^ static void cparc(struct nfa *, struct arc *, struct state *, + ^ struct state *); + */ +static void +cparc( + struct nfa *nfa, + struct arc *oa, + struct state *from, + struct state *to) +{ + newarc(nfa, oa->type, oa->co, from, to); +} + +/* + - moveins - move all in arcs of a state to another state + * You might think this could be done better by just updating the + * existing arcs, and you would be right if it weren't for the desire + * for duplicate suppression, which makes it easier to just make new + * ones to exploit the suppression built into newarc. + ^ static void moveins(struct nfa *, struct state *, struct state *); + */ +static void +moveins( + struct nfa *nfa, + struct state *oldState, + struct state *newState) +{ + struct arc *a; + + assert(oldState != newState); + + while ((a = oldState->ins) != NULL) { + cparc(nfa, a, a->from, newState); + freearc(nfa, a); + } + assert(oldState->nins == 0); + assert(oldState->ins == NULL); +} + +/* + - copyins - copy all in arcs of a state to another state + ^ static void copyins(struct nfa *, struct state *, struct state *); + */ +static void +copyins( + struct nfa *nfa, + struct state *oldState, + struct state *newState) +{ + struct arc *a; + + assert(oldState != newState); + + for (a=oldState->ins ; a!=NULL ; a=a->inchain) { + cparc(nfa, a, a->from, newState); + } +} + +/* + - moveouts - move all out arcs of a state to another state + ^ static void moveouts(struct nfa *, struct state *, struct state *); + */ +static void +moveouts( + struct nfa *nfa, + struct state *oldState, + struct state *newState) +{ + struct arc *a; + + assert(oldState != 
newState); + + while ((a = oldState->outs) != NULL) { + cparc(nfa, a, newState, a->to); + freearc(nfa, a); + } +} + +/* + - copyouts - copy all out arcs of a state to another state + ^ static void copyouts(struct nfa *, struct state *, struct state *); + */ +static void +copyouts( + struct nfa *nfa, + struct state *oldState, + struct state *newState) +{ + struct arc *a; + + assert(oldState != newState); + + for (a=oldState->outs ; a!=NULL ; a=a->outchain) { + cparc(nfa, a, newState, a->to); + } +} + +/* + - cloneouts - copy out arcs of a state to another state pair, modifying type + ^ static void cloneouts(struct nfa *, struct state *, struct state *, + ^ struct state *, int); + */ +static void +cloneouts( + struct nfa *nfa, + struct state *old, + struct state *from, + struct state *to, + int type) +{ + struct arc *a; + + assert(old != from); + + for (a=old->outs ; a!=NULL ; a=a->outchain) { + newarc(nfa, type, a->co, from, to); + } +} + +/* + - delsub - delete a sub-NFA, updating subre pointers if necessary + * This uses a recursive traversal of the sub-NFA, marking already-seen + * states using their tmp pointer. + ^ static void delsub(struct nfa *, struct state *, struct state *); + */ +static void +delsub( + struct nfa *nfa, + struct state *lp, /* the sub-NFA goes from here... */ + struct state *rp) /* ...to here, *not* inclusive */ +{ + assert(lp != rp); + + rp->tmp = rp; /* mark end */ + + deltraverse(nfa, lp, lp); + assert(lp->nouts == 0 && rp->nins == 0); /* did the job */ + assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */ + + rp->tmp = NULL; /* unmark end */ + lp->tmp = NULL; /* and begin, marked by deltraverse */ +} + +/* + - deltraverse - the recursive heart of delsub + * This routine's basic job is to destroy all out-arcs of the state. 
+ ^ static void deltraverse(struct nfa *, struct state *, struct state *); + */ +static void +deltraverse( + struct nfa *nfa, + struct state *leftend, + struct state *s) +{ + struct arc *a; + struct state *to; + + if (s->nouts == 0) { + return; /* nothing to do */ + } + if (s->tmp != NULL) { + return; /* already in progress */ + } + + s->tmp = s; /* mark as in progress */ + + while ((a = s->outs) != NULL) { + to = a->to; + deltraverse(nfa, leftend, to); + assert(to->nouts == 0 || to->tmp != NULL); + freearc(nfa, a); + if (to->nins == 0 && to->tmp == NULL) { + assert(to->nouts == 0); + freestate(nfa, to); + } + } + + assert(s->no != FREESTATE); /* we're still here */ + assert(s == leftend || s->nins != 0); /* and still reachable */ + assert(s->nouts == 0); /* but have no outarcs */ + + s->tmp = NULL; /* we're done here */ +} + +/* + - dupnfa - duplicate sub-NFA + * Another recursive traversal, this time using tmp to point to duplicates as + * well as mark already-seen states. (You knew there was a reason why it's a + * state pointer, didn't you? 
:-)) + ^ static void dupnfa(struct nfa *, struct state *, struct state *, + ^ struct state *, struct state *); + */ +static void +dupnfa( + struct nfa *nfa, + struct state *start, /* duplicate of subNFA starting here */ + struct state *stop, /* and stopping here */ + struct state *from, /* stringing duplicate from here */ + struct state *to) /* to here */ +{ + if (start == stop) { + newarc(nfa, EMPTY, 0, from, to); + return; + } + + stop->tmp = to; + duptraverse(nfa, start, from, 0); + /* done, except for clearing out the tmp pointers */ + + stop->tmp = NULL; + cleartraverse(nfa, start); +} + +/* + - duptraverse - recursive heart of dupnfa + ^ static void duptraverse(struct nfa *, struct state *, struct state *, + ^ int); + */ +static void +duptraverse( + struct nfa *nfa, + struct state *s, + struct state *stmp, /* s's duplicate, or NULL */ + int depth) /* recursion depth so far; dupnfa passes 0 */ +{ + struct arc *a; + + if (s->tmp != NULL) { + return; /* already done */ + } + + s->tmp = (stmp == NULL) ? newstate(nfa) : stmp; + if (s->tmp == NULL) { + assert(NISERR()); + return; + } + + /* + * Arbitrary depth limit. Needs tuning, but this value is sufficient to + * make all normal tests (not reg-33.14) pass. + */ + /* Updated from 500 to 1204 to support REs with 99 group patterns. + * Why to limit the tree depth ? + * If long REs are not needed then just don't write long REs. 
+ */ +#define DUPTRAVERSE_MAX_DEPTH 1204 + + if (depth++ > DUPTRAVERSE_MAX_DEPTH) { + NERR(REG_ESPACE); + } + + for (a=s->outs ; a!=NULL && !NISERR() ; a=a->outchain) { + duptraverse(nfa, a->to, NULL, depth); + if (NISERR()) { + break; + } + assert(a->to->tmp != NULL); + cparc(nfa, a, s->tmp, a->to->tmp); + } +} + +/* + - cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set + ^ static void cleartraverse(struct nfa *, struct state *); + */ +static void +cleartraverse( + struct nfa *nfa, + struct state *s) +{ + struct arc *a; + + if (s->tmp == NULL) { + return; + } + s->tmp = NULL; + + for (a=s->outs ; a!=NULL ; a=a->outchain) { + cleartraverse(nfa, a->to); + } +} + +/* + - specialcolors - fill in special colors for an NFA + ^ static void specialcolors(struct nfa *); + */ +static void +specialcolors( + struct nfa *nfa) +{ + /* + * False colors for BOS, BOL, EOS, EOL + */ + + if (nfa->parent == NULL) { + nfa->bos[0] = pseudocolor(nfa->cm); + nfa->bos[1] = pseudocolor(nfa->cm); + nfa->eos[0] = pseudocolor(nfa->cm); + nfa->eos[1] = pseudocolor(nfa->cm); + } else { + assert(nfa->parent->bos[0] != COLORLESS); + nfa->bos[0] = nfa->parent->bos[0]; + assert(nfa->parent->bos[1] != COLORLESS); + nfa->bos[1] = nfa->parent->bos[1]; + assert(nfa->parent->eos[0] != COLORLESS); + nfa->eos[0] = nfa->parent->eos[0]; + assert(nfa->parent->eos[1] != COLORLESS); + nfa->eos[1] = nfa->parent->eos[1]; + } +} + +/* + - optimize - optimize an NFA + ^ static long optimize(struct nfa *, FILE *); + */ +static long /* re_info bits */ +optimize( + struct nfa *nfa, + FILE *f) /* for debug output; NULL none */ +{ + int verbose = (f != NULL) ? 
1 : 0; + + if (verbose) { + fprintf(f, "\ninitial cleanup:\n"); + } + cleanup(nfa); /* may simplify situation */ + if (verbose) { + dumpnfa(nfa, f); + } + if (verbose) { + fprintf(f, "\nempties:\n"); + } + fixempties(nfa, f); /* get rid of EMPTY arcs */ + if (verbose) { + fprintf(f, "\nconstraints:\n"); + } + pullback(nfa, f); /* pull back constraints backward */ + pushfwd(nfa, f); /* push fwd constraints forward */ + if (verbose) { + fprintf(f, "\nfinal cleanup:\n"); + } + cleanup(nfa); /* final tidying */ + return analyze(nfa); /* and analysis */ +} + +/* + - pullback - pull back constraints backward to (with luck) eliminate them + ^ static void pullback(struct nfa *, FILE *); + */ +static void +pullback( + struct nfa *nfa, + FILE *f) /* for debug output; NULL none */ +{ + struct state *s; + struct state *nexts; + struct arc *a; + struct arc *nexta; + int progress; + + /* + * Find and pull until there are no more. + */ + + do { + progress = 0; + for (s=nfa->states ; s!=NULL && !NISERR() ; s=nexts) { + nexts = s->next; + for (a=s->outs ; a!=NULL && !NISERR() ; a=nexta) { + nexta = a->outchain; + if (a->type == '^' || a->type == BEHIND) { + if (pull(nfa, a)) { + progress = 1; + } + } + assert(nexta == NULL || s->no != FREESTATE); + } + } + if (progress && f != NULL) { + dumpnfa(nfa, f); + } + } while (progress && !NISERR()); + if (NISERR()) { + return; + } + + for (a=nfa->pre->outs ; a!=NULL ; a=nexta) { + nexta = a->outchain; + if (a->type == '^') { + assert(a->co == 0 || a->co == 1); + newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to); + freearc(nfa, a); + } + } +} + +/* + - pull - pull a back constraint backward past its source state + * A significant property of this function is that it deletes at most + * one state -- the constraint's from state -- and only if the constraint + * was that state's last outarc. 
+ ^ static int pull(struct nfa *, struct arc *);
+ */
+static int			/* 0 couldn't, 1 could */
+pull(
+    struct nfa *nfa,
+    struct arc *con)
+{
+    struct state *from = con->from;
+    struct state *to = con->to;
+    struct arc *a;
+    struct arc *nexta;
+    struct state *s;
+
+    if (from == to) {		/* circular constraint is pointless */
+        freearc(nfa, con);
+        return 1;
+    }
+    if (from->flag) {		/* can't pull back beyond start */
+        return 0;
+    }
+    if (from->nins == 0) {	/* unreachable */
+        freearc(nfa, con);
+        return 1;
+    }
+
+    /*
+     * DGP 2007-11-15: Cloning a state with a circular constraint on its list
+     * of outs can lead to trouble [Bug 1810038], so get rid of them first.
+     */
+
+    for (a = from->outs; a != NULL; a = nexta) {
+        nexta = a->outchain;	/* saved first: a may be freed below */
+        switch (a->type) {
+        case '^':
+        case '$':
+        case BEHIND:
+        case AHEAD:
+            if (from == a->to) {
+                freearc(nfa, a);
+            }
+            break;
+        }
+    }
+
+    /*
+     * First, clone from state if necessary to avoid other outarcs.
+     */
+
+    if (from->nouts > 1) {
+        s = newstate(nfa);
+        if (NISERR()) {
+            return 0;
+        }
+        assert(to != from);	/* con is not an inarc */
+        copyins(nfa, from, s);	/* duplicate inarcs */
+        cparc(nfa, con, s, to);	/* move constraint arc */
+        freearc(nfa, con);
+        from = s;
+        con = from->outs;	/* the copy made by cparc() above */
+    }
+    assert(from->nouts == 1);
+
+    /*
+     * Propagate the constraint into the from state's inarcs.
+     */
+
+    for (a=from->ins ; a!=NULL ; a=nexta) {
+        nexta = a->inchain;	/* saved first: a may be freed below */
+        switch (combine(con, a)) {
+        case INCOMPATIBLE:	/* destroy the arc */
+            freearc(nfa, a);
+            break;
+        case SATISFIED:		/* no action needed */
+            break;
+        case COMPATIBLE:	/* swap the two arcs, more or less */
+            s = newstate(nfa);
+            if (NISERR()) {
+                return 0;
+            }
+            cparc(nfa, a, s, to);	/* anticipate move */
+            cparc(nfa, con, a->from, s);
+            if (NISERR()) {
+                return 0;
+            }
+            freearc(nfa, a);
+            break;
+        default:
+            assert(NOTREACHED);
+            break;
+        }
+    }
+
+    /*
+     * Remaining inarcs, if any, incorporate the constraint.
+     */
+
+    moveins(nfa, from, to);
+    dropstate(nfa, from);	/* will free the constraint */
+    return 1;
+}
+
+/*
+ - pushfwd - push forward constraints forward to (with luck) eliminate them
+ * Mirror image of pullback: repeatedly calls push() on every '$' and AHEAD
+ * inarc until a full pass makes no progress (or an error is flagged), then
+ * rewrites the '$' arcs still entering the post state as PLAIN arcs in the
+ * matching eos[] color.
+ ^ static void pushfwd(struct nfa *, FILE *);
+ */
+static void
+pushfwd(
+    struct nfa *nfa,
+    FILE *f)			/* for debug output; NULL none */
+{
+    struct state *s;
+    struct state *nexts;
+    struct arc *a;
+    struct arc *nexta;
+    int progress;
+
+    /*
+     * Find and push until there are no more.
+     */
+
+    do {
+        progress = 0;
+        for (s=nfa->states ; s!=NULL && !NISERR() ; s=nexts) {
+            nexts = s->next;	/* saved first: push() may delete s */
+            for (a = s->ins; a != NULL && !NISERR(); a = nexta) {
+                nexta = a->inchain;	/* saved first: push() may free a */
+                if (a->type == '$' || a->type == AHEAD) {
+                    if (push(nfa, a)) {
+                        progress = 1;
+                    }
+                }
+                assert(nexta == NULL || s->no != FREESTATE);
+            }
+        }
+        if (progress && f != NULL) {
+            dumpnfa(nfa, f);
+        }
+    } while (progress && !NISERR());
+    if (NISERR()) {
+        return;
+    }
+
+    /* Turn the surviving '$' arcs into post into ordinary colored arcs. */
+    for (a = nfa->post->ins; a != NULL; a = nexta) {
+        nexta = a->inchain;
+        if (a->type == '$') {
+            assert(a->co == 0 || a->co == 1);
+            newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
+            freearc(nfa, a);
+        }
+    }
+}
+
+/*
+ - push - push a forward constraint forward past its destination state
+ * A significant property of this function is that it deletes at most
+ * one state -- the constraint's to state -- and only if the constraint
+ * was that state's last inarc.
+ ^ static int push(struct nfa *, struct arc *);
+ */
+static int			/* 0 couldn't, 1 could */
+push(
+    struct nfa *nfa,
+    struct arc *con)
+{
+    struct state *from = con->from;
+    struct state *to = con->to;
+    struct arc *a;
+    struct arc *nexta;
+    struct state *s;
+
+    if (to == from) {		/* circular constraint is pointless */
+        freearc(nfa, con);
+        return 1;
+    }
+    if (to->flag) {		/* can't push forward beyond end */
+        return 0;
+    }
+    if (to->nouts == 0) {	/* dead end */
+        freearc(nfa, con);
+        return 1;
+    }
+
+    /*
+     * DGP 2007-11-15: Here we duplicate the same protections as appear
+     * in pull() above to avoid troubles with cloning a state with a
+     * circular constraint on its list of ins.  It is not clear whether
+     * this is necessary, or is protecting against a "can't happen".
+     * Any test case that actually leads to a freearc() call here would
+     * be a welcome addition to the test suite.
+     */
+
+    for (a = to->ins; a != NULL; a = nexta) {
+        nexta = a->inchain;	/* saved first: a may be freed below */
+        switch (a->type) {
+        case '^':
+        case '$':
+        case BEHIND:
+        case AHEAD:
+            if (a->from == to) {
+                freearc(nfa, a);
+            }
+            break;
+        }
+    }
+    /*
+     * First, clone to state if necessary to avoid other inarcs.
+     */
+
+    if (to->nins > 1) {
+        s = newstate(nfa);
+        if (NISERR()) {
+            return 0;
+        }
+        copyouts(nfa, to, s);	/* duplicate outarcs */
+        cparc(nfa, con, from, s);	/* move constraint */
+        freearc(nfa, con);
+        to = s;
+        con = to->ins;		/* the copy made by cparc() above */
+    }
+    assert(to->nins == 1);
+
+    /*
+     * Propagate the constraint into the to state's outarcs.
+     */
+
+    for (a = to->outs; a != NULL; a = nexta) {
+        nexta = a->outchain;	/* saved first: a may be freed below */
+        switch (combine(con, a)) {
+        case INCOMPATIBLE:	/* destroy the arc */
+            freearc(nfa, a);
+            break;
+        case SATISFIED:		/* no action needed */
+            break;
+        case COMPATIBLE:	/* swap the two arcs, more or less */
+            s = newstate(nfa);
+            if (NISERR()) {
+                return 0;
+            }
+            cparc(nfa, con, s, a->to);	/* anticipate move */
+            cparc(nfa, a, from, s);
+            if (NISERR()) {
+                return 0;
+            }
+            freearc(nfa, a);
+            break;
+        default:
+            assert(NOTREACHED);
+            break;
+        }
+    }
+
+    /*
+     * Remaining outarcs, if any, incorporate the constraint.
+     */
+
+    moveouts(nfa, to, from);
+    dropstate(nfa, to);		/* will free the constraint */
+    return 1;
+}
+
+/*
+ - combine - constraint lands on an arc, what happens?
+ * Classifies the meeting of constraint arc con with arc a purely from the
+ * two arc types and, where the types call for it, equality of their colors.
+ ^ #def	INCOMPATIBLE	1	// destroys arc
+ ^ #def	SATISFIED	2	// constraint satisfied
+ ^ #def	COMPATIBLE	3	// compatible but not satisfied yet
+ ^ static int combine(struct arc *, struct arc *);
+ */
+static int
+combine(
+    struct arc *con,
+    struct arc *a)
+{
+    /* Pack the two arc types into one integer so a single switch can dispatch. */
+#define CA(ct,at)	(((ct)<<CHAR_BIT) | (at))
+
+    switch (CA(con->type, a->type)) {
+    case CA('^', PLAIN):	/* newlines are handled separately */
+    case CA('$', PLAIN):
+        return INCOMPATIBLE;
+        break;
+    case CA(AHEAD, PLAIN):	/* color constraints meet colors */
+    case CA(BEHIND, PLAIN):
+        if (con->co == a->co) {
+            return SATISFIED;
+        }
+        return INCOMPATIBLE;
+        break;
+    case CA('^', '^'):		/* collision, similar constraints */
+    case CA('$', '$'):
+    case CA(AHEAD, AHEAD):
+    case CA(BEHIND, BEHIND):
+        if (con->co == a->co) {	/* true duplication */
+            return SATISFIED;
+        }
+        return INCOMPATIBLE;
+        break;
+    case CA('^', BEHIND):	/* collision, dissimilar constraints */
+    case CA(BEHIND, '^'):
+    case CA('$', AHEAD):
+    case CA(AHEAD, '$'):
+        return INCOMPATIBLE;
+        break;
+    case CA('^', '$'):		/* constraints passing each other */
+    case CA('^', AHEAD):
+    case CA(BEHIND, '$'):
+    case CA(BEHIND, AHEAD):
+    case CA('$', '^'):
+    case CA('$', BEHIND):
+    case CA(AHEAD, '^'):
+    case CA(AHEAD, BEHIND):
+    case CA('^', LACON):
+    case CA(BEHIND, LACON):
+    case CA('$', LACON):
+    case CA(AHEAD, LACON):
+        return COMPATIBLE;
+        break;
+    }
+    assert(NOTREACHED);
+    return INCOMPATIBLE;	/* for benefit of blind compilers */
+}
+
+/*
+ - fixempties - get rid of EMPTY arcs
+ * Sweeps every state's outarcs, handing each EMPTY arc to unempty(), and
+ * repeats the sweep until one makes no progress or an error is flagged.
+ ^ static void fixempties(struct nfa *, FILE *);
+ */
+static void
+fixempties(
+    struct nfa *nfa,
+    FILE *f)			/* for debug output; NULL none */
+{
+    struct state *s;
+    struct state *nexts;
+    struct arc *a;
+    struct arc *nexta;
+    int progress;
+
+    /*
+     * Find and eliminate empties until there are no more.
+     */
+
+    do {
+        progress = 0;
+        for (s = nfa->states; s != NULL && !NISERR()
+                && s->no != FREESTATE; s = nexts) {
+            nexts = s->next;	/* saved first: unempty() may free s */
+            for (a = s->outs; a != NULL && !NISERR(); a = nexta) {
+                nexta = a->outchain;	/* saved first: unempty() frees a */
+                if (a->type == EMPTY && unempty(nfa, a)) {
+                    progress = 1;
+                }
+                assert(nexta == NULL || s->no != FREESTATE);
+            }
+        }
+        if (progress && f != NULL) {
+            dumpnfa(nfa, f);
+        }
+    } while (progress && !NISERR());
+}
+
+/*
+ - unempty - optimize out an EMPTY arc, if possible
+ * Actually, as it stands this function always succeeds, but the return value
+ * is kept with an eye on possible future changes.
+ ^ static int unempty(struct nfa *, struct arc *);
+ */
+static int			/* 0 couldn't, 1 could */
+unempty(
+    struct nfa *nfa,
+    struct arc *a)
+{
+    struct state *from = a->from;
+    struct state *to = a->to;
+    int usefrom;		/* work on from, as opposed to to? */
+
+    assert(a->type == EMPTY);
+    assert(from != nfa->pre && to != nfa->post);
+
+    if (from == to) {		/* vacuous loop */
+        freearc(nfa, a);
+        return 1;
+    }
+
+    /*
+     * Decide which end to work on.
+     */
+
+    usefrom = 1;		/* default: attack from */
+    if (from->nouts > to->nins) {
+        usefrom = 0;
+    } else if (from->nouts == to->nins) {
+        /*
+         * Decide on secondary issue: move/copy fewest arcs.
+         */
+
+        if (from->nins > to->nouts) {
+            usefrom = 0;
+        }
+    }
+
+    /* The counts tested below are those after the EMPTY arc is gone. */
+    freearc(nfa, a);
+    if (usefrom) {
+        if (from->nouts == 0) {
+            /*
+             * Was the state's only outarc.
+             */
+
+            moveins(nfa, from, to);
+            freestate(nfa, from);
+        } else {
+            copyins(nfa, from, to);
+        }
+    } else {
+        if (to->nins == 0) {
+            /*
+             * Was the state's only inarc.
+             */
+
+            moveouts(nfa, to, from);
+            freestate(nfa, to);
+        } else {
+            copyouts(nfa, to, from);
+        }
+    }
+
+    return 1;
+}
+
+/*
+ - cleanup - clean up NFA after optimizations
+ * Drops every unflagged state that is not both reachable from pre and able
+ * to reach post, clears the tmp marks, then renumbers the survivors
+ * consecutively from 0 and stores the count in nfa->nstates.
+ ^ static void cleanup(struct nfa *);
+ */
+static void
+cleanup(
+    struct nfa *nfa)
+{
+    struct state *s;
+    struct state *nexts;
+    int n;
+
+    /*
+     * Clear out unreachable or dead-end states. Use pre to mark reachable,
+     * then post to mark can-reach-post.
+     */
+
+    markreachable(nfa, nfa->pre, NULL, nfa->pre);
+    markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
+    for (s = nfa->states; s != NULL; s = nexts) {
+        nexts = s->next;	/* saved first: s may be dropped */
+        if (s->tmp != nfa->post && !s->flag) {
+            dropstate(nfa, s);
+        }
+    }
+    assert(nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
+    cleartraverse(nfa, nfa->pre);
+    assert(nfa->post->nins == 0 || nfa->post->tmp == NULL);
+    /* the nins==0 (final unreachable) case will be caught later */
+
+    /*
+     * Renumber surviving states.
+     */
+
+    n = 0;
+    for (s = nfa->states; s != NULL; s = s->next) {
+        s->no = n++;
+    }
+    nfa->nstates = n;
+}
+
+/*
+ - markreachable - recursive marking of reachable states
+ * Follows outarcs from s, setting tmp to mark on every state whose tmp
+ * currently equals okay; any other state stops the descent.
+ ^ static void markreachable(struct nfa *, struct state *, struct state *,
+ ^ 	struct state *);
+ */
+static void
+markreachable(
+    struct nfa *nfa,
+    struct state *s,
+    struct state *okay,		/* consider only states with this mark */
+    struct state *mark)		/* the value to mark with */
+{
+    struct arc *a;
+
+    if (s->tmp != okay) {
+        return;
+    }
+    s->tmp = mark;
+
+    for (a = s->outs; a != NULL; a = a->outchain) {
+        markreachable(nfa, a->to, okay, mark);
+    }
+}
+
+/*
+ - markcanreach - recursive marking of states which can reach here
+ * Same as markreachable, but walks inarcs backward from s.
+ ^ static void markcanreach(struct nfa *, struct state *, struct state *,
+ ^ 	struct state *);
+ */
+static void
+markcanreach(
+    struct nfa *nfa,
+    struct state *s,
+    struct state *okay,		/* consider only states with this mark */
+    struct state *mark)		/* the value to mark with */
+{
+    struct arc *a;
+
+    if (s->tmp != okay) {
+        return;
+    }
+    s->tmp = mark;
+
+    for (a = s->ins; a != NULL; a = a->inchain) {
+        markcanreach(nfa, a->from, okay, mark);
+    }
+}
+
+/*
+ - analyze - ascertain potentially-useful facts about an optimized NFA
+ * Returns REG_UIMPOSSIBLE when pre has no outarcs, REG_UEMPTYMATCH when
+ * some state one arc away from pre has an arc to post, and 0 otherwise.
+ ^ static long analyze(struct nfa *);
+ */
+static long			/* re_info bits to be ORed in */
+analyze(
+    struct nfa *nfa)
+{
+    struct arc *a;
+    struct arc *aa;
+
+    if (nfa->pre->outs == NULL) {
+        return REG_UIMPOSSIBLE;
+    }
+    for (a = nfa->pre->outs; a != NULL; a = a->outchain) {
+        for (aa = a->to->outs; aa != NULL; aa = aa->outchain) {
+            if (aa->to == nfa->post) {
+                return REG_UEMPTYMATCH;
+            }
+        }
+    }
+    return 0;
+}
+
+/*
+ - compact - compact an NFA
+ * Flattens the linked NFA into cnfa: one contiguous carc array in which
+ * each state owns a leading flags slot, its PLAIN/LACON outarcs (sorted
+ * with carcsort), and a COLORLESS end marker.  On allocation failure both
+ * buffers are released and REG_ESPACE is reported.
+ ^ static void compact(struct nfa *, struct cnfa *);
+ */
+static void
+compact(
+    struct nfa *nfa,
+    struct cnfa *cnfa)
+{
+    struct state *s;
+    struct arc *a;
+    size_t nstates;
+    size_t narcs;
+    struct carc *ca;
+    struct carc *first;
+
+    assert(!NISERR());
+
+    /* First pass: count states and carc slots so both arrays can be sized. */
+    nstates = 0;
+    narcs = 0;
+    for (s = nfa->states; s != NULL; s = s->next) {
+        nstates++;
+        narcs += 1 + s->nouts + 1;
+        /* 1 as a fake for flags, nouts for arcs, 1 as endmarker */
+    }
+
+    cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
+    cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
+    if (cnfa->states == NULL || cnfa->arcs == NULL) {
+        if (cnfa->states != NULL) {
+            FREE(cnfa->states);
+        }
+        if (cnfa->arcs != NULL) {
+            FREE(cnfa->arcs);
+        }
+        NERR(REG_ESPACE);
+        return;
+    }
+    cnfa->nstates = nstates;
+    cnfa->pre = nfa->pre->no;
+    cnfa->post = nfa->post->no;
+    cnfa->bos[0] = nfa->bos[0];
+    cnfa->bos[1] = nfa->bos[1];
+    cnfa->eos[0] = nfa->eos[0];
+    cnfa->eos[1] = nfa->eos[1];
+    cnfa->ncolors = maxcolor(nfa->cm) + 1;
+    cnfa->flags = 0;
+
+    /* Second pass: emit each state's slots, indexed by state number. */
+    ca = cnfa->arcs;
+    for (s = nfa->states; s != NULL; s = s->next) {
+        assert((size_t) s->no < nstates);
+        cnfa->states[s->no] = ca;
+        ca->co = 0;		/* clear and skip flags "arc" */
+        ca++;
+        first = ca;
+        for (a = s->outs; a != NULL; a = a->outchain) {
+            switch (a->type) {
+            case PLAIN:
+                ca->co = a->co;
+                ca->to = a->to->no;
+                ca++;
+                break;
+            case LACON:
+                assert(s->no != cnfa->pre);
+                /* LACON numbers are stored offset past the real colors */
+                ca->co = (color) (cnfa->ncolors + a->co);
+                ca->to = a->to->no;
+                ca++;
+                cnfa->flags |= HASLACONS;
+                break;
+            default:
+                assert(NOTREACHED);
+                break;
+            }
+        }
+        carcsort(first, ca-1);	/* second argument is the LAST arc, inclusive */
+        ca->co = COLORLESS;
+        ca->to = 0;
+        ca++;
+    }
+    assert(ca == &cnfa->arcs[narcs]);
+    assert(cnfa->nstates != 0);
+
+    /*
+     * Mark no-progress states.
+     */
+
+    for (a = nfa->pre->outs; a != NULL; a = a->outchain) {
+        cnfa->states[a->to->no]->co = 1;
+    }
+    cnfa->states[nfa->pre->no]->co = 1;
+}
+
+/*
+ - carcsort - sort compacted-NFA arcs by color
+ * Really dumb algorithm, but if the list is long enough for that to matter,
+ * you're in real trouble anyway.
+ ^ static void carcsort(struct carc *, struct carc *); + */ +static void +carcsort( + struct carc *first, + struct carc *last) +{ + struct carc *p; + struct carc *q; + struct carc tmp; + + if (last - first <= 1) { + return; + } + + for (p = first; p <= last; p++) { + for (q = p; q <= last; q++) { + if (p->co > q->co || (p->co == q->co && p->to > q->to)) { + assert(p != q); + tmp = *p; + *p = *q; + *q = tmp; + } + } + } +} + +/* + - freecnfa - free a compacted NFA + ^ static void freecnfa(struct cnfa *); + */ +static void +freecnfa( + struct cnfa *cnfa) +{ + assert(cnfa->nstates != 0); /* not empty already */ + cnfa->nstates = 0; + FREE(cnfa->states); + FREE(cnfa->arcs); +} + +/* + - dumpnfa - dump an NFA in human-readable form + ^ static void dumpnfa(struct nfa *, FILE *); + */ +static void +dumpnfa( + struct nfa *nfa, + FILE *f) +{ +#ifdef REG_DEBUG + struct state *s; + + fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no); + if (nfa->bos[0] != COLORLESS) { + fprintf(f, ", bos [%ld]", (long) nfa->bos[0]); + } + if (nfa->bos[1] != COLORLESS) { + fprintf(f, ", bol [%ld]", (long) nfa->bos[1]); + } + if (nfa->eos[0] != COLORLESS) { + fprintf(f, ", eos [%ld]", (long) nfa->eos[0]); + } + if (nfa->eos[1] != COLORLESS) { + fprintf(f, ", eol [%ld]", (long) nfa->eos[1]); + } + fprintf(f, "\n"); + for (s = nfa->states; s != NULL; s = s->next) { + dumpstate(s, f); + } + if (nfa->parent == NULL) { + dumpcolors(nfa->cm, f); + } + fflush(f); +#endif +} + +#ifdef REG_DEBUG /* subordinates of dumpnfa */ +/* + ^ #ifdef REG_DEBUG + */ + +/* + - dumpstate - dump an NFA state in human-readable form + ^ static void dumpstate(struct state *, FILE *); + */ +static void +dumpstate( + struct state *s, + FILE *f) +{ + struct arc *a; + + fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "", + (s->flag) ? 
s->flag : '.'); + if (s->prev != NULL && s->prev->next != s) { + fprintf(f, "\tstate chain bad\n"); + } + if (s->nouts == 0) { + fprintf(f, "\tno out arcs\n"); + } else { + dumparcs(s, f); + } + fflush(f); + for (a = s->ins; a != NULL; a = a->inchain) { + if (a->to != s) { + fprintf(f, "\tlink from %d to %d on %d's in-chain\n", + a->from->no, a->to->no, s->no); + } + } +} + +/* + - dumparcs - dump out-arcs in human-readable form + ^ static void dumparcs(struct state *, FILE *); + */ +static void +dumparcs( + struct state *s, + FILE *f) +{ + int pos; + + assert(s->nouts > 0); + /* printing arcs in reverse order is usually clearer */ + pos = dumprarcs(s->outs, s, f, 1); + if (pos != 1) { + fprintf(f, "\n"); + } +} + +/* + - dumprarcs - dump remaining outarcs, recursively, in reverse order + ^ static int dumprarcs(struct arc *, struct state *, FILE *, int); + */ +static int /* resulting print position */ +dumprarcs( + struct arc *a, + struct state *s, + FILE *f, + int pos) /* initial print position */ +{ + if (a->outchain != NULL) { + pos = dumprarcs(a->outchain, s, f, pos); + } + dumparc(a, s, f); + if (pos == 5) { + fprintf(f, "\n"); + pos = 1; + } else { + pos++; + } + return pos; +} + +/* + - dumparc - dump one outarc in readable form, including prefixing tab + ^ static void dumparc(struct arc *, struct state *, FILE *); + */ +static void +dumparc( + struct arc *a, + struct state *s, + FILE *f) +{ + struct arc *aa; + struct arcbatch *ab; + + fprintf(f, "\t"); + switch (a->type) { + case PLAIN: + fprintf(f, "[%ld]", (long) a->co); + break; + case AHEAD: + fprintf(f, ">%ld>", (long) a->co); + break; + case BEHIND: + fprintf(f, "<%ld<", (long) a->co); + break; + case LACON: + fprintf(f, ":%ld:", (long) a->co); + break; + case '^': + case '$': + fprintf(f, "%c%d", a->type, (int) a->co); + break; + case EMPTY: + break; + default: + fprintf(f, "0x%x/0%lo", a->type, (long) a->co); + break; + } + if (a->from != s) { + fprintf(f, "?%d?", a->from->no); + } + for (ab = 
&a->from->oas; ab != NULL; ab = ab->next) { + for (aa = &ab->a[0]; aa < &ab->a[ABSIZE]; aa++) { + if (aa == a) { + break; /* NOTE BREAK OUT */ + } + } + if (aa < &ab->a[ABSIZE]) { /* propagate break */ + break; /* NOTE BREAK OUT */ + } + } + if (ab == NULL) { + fprintf(f, "?!?"); /* not in allocated space */ + } + fprintf(f, "->"); + if (a->to == NULL) { + fprintf(f, "NULL"); + return; + } + fprintf(f, "%d", a->to->no); + for (aa = a->to->ins; aa != NULL; aa = aa->inchain) { + if (aa == a) { + break; /* NOTE BREAK OUT */ + } + } + if (aa == NULL) { + fprintf(f, "?!?"); /* missing from in-chain */ + } +} + +/* + ^ #endif + */ +#endif /* ifdef REG_DEBUG */ + +/* + - dumpcnfa - dump a compacted NFA in human-readable form + ^ static void dumpcnfa(struct cnfa *, FILE *); + */ +static void +dumpcnfa( + struct cnfa *cnfa, + FILE *f) +{ +#ifdef REG_DEBUG + int st; + + fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post); + if (cnfa->bos[0] != COLORLESS) { + fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]); + } + if (cnfa->bos[1] != COLORLESS) { + fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]); + } + if (cnfa->eos[0] != COLORLESS) { + fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]); + } + if (cnfa->eos[1] != COLORLESS) { + fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]); + } + if (cnfa->flags&HASLACONS) { + fprintf(f, ", haslacons"); + } + fprintf(f, "\n"); + for (st = 0; st < cnfa->nstates; st++) { + dumpcstate(st, cnfa->states[st], cnfa, f); + } + fflush(f); +#endif +} + +#ifdef REG_DEBUG /* subordinates of dumpcnfa */ +/* + ^ #ifdef REG_DEBUG + */ + +/* + - dumpcstate - dump a compacted-NFA state in human-readable form + ^ static void dumpcstate(int, struct carc *, struct cnfa *, FILE *); + */ +static void +dumpcstate( + int st, + struct carc *ca, + struct cnfa *cnfa, + FILE *f) +{ + int i; + int pos; + + fprintf(f, "%d%s", st, (ca[0].co) ? 
":" : "."); + pos = 1; + for (i = 1; ca[i].co != COLORLESS; i++) { + if (ca[i].co < cnfa->ncolors) { + fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to); + } else { + fprintf(f, "\t:%ld:->%d", (long) ca[i].co-cnfa->ncolors,ca[i].to); + } + if (pos == 5) { + fprintf(f, "\n"); + pos = 1; + } else { + pos++; + } + } + if (i == 1 || pos != 1) { + fprintf(f, "\n"); + } + fflush(f); +} + +/* + ^ #endif + */ +#endif /* ifdef REG_DEBUG */ + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ diff --git a/contrib/hsrex/regcomp.c b/contrib/hsrex/regcomp.c new file mode 100644 index 0000000..8ff77ad --- /dev/null +++ b/contrib/hsrex/regcomp.c @@ -0,0 +1,2169 @@ +/* + * re_*comp and friends - compile REs + * This file #includes several others (see the bottom). + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation of + * software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "regguts.h" + +/* + * forward declarations, up here so forward datatypes etc. are defined early + */ +/* =====^!^===== begin forwards =====^!^===== */ +/* automatically gathered by fwd; do not hand-edit */ +/* === regcomp.c === */ +int compile(regex_t *, const chr *, size_t, int); +static void moresubs(struct vars *, int); +static int freev(struct vars *, int); +static void makesearch(struct vars *, struct nfa *); +static struct subre *parse(struct vars *, int, int, struct state *, struct state *); +static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int); +static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *); +static void nonword(struct vars *, int, struct state *, struct state *); +static void word(struct vars *, int, struct state *, struct state *); +static int scannum(struct vars *); +static void repeat(struct vars *, struct state *, struct state *, int, int); +static void bracket(struct vars *, struct state *, struct state *); +static void cbracket(struct vars *, struct state *, struct state *); +static void brackpart(struct vars *, struct state *, struct state *); +static const chr *scanplain(struct vars *); +static void onechr(struct vars *, pchr, struct state *, struct state *); +static void dovec(struct vars *, struct cvec *, struct state *, struct state *); +static void wordchrs(struct vars *); +static struct subre *subre(struct vars *, int, 
int, struct state *, struct state *); +static void freesubre(struct vars *, struct subre *); +static void freesrnode(struct vars *, struct subre *); +static void optst(struct vars *, struct subre *); +static int numst(struct subre *, int); +static void markst(struct subre *); +static void cleanst(struct vars *); +static long nfatree(struct vars *, struct subre *, FILE *); +static long nfanode(struct vars *, struct subre *, FILE *); +static int newlacon(struct vars *, struct state *, struct state *, int); +static void freelacons(struct subre *, int); +static void rfree(regex_t *); +static void dump(regex_t *, FILE *); +static void dumpst(struct subre *, FILE *, int); +static void stdump(struct subre *, FILE *, int); +static const char *stid(struct subre *, char *, size_t); +/* === regc_lex.c === */ +static void lexstart(struct vars *); +static void prefixes(struct vars *); +static void lexnest(struct vars *, const chr *, const chr *); +static void lexword(struct vars *); +static int next(struct vars *); +static int lexescape(struct vars *); +static chr lexdigits(struct vars *, int, int, int); +static int brenext(struct vars *, pchr); +static void skip(struct vars *); +static chr newline(NOPARMS); +#ifdef REG_DEBUG +static const chr *ch(NOPARMS); +#endif +static chr chrnamed(struct vars *, const chr *, const chr *, pchr); +/* === regc_color.c === */ +static void initcm(struct vars *, struct colormap *); +static void freecm(struct colormap *); +static void cmtreefree(struct colormap *, union tree *, int); +static color setcolor(struct colormap *, pchr, pcolor); +static color maxcolor(struct colormap *); +static color newcolor(struct colormap *); +static void freecolor(struct colormap *, pcolor); +static color pseudocolor(struct colormap *); +static color subcolor(struct colormap *, pchr c); +static color newsub(struct colormap *, pcolor); +static void subrange(struct vars *, pchr, pchr, struct state *, struct state *); +static void subblock(struct vars *, pchr, struct 
state *, struct state *); +static void okcolors(struct nfa *, struct colormap *); +static void colorchain(struct colormap *, struct arc *); +static void uncolorchain(struct colormap *, struct arc *); +static void rainbow(struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *); +static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *); +#ifdef REG_DEBUG +static void dumpcolors(struct colormap *, FILE *); +static void fillcheck(struct colormap *, union tree *, int, FILE *); +static void dumpchr(pchr, FILE *); +#endif +/* === regc_nfa.c === */ +static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *); +static void freenfa(struct nfa *); +static struct state *newstate(struct nfa *); +static struct state *newfstate(struct nfa *, int flag); +static void dropstate(struct nfa *, struct state *); +static void freestate(struct nfa *, struct state *); +static void destroystate(struct nfa *, struct state *); +static void newarc(struct nfa *, int, pcolor, struct state *, struct state *); +static struct arc *allocarc(struct nfa *, struct state *); +static void freearc(struct nfa *, struct arc *); +static struct arc *findarc(struct state *, int, pcolor); +static void cparc(struct nfa *, struct arc *, struct state *, struct state *); +static void moveins(struct nfa *, struct state *, struct state *); +static void copyins(struct nfa *, struct state *, struct state *); +static void moveouts(struct nfa *, struct state *, struct state *); +static void copyouts(struct nfa *, struct state *, struct state *); +static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int); +static void delsub(struct nfa *, struct state *, struct state *); +static void deltraverse(struct nfa *, struct state *, struct state *); +static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *); +static void duptraverse(struct nfa *, struct state *, struct 
state *, int); +static void cleartraverse(struct nfa *, struct state *); +static void specialcolors(struct nfa *); +static long optimize(struct nfa *, FILE *); +static void pullback(struct nfa *, FILE *); +static int pull(struct nfa *, struct arc *); +static void pushfwd(struct nfa *, FILE *); +static int push(struct nfa *, struct arc *); +#define INCOMPATIBLE 1 /* destroys arc */ +#define SATISFIED 2 /* constraint satisfied */ +#define COMPATIBLE 3 /* compatible but not satisfied yet */ +static int combine(struct arc *, struct arc *); +static void fixempties(struct nfa *, FILE *); +static int unempty(struct nfa *, struct arc *); +static void cleanup(struct nfa *); +static void markreachable(struct nfa *, struct state *, struct state *, struct state *); +static void markcanreach(struct nfa *, struct state *, struct state *, struct state *); +static long analyze(struct nfa *); +static void compact(struct nfa *, struct cnfa *); +static void carcsort(struct carc *, struct carc *); +static void freecnfa(struct cnfa *); +static void dumpnfa(struct nfa *, FILE *); +#ifdef REG_DEBUG +static void dumpstate(struct state *, FILE *); +static void dumparcs(struct state *, FILE *); +static int dumprarcs(struct arc *, struct state *, FILE *, int); +static void dumparc(struct arc *, struct state *, FILE *); +#endif +static void dumpcnfa(struct cnfa *, FILE *); +#ifdef REG_DEBUG +static void dumpcstate(int, struct carc *, struct cnfa *, FILE *); +#endif +/* === regc_cvec.c === */ +static struct cvec *clearcvec(struct cvec *); +static void addchr(struct cvec *, pchr); +static void addrange(struct cvec *, pchr, pchr); +static struct cvec *newcvec(int, int); +static struct cvec *getcvec(struct vars *, int, int); +static void freecvec(struct cvec *); +/* === regc_locale.c === */ +static celt element(struct vars *, const chr *, const chr *); +static struct cvec *range(struct vars *, celt, celt, int); +static int before(celt, celt); +static struct cvec *eclass(struct vars *, celt, int); 
+static struct cvec *cclass(struct vars *, const chr *, const chr *, int); +static struct cvec *allcases(struct vars *, pchr); +static int cmp(const chr *, const chr *, size_t); +static int casecmp(const chr *, const chr *, size_t); +/* automatically gathered by fwd; do not hand-edit */ +/* =====^!^===== end forwards =====^!^===== */ + +/* internal variables, bundled for easy passing around */ +struct vars { + regex_t *re; + const chr *now; /* scan pointer into string */ + const chr *stop; /* end of string */ + const chr *savenow; /* saved now and stop for "subroutine call" */ + const chr *savestop; + int err; /* error code (0 if none) */ + int cflags; /* copy of compile flags */ + int lasttype; /* type of previous token */ + int nexttype; /* type of next token */ + chr nextvalue; /* value (if any) of next token */ + int lexcon; /* lexical context type (see lex.c) */ + int nsubexp; /* subexpression count */ + struct subre **subs; /* subRE pointer vector */ + size_t nsubs; /* length of vector */ + struct subre *sub10[10]; /* initial vector, enough for most */ + struct nfa *nfa; /* the NFA */ + struct colormap *cm; /* character color map */ + color nlcolor; /* color of newline */ + struct state *wordchrs; /* state in nfa holding word-char outarcs */ + struct subre *tree; /* subexpression tree */ + struct subre *treechain; /* all tree nodes allocated */ + struct subre *treefree; /* any free tree nodes */ + int ntree; /* number of tree nodes */ + struct cvec *cv; /* interface cvec */ + struct cvec *cv2; /* utility cvec */ + struct subre *lacons; /* lookahead-constraint vector */ + int nlacons; /* size of lacons */ +}; + +/* parsing macros; most know that `v' is the struct vars pointer */ +#define NEXT() (next(v)) /* advance by one token */ +#define SEE(t) (v->nexttype == (t)) /* is next token this? */ +#define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */ +#define VISERR(vv) ((vv)->err != 0)/* have we seen an error yet? 
*/ +#define ISERR() VISERR(v) +#define VERR(vv,e) \ + ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err : ((vv)->err = (e))) +#define ERR(e) VERR(v, e) /* record an error */ +#define NOERR() {if (ISERR()) return;} /* if error seen, return */ +#define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */ +#define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */ +#define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false, error */ +#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */ +#define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y) + +/* token type codes, some also used as NFA arc types */ +#define EMPTY 'n' /* no token present */ +#define EOS 'e' /* end of string */ +#define PLAIN 'p' /* ordinary character */ +#define DIGIT 'd' /* digit (in bound) */ +#define BACKREF 'b' /* back reference */ +#define COLLEL 'I' /* start of [. */ +#define ECLASS 'E' /* start of [= */ +#define CCLASS 'C' /* start of [: */ +#define END 'X' /* end of [. [= [: */ +#define RANGE 'R' /* - within [] which might be range delim. */ +#define LACON 'L' /* lookahead constraint subRE */ +#define AHEAD 'a' /* color-lookahead arc */ +#define BEHIND 'r' /* color-lookbehind arc */ +#define WBDRY 'w' /* word boundary constraint */ +#define NWBDRY 'W' /* non-word-boundary constraint */ +#define SBEGIN 'A' /* beginning of string (even if not BOL) */ +#define SEND 'Z' /* end of string (even if not EOL) */ +#define PREFER 'P' /* length preference */ + +/* is an arc colored, and hence on a color chain? */ +#define COLORED(a) \ + ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND) + +/* static function list */ +static struct fns functions = { + rfree, /* regfree insides */ +}; + +/* + - compile - compile regular expression + ^ int compile(regex_t *, const chr *, size_t, int); + */ +int +compile( + regex_t *re, + const chr *string, + size_t len, + int flags) +{ + AllocVars(v); + struct guts *g; + int i; + size_t j; + FILE *debug = (flags®_PROGRESS) ? 
stdout : NULL; +#define CNOERR() { if (ISERR()) return freev(v, v->err); } + + /* + * Sanity checks. + */ + + if (re == NULL || string == NULL) { + FreeVars(v); + return REG_INVARG; + } + if ((flags®_QUOTE) && (flags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE))) { + FreeVars(v); + return REG_INVARG; + } + if (!(flags®_EXTENDED) && (flags®_ADVF)) { + FreeVars(v); + return REG_INVARG; + } + + /* + * Initial setup (after which freev() is callable). + */ + + v->re = re; + v->now = string; + v->stop = v->now + len; + v->savenow = v->savestop = NULL; + v->err = 0; + v->cflags = flags; + v->nsubexp = 0; + v->subs = v->sub10; + v->nsubs = 10; + for (j = 0; j < v->nsubs; j++) { + v->subs[j] = NULL; + } + v->nfa = NULL; + v->cm = NULL; + v->nlcolor = COLORLESS; + v->wordchrs = NULL; + v->tree = NULL; + v->treechain = NULL; + v->treefree = NULL; + v->cv = NULL; + v->cv2 = NULL; + v->lacons = NULL; + v->nlacons = 0; + re->re_magic = REMAGIC; + re->re_info = 0; /* bits get set during parse */ + re->re_csize = sizeof(chr); + re->re_guts = NULL; + re->re_fns = VS(&functions); + + /* + * More complex setup, malloced things. + */ + + re->re_guts = VS(MALLOC(sizeof(struct guts))); + if (re->re_guts == NULL) { + return freev(v, REG_ESPACE); + } + g = (struct guts *) re->re_guts; + g->tree = NULL; + initcm(v, &g->cmap); + v->cm = &g->cmap; + g->lacons = NULL; + g->nlacons = 0; + ZAPCNFA(g->search); + v->nfa = newnfa(v, v->cm, NULL); + CNOERR(); + v->cv = newcvec(100, 20); + if (v->cv == NULL) { + return freev(v, REG_ESPACE); + } + + /* + * Parsing. + */ + + lexstart(v); /* also handles prefixes */ + if ((v->cflags®_NLSTOP) || (v->cflags®_NLANCH)) { + /* + * Assign newline a unique color. + */ + + v->nlcolor = subcolor(v->cm, newline()); + okcolors(v->nfa, v->cm); + } + CNOERR(); + v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final); + assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */ + CNOERR(); + assert(v->tree != NULL); + + /* + * Finish setup of nfa and its subre tree. 
+ */ + + specialcolors(v->nfa); + CNOERR(); + if (debug != NULL) { + fprintf(debug, "\n\n\n========= RAW ==========\n"); + dumpnfa(v->nfa, debug); + dumpst(v->tree, debug, 1); + } + optst(v, v->tree); + v->ntree = numst(v->tree, 1); + markst(v->tree); + cleanst(v); + if (debug != NULL) { + fprintf(debug, "\n\n\n========= TREE FIXED ==========\n"); + dumpst(v->tree, debug, 1); + } + + /* + * Build compacted NFAs for tree and lacons. + */ + + re->re_info |= nfatree(v, v->tree, debug); + CNOERR(); + assert(v->nlacons == 0 || v->lacons != NULL); + for (i = 1; i < v->nlacons; i++) { + if (debug != NULL) { + fprintf(debug, "\n\n\n========= LA%d ==========\n", i); + } + nfanode(v, &v->lacons[i], debug); + } + CNOERR(); + if (v->tree->flags&SHORTER) { + NOTE(REG_USHORTEST); + } + + /* + * Build compacted NFAs for tree, lacons, fast search. + */ + + if (debug != NULL) { + fprintf(debug, "\n\n\n========= SEARCH ==========\n"); + } + + /* + * Can sacrifice main NFA now, so use it as work area. + */ + + (DISCARD) optimize(v->nfa, debug); + CNOERR(); + makesearch(v, v->nfa); + CNOERR(); + compact(v->nfa, &g->search); + CNOERR(); + + /* + * Looks okay, package it up. + */ + + re->re_nsub = v->nsubexp; + v->re = NULL; /* freev no longer frees re */ + g->magic = GUTSMAGIC; + g->cflags = v->cflags; + g->info = re->re_info; + g->nsub = re->re_nsub; + g->tree = v->tree; + v->tree = NULL; + g->ntree = v->ntree; + g->compare = (v->cflags®_ICASE) ? 
casecmp : cmp; + g->lacons = v->lacons; + v->lacons = NULL; + g->nlacons = v->nlacons; + + if (flags®_DUMP) { + dump(re, stdout); + } + + assert(v->err == 0); + return freev(v, 0); +} + +/* + - moresubs - enlarge subRE vector + ^ static void moresubs(struct vars *, int); + */ +static void +moresubs( + struct vars *v, + int wanted) /* want enough room for this one */ +{ + struct subre **p; + size_t n; + + assert(wanted > 0 && (size_t)wanted >= v->nsubs); + n = (size_t)wanted * 3 / 2 + 1; + if (v->subs == v->sub10) { + p = (struct subre **) MALLOC(n * sizeof(struct subre *)); + if (p != NULL) { + memcpy(p, v->subs, v->nsubs * sizeof(struct subre *)); + } + } else { + p = (struct subre **) REALLOC(v->subs, n*sizeof(struct subre *)); + } + if (p == NULL) { + ERR(REG_ESPACE); + return; + } + + v->subs = p; + for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++) { + *p = NULL; + } + assert(v->nsubs == n); + assert((size_t)wanted < v->nsubs); +} + +/* + - freev - free vars struct's substructures where necessary + * Optionally does error-number setting, and always returns error code (if + * any), to make error-handling code terser. + ^ static int freev(struct vars *, int); + */ +static int +freev( + struct vars *v, + int err) +{ + register int ret; + + if (v->re != NULL) { + rfree(v->re); + } + if (v->subs != v->sub10) { + FREE(v->subs); + } + if (v->nfa != NULL) { + freenfa(v->nfa); + } + if (v->tree != NULL) { + freesubre(v, v->tree); + } + if (v->treechain != NULL) { + cleanst(v); + } + if (v->cv != NULL) { + freecvec(v->cv); + } + if (v->cv2 != NULL) { + freecvec(v->cv2); + } + if (v->lacons != NULL) { + freelacons(v->lacons, v->nlacons); + } + ERR(err); /* nop if err==0 */ + + ret = v->err; + FreeVars(v); + return ret; +} + +/* + - makesearch - turn an NFA into a search NFA (implicit prepend of .*?) + * NFA must have been optimize()d already. 
+ ^ static void makesearch(struct vars *, struct nfa *); + */ +static void +makesearch( + struct vars *v, + struct nfa *nfa) +{ + struct arc *a, *b; + struct state *pre = nfa->pre; + struct state *s, *s2, *slist; + + /* + * No loops are needed if it's anchored. + */ + + for (a = pre->outs; a != NULL; a = a->outchain) { + assert(a->type == PLAIN); + if (a->co != nfa->bos[0] && a->co != nfa->bos[1]) { + break; + } + } + if (a != NULL) { + /* + * Add implicit .* in front. + */ + + rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre); + + /* + * And ^* and \A* too -- not always necessary, but harmless. + */ + + newarc(nfa, PLAIN, nfa->bos[0], pre, pre); + newarc(nfa, PLAIN, nfa->bos[1], pre, pre); + } + + /* + * Now here's the subtle part. Because many REs have no lookback + * constraints, often knowing when you were in the pre state tells you + * little; it's the next state(s) that are informative. But some of them + * may have other inarcs, i.e. it may be possible to make actual progress + * and then return to one of them. We must de-optimize such cases, + * splitting each such state into progress and no-progress states. + */ + + /* + * First, make a list of the states. + */ + + slist = NULL; + for (a=pre->outs ; a!=NULL ; a=a->outchain) { + s = a->to; + for (b=s->ins ; b!=NULL ; b=b->inchain) { + if (b->from != pre) { + break; + } + } + if (b != NULL && s->tmp == NULL) { + /* + * Must be split if not already in the list (fixes bugs 505048, + * 230589, 840258, 504785). + */ + + s->tmp = slist; + slist = s; + } + } + + /* + * Do the splits. + */ + + for (s=slist ; s!=NULL ; s=s2) { + s2 = newstate(nfa); + + copyouts(nfa, s, s2); + for (a=s->ins ; a!=NULL ; a=b) { + b = a->inchain; + + if (a->from != pre) { + cparc(nfa, a, a->from, s2); + freearc(nfa, a); + } + } + s2 = s->tmp; + s->tmp = NULL; /* clean up while we're at it */ + } +} + +/* + - parse - parse an RE + * This is actually just the top level, which parses a bunch of branches tied + * together with '|'. 
They appear in the tree as the left children of a chain + * of '|' subres. + ^ static struct subre *parse(struct vars *, int, int, struct state *, + ^ struct state *); + */ +static struct subre * +parse( + struct vars *v, + int stopper, /* EOS or ')' */ + int type, /* LACON (lookahead subRE) or PLAIN */ + struct state *init, /* initial state */ + struct state *final) /* final state */ +{ + struct state *left, *right; /* scaffolding for branch */ + struct subre *branches; /* top level */ + struct subre *branch; /* current branch */ + struct subre *t; /* temporary */ + int firstbranch; /* is this the first branch? */ + + assert(stopper == ')' || stopper == EOS); + + branches = subre(v, '|', LONGER, init, final); + NOERRN(); + branch = branches; + firstbranch = 1; + do { /* a branch */ + if (!firstbranch) { + /* + * Need a place to hang the branch. + */ + + branch->right = subre(v, '|', LONGER, init, final); + NOERRN(); + branch = branch->right; + } + firstbranch = 0; + left = newstate(v->nfa); + right = newstate(v->nfa); + NOERRN(); + EMPTYARC(init, left); + EMPTYARC(right, final); + NOERRN(); + branch->left = parsebranch(v, stopper, type, left, right, 0); + NOERRN(); + branch->flags |= UP(branch->flags | branch->left->flags); + if ((branch->flags &~ branches->flags) != 0) { /* new flags */ + for (t = branches; t != branch; t = t->right) { + t->flags |= branch->flags; + } + } + } while (EAT('|')); + assert(SEE(stopper) || SEE(EOS)); + + if (!SEE(stopper)) { + assert(stopper == ')' && SEE(EOS)); + ERR(REG_EPAREN); + } + + /* + * Optimize out simple cases. 
+ */ + + if (branch == branches) { /* only one branch */ + assert(branch->right == NULL); + t = branch->left; + branch->left = NULL; + freesubre(v, branches); + branches = t; + } else if (!MESSY(branches->flags)) { /* no interesting innards */ + freesubre(v, branches->left); + branches->left = NULL; + freesubre(v, branches->right); + branches->right = NULL; + branches->op = '='; + } + + return branches; +} + +/* + - parsebranch - parse one branch of an RE + * This mostly manages concatenation, working closely with parseqatom(). + * Concatenated things are bundled up as much as possible, with separate + * ',' nodes introduced only when necessary due to substructure. + ^ static struct subre *parsebranch(struct vars *, int, int, struct state *, + ^ struct state *, int); + */ +static struct subre * +parsebranch( + struct vars *v, + int stopper, /* EOS or ')' */ + int type, /* LACON (lookahead subRE) or PLAIN */ + struct state *left, /* leftmost state */ + struct state *right, /* rightmost state */ + int partial) /* is this only part of a branch? */ +{ + struct state *lp; /* left end of current construct */ + int seencontent; /* is there anything in this branch yet? 
*/ + struct subre *t; + + lp = left; + seencontent = 0; + t = subre(v, '=', 0, left, right); /* op '=' is tentative */ + NOERRN(); + while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) { + if (seencontent) { /* implicit concat operator */ + lp = newstate(v->nfa); + NOERRN(); + moveins(v->nfa, right, lp); + } + seencontent = 1; + + /* NB, recursion in parseqatom() may swallow rest of branch */ + parseqatom(v, stopper, type, lp, right, t); + } + + if (!seencontent) { /* empty branch */ + if (!partial) { + NOTE(REG_UUNSPEC); + } + assert(lp == left); + EMPTYARC(left, right); + } + + return t; +} + +/* + - parseqatom - parse one quantified atom or constraint of an RE + * The bookkeeping near the end cooperates very closely with parsebranch(); in + * particular, it contains a recursion that can involve parsing the rest of + * the branch, making this function's name somewhat inaccurate. + ^ static void parseqatom(struct vars *, int, int, struct state *, + ^ struct state *, struct subre *); + */ +static void +parseqatom( + struct vars *v, + int stopper, /* EOS or ')' */ + int type, /* LACON (lookahead subRE) or PLAIN */ + struct state *lp, /* left state to hang it on */ + struct state *rp, /* right state to hang it on */ + struct subre *top) /* subtree top */ +{ + struct state *s; /* temporaries for new states */ + struct state *s2; +#define ARCV(t, val) newarc(v->nfa, t, val, lp, rp) + int m, n; + struct subre *atom; /* atom's subtree */ + struct subre *t; + int cap; /* capturing parens? */ + int pos; /* positive lookahead? */ + int subno; /* capturing-parens or backref number */ + int atomtype; + int qprefer; /* quantifier short/long preference */ + int f; + struct subre **atomp; /* where the pointer to atom is */ + + /* + * Initial bookkeeping. + */ + + atom = NULL; + assert(lp->nouts == 0); /* must string new code */ + assert(rp->nins == 0); /* between lp and rp */ + subno = 0; /* just to shut lint up */ + + /* + * An atom or constraint... 
+ */ + + atomtype = v->nexttype; + switch (atomtype) { + /* first, constraints, which end by returning */ + case '^': + ARCV('^', 1); + if (v->cflags®_NLANCH) { + ARCV(BEHIND, v->nlcolor); + } + NEXT(); + return; + case '$': + ARCV('$', 1); + if (v->cflags®_NLANCH) { + ARCV(AHEAD, v->nlcolor); + } + NEXT(); + return; + case SBEGIN: + ARCV('^', 1); /* BOL */ + ARCV('^', 0); /* or BOS */ + NEXT(); + return; + case SEND: + ARCV('$', 1); /* EOL */ + ARCV('$', 0); /* or EOS */ + NEXT(); + return; + case '<': + wordchrs(v); /* does NEXT() */ + s = newstate(v->nfa); + NOERR(); + nonword(v, BEHIND, lp, s); + word(v, AHEAD, s, rp); + return; + case '>': + wordchrs(v); /* does NEXT() */ + s = newstate(v->nfa); + NOERR(); + word(v, BEHIND, lp, s); + nonword(v, AHEAD, s, rp); + return; + case WBDRY: + wordchrs(v); /* does NEXT() */ + s = newstate(v->nfa); + NOERR(); + nonword(v, BEHIND, lp, s); + word(v, AHEAD, s, rp); + s = newstate(v->nfa); + NOERR(); + word(v, BEHIND, lp, s); + nonword(v, AHEAD, s, rp); + return; + case NWBDRY: + wordchrs(v); /* does NEXT() */ + s = newstate(v->nfa); + NOERR(); + word(v, BEHIND, lp, s); + word(v, AHEAD, s, rp); + s = newstate(v->nfa); + NOERR(); + nonword(v, BEHIND, lp, s); + nonword(v, AHEAD, s, rp); + return; + case LACON: /* lookahead constraint */ + pos = v->nextvalue; + NEXT(); + s = newstate(v->nfa); + s2 = newstate(v->nfa); + NOERR(); + t = parse(v, ')', LACON, s, s2); + freesubre(v, t); /* internal structure irrelevant */ + assert(SEE(')') || ISERR()); + NEXT(); + n = newlacon(v, s, s2, pos); + NOERR(); + ARCV(LACON, n); + return; + + /* + * Then errors, to get them out of the way. + */ + + case '*': + case '+': + case '?': + case '{': + ERR(REG_BADRPT); + return; + default: + ERR(REG_ASSERT); + return; + + /* + * Then plain characters, and minor variants on that theme. 
+ */ + + case ')': /* unbalanced paren */ + if ((v->cflags®_ADVANCED) != REG_EXTENDED) { + ERR(REG_EPAREN); + return; + } + + /* + * Legal in EREs due to specification botch. + */ + + NOTE(REG_UPBOTCH); + /* fallthrough into case PLAIN */ + case PLAIN: + onechr(v, v->nextvalue, lp, rp); + okcolors(v->nfa, v->cm); + NOERR(); + NEXT(); + break; + case '[': + if (v->nextvalue == 1) { + bracket(v, lp, rp); + } else { + cbracket(v, lp, rp); + } + assert(SEE(']') || ISERR()); + NEXT(); + break; + case '.': + rainbow(v->nfa, v->cm, PLAIN, + (v->cflags®_NLSTOP) ? v->nlcolor : COLORLESS, lp, rp); + NEXT(); + break; + + /* + * And finally the ugly stuff. + */ + + case '(': /* value flags as capturing or non */ + cap = (type == LACON) ? 0 : v->nextvalue; + if (cap) { + v->nsubexp++; + subno = v->nsubexp; + if ((size_t)subno >= v->nsubs) { + moresubs(v, subno); + } + assert((size_t)subno < v->nsubs); + } else { + atomtype = PLAIN; /* something that's not '(' */ + } + NEXT(); + + /* + * Need new endpoints because tree will contain pointers. + */ + + s = newstate(v->nfa); + s2 = newstate(v->nfa); + NOERR(); + EMPTYARC(lp, s); + EMPTYARC(s2, rp); + NOERR(); + atom = parse(v, ')', PLAIN, s, s2); + assert(SEE(')') || ISERR()); + NEXT(); + NOERR(); + if (cap) { + v->subs[subno] = atom; + t = subre(v, '(', atom->flags|CAP, lp, rp); + NOERR(); + t->subno = subno; + t->left = atom; + atom = t; + } + + /* + * Postpone everything else pending possible {0}. + */ + + break; + case BACKREF: /* the Feature From The Black Lagoon */ + INSIST(type != LACON, REG_ESUBREG); + INSIST(v->nextvalue < v->nsubs, REG_ESUBREG); + INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG); + NOERR(); + assert(v->nextvalue > 0); + atom = subre(v, 'b', BACKR, lp, rp); + subno = v->nextvalue; + atom->subno = subno; + EMPTYARC(lp, rp); /* temporarily, so there's something */ + NEXT(); + break; + } + + /* + * ...and an atom may be followed by a quantifier. 
+ */ + + switch (v->nexttype) { + case '*': + m = 0; + n = INFINITY; + qprefer = (v->nextvalue) ? LONGER : SHORTER; + NEXT(); + break; + case '+': + m = 1; + n = INFINITY; + qprefer = (v->nextvalue) ? LONGER : SHORTER; + NEXT(); + break; + case '?': + m = 0; + n = 1; + qprefer = (v->nextvalue) ? LONGER : SHORTER; + NEXT(); + break; + case '{': + NEXT(); + m = scannum(v); + if (EAT(',')) { + if (SEE(DIGIT)) { + n = scannum(v); + } else { + n = INFINITY; + } + if (m > n) { + ERR(REG_BADBR); + return; + } + + /* + * {m,n} exercises preference, even if it's {m,m} + */ + + qprefer = (v->nextvalue) ? LONGER : SHORTER; + } else { + n = m; + /* + * {m} passes operand's preference through. + */ + + qprefer = 0; + } + if (!SEE('}')) { /* catches errors too */ + ERR(REG_BADBR); + return; + } + NEXT(); + break; + default: /* no quantifier */ + m = n = 1; + qprefer = 0; + break; + } + + /* + * Annoying special case: {0} or {0,0} cancels everything. + */ + + if (m == 0 && n == 0) { + if (atom != NULL) { + freesubre(v, atom); + } + if (atomtype == '(') { + v->subs[subno] = NULL; + } + delsub(v->nfa, lp, rp); + EMPTYARC(lp, rp); + return; + } + + /* + * If not a messy case, avoid hard part. + */ + + assert(!MESSY(top->flags)); + f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0); + if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) { + if (!(m == 1 && n == 1)) { + repeat(v, lp, rp, m, n); + } + if (atom != NULL) { + freesubre(v, atom); + } + top->flags = f; + return; + } + + /* + * hard part: something messy + * That is, capturing parens, back reference, short/long clash, or an atom + * with substructure containing one of those. + */ + + /* + * Now we'll need a subre for the contents even if they're boring. + */ + + if (atom == NULL) { + atom = subre(v, '=', 0, lp, rp); + NOERR(); + } + + /* + * Prepare a general-purpose state skeleton. 
+ * + * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] + * / / + * [lp] ----> [s2] ----bypass--------------------- + * + * where bypass is an empty, and prefix is some repetitions of atom + */ + + s = newstate(v->nfa); /* first, new endpoints for the atom */ + s2 = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + moveins(v->nfa, rp, s2); + NOERR(); + atom->begin = s; + atom->end = s2; + s = newstate(v->nfa); /* and spots for prefix and bypass */ + s2 = newstate(v->nfa); + NOERR(); + EMPTYARC(lp, s); + EMPTYARC(lp, s2); + NOERR(); + + /* + * Break remaining subRE into x{...} and what follows. + */ + + t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp); + t->left = atom; + atomp = &t->left; + + /* + * Here we should recurse... but we must postpone that to the end. + */ + + /* + * Split top into prefix and remaining. + */ + + assert(top->op == '=' && top->left == NULL && top->right == NULL); + top->left = subre(v, '=', top->flags, top->begin, lp); + top->op = '.'; + top->right = t; + + /* + * If it's a backref, now is the time to replicate the subNFA. + */ + + if (atomtype == BACKREF) { + assert(atom->begin->nouts == 1); /* just the EMPTY */ + delsub(v->nfa, atom->begin, atom->end); + assert(v->subs[subno] != NULL); + + /* + * And here's why the recursion got postponed: it must wait until the + * skeleton is filled in, because it may hit a backref that wants to + * copy the filled-in skeleton. 
+ */ + + dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end, + atom->begin, atom->end); + NOERR(); + } + + /* + * It's quantifier time; first, turn x{0,...} into x{1,...}|empty + */ + + if (m == 0) { + EMPTYARC(s2, atom->end);/* the bypass */ + assert(PREF(qprefer) != 0); + f = COMBINE(qprefer, atom->flags); + t = subre(v, '|', f, lp, atom->end); + NOERR(); + t->left = atom; + t->right = subre(v, '|', PREF(f), s2, atom->end); + NOERR(); + t->right->left = subre(v, '=', 0, s2, atom->end); + NOERR(); + *atomp = t; + atomp = &t->left; + m = 1; + } + + /* + * Deal with the rest of the quantifier. + */ + + if (atomtype == BACKREF) { + /* + * Special case: backrefs have internal quantifiers. + */ + + EMPTYARC(s, atom->begin); /* empty prefix */ + + /* + * Just stuff everything into atom. + */ + + repeat(v, atom->begin, atom->end, m, n); + atom->min = (short) m; + atom->max = (short) n; + atom->flags |= COMBINE(qprefer, atom->flags); + } else if (m == 1 && n == 1) { + /* + * No/vacuous quantifier: done. + */ + + EMPTYARC(s, atom->begin); /* empty prefix */ + } else { + /* + * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only second + * x + */ + + dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin); + assert(m >= 1 && m != INFINITY && n >= 1); + repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1); + f = COMBINE(qprefer, atom->flags); + t = subre(v, '.', f, s, atom->end); /* prefix and atom */ + NOERR(); + t->left = subre(v, '=', PREF(f), s, atom->begin); + NOERR(); + t->right = atom; + *atomp = t; + } + + /* + * And finally, look after that postponed recursion. 
+ */ + + t = top->right; + if (!(SEE('|') || SEE(stopper) || SEE(EOS))) { + t->right = parsebranch(v, stopper, type, atom->end, rp, 1); + } else { + EMPTYARC(atom->end, rp); + t->right = subre(v, '=', 0, atom->end, rp); + } + assert(SEE('|') || SEE(stopper) || SEE(EOS)); + t->flags |= COMBINE(t->flags, t->right->flags); + top->flags |= COMBINE(top->flags, t->flags); +} + +/* + - nonword - generate arcs for non-word-character ahead or behind + ^ static void nonword(struct vars *, int, struct state *, struct state *); + */ +static void +nonword( + struct vars *v, + int dir, /* AHEAD or BEHIND */ + struct state *lp, + struct state *rp) +{ + int anchor = (dir == AHEAD) ? '$' : '^'; + + assert(dir == AHEAD || dir == BEHIND); + newarc(v->nfa, anchor, 1, lp, rp); + newarc(v->nfa, anchor, 0, lp, rp); + colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp); + /* (no need for special attention to \n) */ +} + +/* + - word - generate arcs for word character ahead or behind + ^ static void word(struct vars *, int, struct state *, struct state *); + */ +static void +word( + struct vars *v, + int dir, /* AHEAD or BEHIND */ + struct state *lp, + struct state *rp) +{ + assert(dir == AHEAD || dir == BEHIND); + cloneouts(v->nfa, v->wordchrs, lp, rp, dir); + /* (no need for special attention to \n) */ +} + +/* + - scannum - scan a number + ^ static int scannum(struct vars *); + */ +static int /* value, <= DUPMAX */ +scannum( + struct vars *v) +{ + int n = 0; + + while (SEE(DIGIT) && n < DUPMAX) { + n = n*10 + v->nextvalue; + NEXT(); + } + if (SEE(DIGIT) || n > DUPMAX) { + ERR(REG_BADBR); + return 0; + } + return n; +} + +/* + - repeat - replicate subNFA for quantifiers + * The duplication sequences used here are chosen carefully so that any + * pointers starting out pointing into the subexpression end up pointing into + * the last occurrence. (Note that it may not be strung between the same left + * and right end states, however!) 
This used to be important for the subRE + * tree, although the important bits are now handled by the in-line code in + * parse(), and when this is called, it doesn't matter any more. + ^ static void repeat(struct vars *, struct state *, struct state *, int, int); + */ +static void +repeat( + struct vars *v, + struct state *lp, + struct state *rp, + int m, + int n) +{ +#define SOME 2 +#define INF 3 +#define PAIR(x, y) ((x)*4 + (y)) +#define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) ) + const int rm = REDUCE(m); + const int rn = REDUCE(n); + struct state *s, *s2; + + switch (PAIR(rm, rn)) { + case PAIR(0, 0): /* empty string */ + delsub(v->nfa, lp, rp); + EMPTYARC(lp, rp); + break; + case PAIR(0, 1): /* do as x| */ + EMPTYARC(lp, rp); + break; + case PAIR(0, SOME): /* do as x{1,n}| */ + repeat(v, lp, rp, 1, n); + NOERR(); + EMPTYARC(lp, rp); + break; + case PAIR(0, INF): /* loop x around */ + s = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + moveins(v->nfa, rp, s); + EMPTYARC(lp, s); + EMPTYARC(s, rp); + break; + case PAIR(1, 1): /* no action required */ + break; + case PAIR(1, SOME): /* do as x{0,n-1}x = (x{1,n-1}|)x */ + s = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + dupnfa(v->nfa, s, rp, lp, s); + NOERR(); + repeat(v, lp, s, 1, n-1); + NOERR(); + EMPTYARC(lp, s); + break; + case PAIR(1, INF): /* add loopback arc */ + s = newstate(v->nfa); + s2 = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + moveins(v->nfa, rp, s2); + EMPTYARC(lp, s); + EMPTYARC(s2, rp); + EMPTYARC(s2, s); + break; + case PAIR(SOME, SOME): /* do as x{m-1,n-1}x */ + s = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + dupnfa(v->nfa, s, rp, lp, s); + NOERR(); + repeat(v, lp, s, m-1, n-1); + break; + case PAIR(SOME, INF): /* do as x{m-1,}x */ + s = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + dupnfa(v->nfa, s, rp, lp, s); + NOERR(); + repeat(v, lp, s, m-1, n); + break; + default: + ERR(REG_ASSERT); + break; + } +} + +/* 
+ - bracket - handle non-complemented bracket expression + * Also called from cbracket for complemented bracket expressions. + ^ static void bracket(struct vars *, struct state *, struct state *); + */ +static void +bracket( + struct vars *v, + struct state *lp, + struct state *rp) +{ + assert(SEE('[')); + NEXT(); + while (!SEE(']') && !SEE(EOS)) { + brackpart(v, lp, rp); + } + assert(SEE(']') || ISERR()); + okcolors(v->nfa, v->cm); +} + +/* + - cbracket - handle complemented bracket expression + * We do it by calling bracket() with dummy endpoints, and then complementing + * the result. The alternative would be to invoke rainbow(), and then delete + * arcs as the b.e. is seen... but that gets messy. + ^ static void cbracket(struct vars *, struct state *, struct state *); + */ +static void +cbracket( + struct vars *v, + struct state *lp, + struct state *rp) +{ + struct state *left = newstate(v->nfa); + struct state *right = newstate(v->nfa); + + NOERR(); + bracket(v, left, right); + if (v->cflags®_NLSTOP) { + newarc(v->nfa, PLAIN, v->nlcolor, left, right); + } + NOERR(); + + assert(lp->nouts == 0); /* all outarcs will be ours */ + + /* + * Easy part of complementing, and all there is to do since the MCCE code + * was removed. + */ + + colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp); + NOERR(); + dropstate(v->nfa, left); + assert(right->nins == 0); + freestate(v->nfa, right); + return; +} + +/* + - brackpart - handle one item (or range) within a bracket expression + ^ static void brackpart(struct vars *, struct state *, struct state *); + */ +static void +brackpart( + struct vars *v, + struct state *lp, + struct state *rp) +{ + celt startc, endc; + struct cvec *cv; + const chr *startp, *endp; + chr c[1]; + + /* + * Parse something, get rid of special cases, take shortcuts. 
+ */ + + switch (v->nexttype) { + case RANGE: /* a-b-c or other botch */ + ERR(REG_ERANGE); + return; + break; + case PLAIN: + c[0] = v->nextvalue; + NEXT(); + + /* + * Shortcut for ordinary chr (not range). + */ + + if (!SEE(RANGE)) { + onechr(v, c[0], lp, rp); + return; + } + startc = element(v, c, c+1); + NOERR(); + break; + case COLLEL: + startp = v->now; + endp = scanplain(v); + INSIST(startp < endp, REG_ECOLLATE); + NOERR(); + startc = element(v, startp, endp); + NOERR(); + break; + case ECLASS: + startp = v->now; + endp = scanplain(v); + INSIST(startp < endp, REG_ECOLLATE); + NOERR(); + startc = element(v, startp, endp); + NOERR(); + cv = eclass(v, startc, (v->cflags®_ICASE)); + NOERR(); + dovec(v, cv, lp, rp); + return; + break; + case CCLASS: + startp = v->now; + endp = scanplain(v); + INSIST(startp < endp, REG_ECTYPE); + NOERR(); + cv = cclass(v, startp, endp, (v->cflags®_ICASE)); + NOERR(); + dovec(v, cv, lp, rp); + return; + break; + default: + ERR(REG_ASSERT); + return; + break; + } + + if (SEE(RANGE)) { + NEXT(); + switch (v->nexttype) { + case PLAIN: + case RANGE: + c[0] = v->nextvalue; + NEXT(); + endc = element(v, c, c+1); + NOERR(); + break; + case COLLEL: + startp = v->now; + endp = scanplain(v); + INSIST(startp < endp, REG_ECOLLATE); + NOERR(); + endc = element(v, startp, endp); + NOERR(); + break; + default: + ERR(REG_ERANGE); + return; + break; + } + } else { + endc = startc; + } + + /* + * Ranges are unportable. Actually, standard C does guarantee that digits + * are contiguous, but making that an exception is just too complicated. + */ + + if (startc != endc) { + NOTE(REG_UUNPORT); + } + cv = range(v, startc, endc, (v->cflags®_ICASE)); + NOERR(); + dovec(v, cv, lp, rp); +} + +/* + - scanplain - scan PLAIN contents of [. etc. + * Certain bits of trickery in lex.c know that this code does not try to look + * past the final bracket of the [. etc. 
+ ^ static const chr *scanplain(struct vars *); + */ +static const chr * /* just after end of sequence */ +scanplain( + struct vars *v) +{ + const chr *endp; + + assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS)); + NEXT(); + + endp = v->now; + while (SEE(PLAIN)) { + endp = v->now; + NEXT(); + } + + assert(SEE(END) || ISERR()); + NEXT(); + + return endp; +} + +/* + - onechr - fill in arcs for a plain character, and possible case complements + * This is mostly a shortcut for efficient handling of the common case. + ^ static void onechr(struct vars *, pchr, struct state *, struct state *); + */ +static void +onechr( + struct vars *v, + pchr c, + struct state *lp, + struct state *rp) +{ + if (!(v->cflags&REG_ICASE)) { + newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp); + return; + } + + /* + * Rats, need general case anyway... + */ + + dovec(v, allcases(v, c), lp, rp); +} + +/* + - dovec - fill in arcs for each element of a cvec + ^ static void dovec(struct vars *, struct cvec *, struct state *, + ^ struct state *); + */ +static void +dovec( + struct vars *v, + struct cvec *cv, + struct state *lp, + struct state *rp) +{ + chr ch, from, to; + const chr *p; + int i; + + for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) { + ch = *p; + newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp); + } + + for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) { + from = *p; + to = *(p+1); + if (from <= to) { + subrange(v, from, to, lp, rp); + } + } + +} + +/* + - wordchrs - set up word-chr list for word-boundary stuff, if needed + * The list is kept as a bunch of arcs between two dummy states; it's disposed + * of by the unreachable-states sweep in NFA optimization. Does NEXT(). Must + * not be called from any unusual lexical context. This should be reconciled + * with the \w etc. handling in lex.c, and should be cleaned up to reduce + * dependencies on input scanning. 
+ ^ static void wordchrs(struct vars *); + */ +static void +wordchrs( + struct vars *v) +{ + struct state *left, *right; + + if (v->wordchrs != NULL) { + NEXT(); /* for consistency */ + return; + } + + left = newstate(v->nfa); + right = newstate(v->nfa); + NOERR(); + + /* + * Fine point: implemented with [::], and lexer will set REG_ULOCALE. + */ + + lexword(v); + NEXT(); + assert(v->savenow != NULL && SEE('[')); + bracket(v, left, right); + assert((v->savenow != NULL && SEE(']')) || ISERR()); + NEXT(); + NOERR(); + v->wordchrs = left; +} + +/* + - subre - allocate a subre + ^ static struct subre *subre(struct vars *, int, int, struct state *, + ^ struct state *); + */ +static struct subre * +subre( + struct vars *v, + int op, + int flags, + struct state *begin, + struct state *end) +{ + struct subre *ret = v->treefree; + + if (ret != NULL) { + v->treefree = ret->left; + } else { + ret = (struct subre *) MALLOC(sizeof(struct subre)); + if (ret == NULL) { + ERR(REG_ESPACE); + return NULL; + } + ret->chain = v->treechain; + v->treechain = ret; + } + + assert(strchr("|.b(=", op) != NULL); + + ret->op = op; + ret->flags = flags; + ret->retry = 0; + ret->subno = 0; + ret->min = ret->max = 1; + ret->left = NULL; + ret->right = NULL; + ret->begin = begin; + ret->end = end; + ZAPCNFA(ret->cnfa); + + return ret; +} + +/* + - freesubre - free a subRE subtree + ^ static void freesubre(struct vars *, struct subre *); + */ +static void +freesubre( + struct vars *v, /* might be NULL */ + struct subre *sr) +{ + if (sr == NULL) { + return; + } + + if (sr->left != NULL) { + freesubre(v, sr->left); + } + if (sr->right != NULL) { + freesubre(v, sr->right); + } + + freesrnode(v, sr); +} + +/* + - freesrnode - free one node in a subRE subtree + ^ static void freesrnode(struct vars *, struct subre *); + */ +static void +freesrnode( + struct vars *v, /* might be NULL */ + struct subre *sr) +{ + if (sr == NULL) { + return; + } + + if (!NULLCNFA(sr->cnfa)) { + freecnfa(&sr->cnfa); + } + 
sr->flags = 0; + + if (v != NULL) { + sr->left = v->treefree; + v->treefree = sr; + } else { + FREE(sr); + } +} + +/* + - optst - optimize a subRE subtree + ^ static void optst(struct vars *, struct subre *); + */ +static void +optst( + struct vars *v, + struct subre *t) +{ + /* + * DGP (2007-11-13): I assume it was the programmer's intent to eventually + * come back and add code to optimize subRE trees, but the routine coded + * just spends effort traversing the tree and doing nothing. We can do + * nothing with less effort. + */ + + return; +} + +/* + - numst - number tree nodes (assigning retry indexes) + ^ static int numst(struct subre *, int); + */ +static int /* next number */ +numst( + struct subre *t, + int start) /* starting point for subtree numbers */ +{ + int i; + + assert(t != NULL); + + i = start; + t->retry = (short) i++; + if (t->left != NULL) { + i = numst(t->left, i); + } + if (t->right != NULL) { + i = numst(t->right, i); + } + return i; +} + +/* + - markst - mark tree nodes as INUSE + ^ static void markst(struct subre *); + */ +static void +markst( + struct subre *t) +{ + assert(t != NULL); + + t->flags |= INUSE; + if (t->left != NULL) { + markst(t->left); + } + if (t->right != NULL) { + markst(t->right); + } +} + +/* + - cleanst - free any tree nodes not marked INUSE + ^ static void cleanst(struct vars *); + */ +static void +cleanst( + struct vars *v) +{ + struct subre *t; + struct subre *next; + + for (t = v->treechain; t != NULL; t = next) { + next = t->chain; + if (!(t->flags&INUSE)) { + FREE(t); + } + } + v->treechain = NULL; + v->treefree = NULL; /* just on general principles */ +} + +/* + - nfatree - turn a subRE subtree into a tree of compacted NFAs + ^ static long nfatree(struct vars *, struct subre *, FILE *); + */ +static long /* optimize results from top node */ +nfatree( + struct vars *v, + struct subre *t, + FILE *f) /* for debug output */ +{ + assert(t != NULL && t->begin != NULL); + + if (t->left != NULL) { + (DISCARD) nfatree(v, 
t->left, f); + } + if (t->right != NULL) { + (DISCARD) nfatree(v, t->right, f); + } + + return nfanode(v, t, f); +} + +/* + - nfanode - do one NFA for nfatree + ^ static long nfanode(struct vars *, struct subre *, FILE *); + */ +static long /* optimize results */ +nfanode( + struct vars *v, + struct subre *t, + FILE *f) /* for debug output */ +{ + struct nfa *nfa; + long ret = 0; + char idbuf[50]; + + assert(t->begin != NULL); + + if (f != NULL) { + fprintf(f, "\n\n\n========= TREE NODE %s ==========\n", + stid(t, idbuf, sizeof(idbuf))); + } + nfa = newnfa(v, v->cm, v->nfa); + NOERRZ(); + dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final); + if (!ISERR()) { + specialcolors(nfa); + ret = optimize(nfa, f); + } + if (!ISERR()) { + compact(nfa, &t->cnfa); + } + + freenfa(nfa); + return ret; +} + +/* + - newlacon - allocate a lookahead-constraint subRE + * Returns the index of the new entry in v->lacons (always >= 1; slot 0 is + * skipped), or 0 after reporting REG_ESPACE if the vector cannot be grown. + * On failure the existing vector and v->nlacons are left as they were. + ^ static int newlacon(struct vars *, struct state *, struct state *, int); + */ +static int /* lacon number */ +newlacon( + struct vars *v, + struct state *begin, + struct state *end, + int pos) +{ + struct subre *sub; + struct subre *newlacons; + int n; + + /* + * Grow into a temporary: assigning REALLOC's result straight to + * v->lacons would leak the old vector on failure and leave v->nlacons + * already bumped. + */ + + if (v->nlacons == 0) { + n = 1; /* skip 0th */ + newlacons = (struct subre *) MALLOC(2 * sizeof(struct subre)); + } else { + n = v->nlacons; + newlacons = (struct subre *) REALLOC(v->lacons, + (n+1)*sizeof(struct subre)); + } + + if (newlacons == NULL) { + ERR(REG_ESPACE); + return 0; + } + v->lacons = newlacons; + v->nlacons = n + 1; + + sub = &v->lacons[n]; + sub->begin = begin; + sub->end = end; + sub->subno = pos; + ZAPCNFA(sub->cnfa); + return n; +} + +/* + - freelacons - free lookahead-constraint subRE vector + ^ static void freelacons(struct subre *, int); + */ +static void +freelacons( + struct subre *subs, + int n) +{ + struct subre *sub; + int i; + + assert(n > 0); + for (sub=subs+1, i=n-1; i>0; sub++, i--) { /* no 0th */ + if (!NULLCNFA(sub->cnfa)) { + freecnfa(&sub->cnfa); + } + } + FREE(subs); +} + +/* + - rfree - free a whole RE (insides of regfree) + ^ static void rfree(regex_t *); + */ 
+static void +rfree( + regex_t *re) +{ + struct guts *g; + + if (re == NULL || re->re_magic != REMAGIC) { + return; + } + + re->re_magic = 0; /* invalidate RE */ + g = (struct guts *) re->re_guts; + re->re_guts = NULL; + re->re_fns = NULL; + g->magic = 0; + freecm(&g->cmap); + if (g->tree != NULL) { + freesubre(NULL, g->tree); + } + if (g->lacons != NULL) { + freelacons(g->lacons, g->nlacons); + } + if (!NULLCNFA(g->search)) { + freecnfa(&g->search); + } + FREE(g); +} + +/* + - dump - dump an RE in human-readable form + * Does nothing unless REG_DEBUG is defined. All output is written to f. + ^ static void dump(regex_t *, FILE *); + */ +static void +dump( + regex_t *re, + FILE *f) +{ +#ifdef REG_DEBUG + struct guts *g; + int i; + + if (re->re_magic != REMAGIC) { + fprintf(f, "bad magic number (0x%x not 0x%x)\n", + re->re_magic, REMAGIC); + } + if (re->re_guts == NULL) { + fprintf(f, "NULL guts!!!\n"); + return; + } + g = (struct guts *) re->re_guts; + if (g->magic != GUTSMAGIC) { + fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", + g->magic, GUTSMAGIC); + } + + fprintf(f, "\n\n\n========= DUMP ==========\n"); + fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n", + re->re_nsub, re->re_info, re->re_csize, g->ntree); + + dumpcolors(&g->cmap, f); + if (!NULLCNFA(g->search)) { + fprintf(f, "\nsearch:\n"); /* to f, like the rest of the dump */ + dumpcnfa(&g->search, f); + } + for (i = 1; i < g->nlacons; i++) { + fprintf(f, "\nla%d (%s):\n", i, + (g->lacons[i].subno) ? "positive" : "negative"); + dumpcnfa(&g->lacons[i].cnfa, f); + } + fprintf(f, "\n"); + dumpst(g->tree, f, 0); +#endif +} + +/* + - dumpst - dump a subRE tree + ^ static void dumpst(struct subre *, FILE *, int); + */ +static void +dumpst( + struct subre *t, + FILE *f, + int nfapresent) /* is the original NFA still around? 
*/ +{ + if (t == NULL) { + fprintf(f, "null tree\n"); + } else { + stdump(t, f, nfapresent); + } + fflush(f); +} + +/* + - stdump - recursive guts of dumpst + ^ static void stdump(struct subre *, FILE *, int); + */ +static void +stdump( + struct subre *t, + FILE *f, + int nfapresent) /* is the original NFA still around? */ +{ + char idbuf[50]; + + fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op); + if (t->flags&LONGER) { + fprintf(f, " longest"); + } + if (t->flags&SHORTER) { + fprintf(f, " shortest"); + } + if (t->flags&MIXED) { + fprintf(f, " hasmixed"); + } + if (t->flags&CAP) { + fprintf(f, " hascapture"); + } + if (t->flags&BACKR) { + fprintf(f, " hasbackref"); + } + if (!(t->flags&INUSE)) { + fprintf(f, " UNUSED"); + } + if (t->subno != 0) { + fprintf(f, " (#%d)", t->subno); + } + if (t->min != 1 || t->max != 1) { + fprintf(f, " {%d,", t->min); + if (t->max != INFINITY) { + fprintf(f, "%d", t->max); + } + fprintf(f, "}"); + } + if (nfapresent) { + fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no); + } + if (t->left != NULL) { + fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf))); + } + if (t->right != NULL) { + fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf))); + } + if (!NULLCNFA(t->cnfa)) { + fprintf(f, "\n"); + dumpcnfa(&t->cnfa, f); + } + fprintf(f, "\n"); + if (t->left != NULL) { + stdump(t->left, f, nfapresent); + } + if (t->right != NULL) { + stdump(t->right, f, nfapresent); + } +} + +/* + - stid - identify a subtree node for dumping + ^ static char *stid(struct subre *, char *, size_t); + */ +static const char * /* points to buf or constant string */ +stid( + struct subre *t, + char *buf, + size_t bufsize) +{ + /* + * Big enough for hex int or decimal t->retry? 
+ */ + + if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1) { + return "unable"; + } + + /* + * snprintf keeps the write within bufsize even if the estimate above is + * too small for this platform; %p requires a void * argument. + */ + + if (t->retry != 0) { + snprintf(buf, bufsize, "%d", t->retry); + } else { + snprintf(buf, bufsize, "%p", (void *) t); + } + return buf; +} + +#include "regc_lex.c" +#include "regc_color.c" +#include "regc_nfa.c" +#include "regc_cvec.c" +#include "regc_locale.c" + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ diff --git a/contrib/hsrex/regcustom.h b/contrib/hsrex/regcustom.h new file mode 100644 index 0000000..c341c23 --- /dev/null +++ b/contrib/hsrex/regcustom.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms - with or without + * modification - are permitted for any purpose, provided that redistributions + * in source form retain this entire copyright notice and indicate the origin + * and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation of + * software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Headers if any. + */ + +#ifdef REGEX_STANDALONE +# include "regalone.h" +#else +# include "tclInt.h" +#endif + +/* + * Overrides for regguts.h definitions, if any. + */ + +#define FUNCPTR(name, args) (*name)args +#ifndef REGEX_STANDALONE +#define MALLOC(n) ckalloc(n) +#define FREE(p) ckfree(VS(p)) +#define REALLOC(p,n) ckrealloc(VS(p),n) +#endif + +/* + * Do not insert extras between the "begin" and "end" lines - this chunk is + * automatically extracted to be fitted into regex.h. + */ + +/* --- begin --- */ +/* Ensure certain things don't sneak in from system headers. */ +#ifdef __REG_WIDE_T +#undef __REG_WIDE_T +#endif +#ifdef __REG_WIDE_COMPILE +#undef __REG_WIDE_COMPILE +#endif +#ifdef __REG_WIDE_EXEC +#undef __REG_WIDE_EXEC +#endif +#ifdef __REG_REGOFF_T +#undef __REG_REGOFF_T +#endif +#ifdef __REG_VOID_T +#undef __REG_VOID_T +#endif +#ifdef __REG_CONST +#undef __REG_CONST +#endif +#ifdef __REG_NOFRONT +#undef __REG_NOFRONT +#endif +#ifdef __REG_NOCHAR +#undef __REG_NOCHAR +#endif +/* Interface types */ +#define __REG_WIDE_T Tcl_UniChar +#define __REG_REGOFF_T long /* Not really right, but good enough... 
*/ +#define __REG_VOID_T void +#define __REG_CONST const +/* Names and declarations */ +#define __REG_WIDE_COMPILE TclReComp +#define __REG_WIDE_EXEC TclReExec +#define __REG_NOFRONT /* Don't want regcomp() and regexec() */ +#define __REG_NOCHAR /* Or the char versions */ +#define regfree TclReFree +#define regerror TclReError +/* --- end --- */ + +/* + * Internal character type and related. + */ + +#ifndef REGEX_STANDALONE +typedef Tcl_UniChar chr; /* The type itself. */ +#endif +typedef int pchr; /* What it promotes to. */ +typedef unsigned uchr; /* Unsigned type that will hold a chr. */ +typedef int celt; /* Type to hold chr, or NOCELT */ +#define NOCELT (-1) /* Celt value which is not valid chr */ +#define CHR(c) (UCHAR(c)) /* Turn char literal into chr literal */ +#define DIGITVAL(c) ((c)-'0') /* Turn chr digit into its value */ +#if TCL_UTF_MAX > 3 +#define CHRBITS 32 /* Bits in a chr; must not use sizeof */ +#define CHR_MIN 0x00000000 /* Smallest and largest chr; the value */ +#define CHR_MAX 0xffffffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */ +#elif defined(REGEX_STANDALONE) && ! defined(REGEX_WCHAR) +# define CHRBITS 8 +# define CHR_MIN 0x00 +# define CHR_MAX 0xff +#else +#define CHRBITS 16 /* Bits in a chr; must not use sizeof */ +#define CHR_MIN 0x0000 /* Smallest and largest chr; the value */ +#define CHR_MAX 0xffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */ +#endif + +/* + * Functions operating on chr. + */ + +#define iscalnum(x) Tcl_UniCharIsAlnum(x) +#define iscalpha(x) Tcl_UniCharIsAlpha(x) +#define iscdigit(x) Tcl_UniCharIsDigit(x) +#define iscspace(x) Tcl_UniCharIsSpace(x) + +/* + * Name the external functions. 
+ */ + +#ifdef REGEX_STANDALONE +# ifdef REGEX_WCHAR +# define compile re_wcomp +# define exec re_wexec +# define __REG_NOCHAR +# else +# define compile re_comp +# define exec re_exec +# undef __REG_NOCHAR +# endif +#else +#define compile TclReComp +#define exec TclReExec +#endif + +/* +& Enable/disable debugging code (by whether REG_DEBUG is defined or not). +*/ + +#if 0 /* No debug unless requested by makefile. */ +#define REG_DEBUG /* */ +#endif + + +#ifndef REGEX_STANDALONE +/* + * Method of allocating a local workspace. We used a thread-specific data + * space to store this because the regular expression engine is never + * reentered from the same thread; it doesn't make any callbacks. + */ +#define AllocVars(vPtr) \ + static Tcl_ThreadDataKey varsKey; \ + register struct vars *vPtr = (struct vars *) \ + Tcl_GetThreadData(&varsKey, sizeof(struct vars)) +#elif 0 +/* + * This strategy for allocating workspace is "more proper" in some sense, but + * quite a bit slower. Using TSD (as above) leads to code that is quite a bit + * faster in practice (measured!) + */ +#define AllocVars(vPtr) \ + register struct vars *vPtr = (struct vars *) MALLOC(sizeof(struct vars)) +#define FreeVars(vPtr) \ + FREE(vPtr) +#endif + +/* + * And pick up the standard header. + */ + +#include "regex.h" diff --git a/contrib/hsrex/rege_dfa.c b/contrib/hsrex/rege_dfa.c new file mode 100644 index 0000000..fbeae20 --- /dev/null +++ b/contrib/hsrex/rege_dfa.c @@ -0,0 +1,816 @@ +/* + * DFA routines + * This file is #included by regexec.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. 
+ * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + - longest - longest-preferred matching engine + ^ static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *); + */ +static chr * /* endpoint, or NULL */ +longest( + struct vars *v, /* used only for debug and exec flags */ + struct dfa *d, + chr *start, /* where the match should start */ + chr *stop, /* match must end at or before here */ + int *hitstopp) /* record whether hit v->stop, if non-NULL */ +{ + chr *cp; + chr *realstop = (stop == v->stop) ? stop : stop + 1; + color co; + struct sset *css; + struct sset *ss; + chr *post; + int i; + struct colormap *cm = d->cm; + + /* + * Initialize. + */ + + css = initialize(v, d, start); + cp = start; + if (hitstopp != NULL) { + *hitstopp = 0; + } + + /* + * Startup. 
+ */ + + FDEBUG(("+++ startup +++\n")); + if (cp == v->start) { + co = d->cnfa->bos[(v->eflags&REG_NOTBOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long)co)); + } else { + co = GETCOLOR(cm, *(cp - 1)); + FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co)); + } + css = miss(v, d, css, co, cp, start); + if (css == NULL) { + return NULL; + } + css->lastseen = cp; + + /* + * Main loop. + */ + + if (v->eflags&REG_FTRACE) { + while (cp < realstop) { + FDEBUG(("+++ at c%d +++\n", css - d->ssets)); + co = GETCOLOR(cm, *cp); + FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co)); + ss = css->outs[co]; + if (ss == NULL) { + ss = miss(v, d, css, co, cp+1, start); + if (ss == NULL) { + break; /* NOTE BREAK OUT */ + } + } + cp++; + ss->lastseen = cp; + css = ss; + } + } else { + while (cp < realstop) { + co = GETCOLOR(cm, *cp); + ss = css->outs[co]; + if (ss == NULL) { + ss = miss(v, d, css, co, cp+1, start); + if (ss == NULL) { + break; /* NOTE BREAK OUT */ + } + } + cp++; + ss->lastseen = cp; + css = ss; + } + } + + /* + * Shutdown. + */ + + FDEBUG(("+++ shutdown at c%d +++\n", css - d->ssets)); + if (cp == v->stop && stop == v->stop) { + if (hitstopp != NULL) { + *hitstopp = 1; + } + co = d->cnfa->eos[(v->eflags&REG_NOTEOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long)co)); + ss = miss(v, d, css, co, cp, start); + + /* + * Special case: match ended at eol? + */ + + if (ss != NULL && (ss->flags&POSTSTATE)) { + return cp; + } else if (ss != NULL) { + ss->lastseen = cp; /* to be tidy */ + } + } + + /* + * Find last match, if any. 
+ */ + + post = d->lastpost; + for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--) { + if ((ss->flags&POSTSTATE) && (post != ss->lastseen) && + (post == NULL || post < ss->lastseen)) { + post = ss->lastseen; + } + } + if (post != NULL) { /* found one */ + return post - 1; + } + + return NULL; +} + +/* + - shortest - shortest-preferred matching engine + ^ static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, + ^ chr **, int *); + */ +static chr * /* endpoint, or NULL */ +shortest( + struct vars *v, + struct dfa *d, + chr *start, /* where the match should start */ + chr *min, /* match must end at or after here */ + chr *max, /* match must end at or before here */ + chr **coldp, /* store coldstart pointer here, if nonNULL */ + int *hitstopp) /* record whether hit v->stop, if non-NULL */ +{ + chr *cp; + chr *realmin = (min == v->stop) ? min : min + 1; + chr *realmax = (max == v->stop) ? max : max + 1; + color co; + struct sset *css; + struct sset *ss; + struct colormap *cm = d->cm; + + /* + * Initialize. + */ + + css = initialize(v, d, start); + cp = start; + if (hitstopp != NULL) { + *hitstopp = 0; + } + + /* + * Startup. + */ + + FDEBUG(("--- startup ---\n")); + if (cp == v->start) { + co = d->cnfa->bos[(v->eflags&REG_NOTBOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long)co)); + } else { + co = GETCOLOR(cm, *(cp - 1)); + FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co)); + } + css = miss(v, d, css, co, cp, start); + if (css == NULL) { + return NULL; + } + css->lastseen = cp; + ss = css; + + /* + * Main loop. 
+ */ + + if (v->eflags&REG_FTRACE) { + while (cp < realmax) { + FDEBUG(("--- at c%d ---\n", css - d->ssets)); + co = GETCOLOR(cm, *cp); + FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co)); + ss = css->outs[co]; + if (ss == NULL) { + ss = miss(v, d, css, co, cp+1, start); + if (ss == NULL) { + break; /* NOTE BREAK OUT */ + } + } + cp++; + ss->lastseen = cp; + css = ss; + if ((ss->flags&POSTSTATE) && cp >= realmin) { + break; /* NOTE BREAK OUT */ + } + } + } else { + while (cp < realmax) { + co = GETCOLOR(cm, *cp); + ss = css->outs[co]; + if (ss == NULL) { + ss = miss(v, d, css, co, cp+1, start); + if (ss == NULL) { + break; /* NOTE BREAK OUT */ + } + } + cp++; + ss->lastseen = cp; + css = ss; + if ((ss->flags&POSTSTATE) && cp >= realmin) { + break; /* NOTE BREAK OUT */ + } + } + } + + if (ss == NULL) { + return NULL; + } + + if (coldp != NULL) { /* report last no-progress state set, if any */ + *coldp = lastcold(v, d); + } + + if ((ss->flags&POSTSTATE) && cp > min) { + assert(cp >= realmin); + cp--; + } else if (cp == v->stop && max == v->stop) { + co = d->cnfa->eos[(v->eflags&REG_NOTEOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long)co)); + ss = miss(v, d, css, co, cp, start); + + /* + * Match might have ended at eol. 
+ */ + + if ((ss == NULL || !(ss->flags&POSTSTATE)) && hitstopp != NULL) { + *hitstopp = 1; + } + } + + if (ss == NULL || !(ss->flags&POSTSTATE)) { + return NULL; + } + + return cp; +} + +/* + - lastcold - determine last point at which no progress had been made + ^ static chr *lastcold(struct vars *, struct dfa *); + */ +static chr * /* endpoint, or NULL */ +lastcold( + struct vars *v, + struct dfa *d) +{ + struct sset *ss; + chr *nopr; + int i; + + nopr = d->lastnopr; + if (nopr == NULL) { + nopr = v->start; + } + for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--) { + if ((ss->flags&NOPROGRESS) && nopr < ss->lastseen) { + nopr = ss->lastseen; + } + } + return nopr; +} + +/* + - newdfa - set up a fresh DFA + ^ static struct dfa *newdfa(struct vars *, struct cnfa *, + ^ struct colormap *, struct smalldfa *); + */ +static struct dfa * +newdfa( + struct vars *v, + struct cnfa *cnfa, + struct colormap *cm, + struct smalldfa *small) /* preallocated space, may be NULL */ +{ + struct dfa *d; + size_t nss = cnfa->nstates * 2; + int wordsper = (cnfa->nstates + UBITS - 1) / UBITS; + struct smalldfa *smallwas = small; + + assert(cnfa != NULL && cnfa->nstates != 0); + + if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS) { + assert(wordsper == 1); + if (small == NULL) { + small = (struct smalldfa *) MALLOC(sizeof(struct smalldfa)); + if (small == NULL) { + ERR(REG_ESPACE); + return NULL; + } + } + d = &small->dfa; + d->ssets = small->ssets; + d->statesarea = small->statesarea; + d->work = &d->statesarea[nss]; + d->outsarea = small->outsarea; + d->incarea = small->incarea; + d->cptsmalloced = 0; + d->mallocarea = (smallwas == NULL) ? 
(char *)small : NULL; + } else { + d = (struct dfa *)MALLOC(sizeof(struct dfa)); + if (d == NULL) { + ERR(REG_ESPACE); + return NULL; + } + d->ssets = (struct sset *)MALLOC(nss * sizeof(struct sset)); + d->statesarea = (unsigned *) + MALLOC((nss+WORK) * wordsper * sizeof(unsigned)); + d->work = &d->statesarea[nss * wordsper]; + d->outsarea = (struct sset **) + MALLOC(nss * cnfa->ncolors * sizeof(struct sset *)); + d->incarea = (struct arcp *) + MALLOC(nss * cnfa->ncolors * sizeof(struct arcp)); + d->cptsmalloced = 1; + d->mallocarea = (char *)d; + if (d->ssets == NULL || d->statesarea == NULL || + d->outsarea == NULL || d->incarea == NULL) { + freedfa(d); + ERR(REG_ESPACE); + return NULL; + } + } + + d->nssets = (v->eflags&REG_SMALL) ? 7 : nss; + d->nssused = 0; + d->nstates = cnfa->nstates; + d->ncolors = cnfa->ncolors; + d->wordsper = wordsper; + d->cnfa = cnfa; + d->cm = cm; + d->lastpost = NULL; + d->lastnopr = NULL; + d->search = d->ssets; + + /* + * Initialization of sset fields is done as needed. + */ + + return d; +} + +/* + - freedfa - free a DFA + ^ static void freedfa(struct dfa *); + */ +static void +freedfa( + struct dfa *d) +{ + if (d->cptsmalloced) { + if (d->ssets != NULL) { + FREE(d->ssets); + } + if (d->statesarea != NULL) { + FREE(d->statesarea); + } + if (d->outsarea != NULL) { + FREE(d->outsarea); + } + if (d->incarea != NULL) { + FREE(d->incarea); + } + } + + if (d->mallocarea != NULL) { + FREE(d->mallocarea); + } +} + +/* + - hash - construct a hash code for a bitvector + * There are probably better ways, but they're more expensive. 
+ ^ static unsigned hash(unsigned *, int); + */ +static unsigned +hash( + unsigned *uv, + int n) +{ + int i; + unsigned h; + + h = 0; + for (i = 0; i < n; i++) { + h ^= uv[i]; + } + return h; +} + +/* + - initialize - hand-craft a cache entry for startup, otherwise get ready + ^ static struct sset *initialize(struct vars *, struct dfa *, chr *); + */ +static struct sset * +initialize( + struct vars *v, /* used only for debug flags */ + struct dfa *d, + chr *start) +{ + struct sset *ss; + int i; + + /* + * Is previous one still there? + */ + + if (d->nssused > 0 && (d->ssets[0].flags&STARTER)) { + ss = &d->ssets[0]; + } else { /* no, must (re)build it */ + ss = getvacant(v, d, start, start); + for (i = 0; i < d->wordsper; i++) { + ss->states[i] = 0; + } + BSET(ss->states, d->cnfa->pre); + ss->hash = HASH(ss->states, d->wordsper); + assert(d->cnfa->pre != d->cnfa->post); + ss->flags = STARTER|LOCKED|NOPROGRESS; + + /* + * lastseen dealt with below + */ + } + + for (i = 0; i < d->nssused; i++) { + d->ssets[i].lastseen = NULL; + } + ss->lastseen = start; /* maybe untrue, but harmless */ + d->lastpost = NULL; + d->lastnopr = NULL; + return ss; +} + +/* + - miss - handle a cache miss + ^ static struct sset *miss(struct vars *, struct dfa *, struct sset *, + ^ pcolor, chr *, chr *); + */ +static struct sset * /* NULL if goes to empty set */ +miss( + struct vars *v, /* used only for debug flags */ + struct dfa *d, + struct sset *css, + pcolor co, + chr *cp, /* next chr */ + chr *start) /* where the attempt got started */ +{ + struct cnfa *cnfa = d->cnfa; + int i; + unsigned h; + struct carc *ca; + struct sset *p; + int ispost; + int noprogress; + int gotstate; + int dolacons; + int sawlacons; + + /* + * For convenience, we can be called even if it might not be a miss. + */ + + if (css->outs[co] != NULL) { + FDEBUG(("hit\n")); + return css->outs[co]; + } + FDEBUG(("miss\n")); + + /* + * First, what set of states would we end up in? 
+ */ + + for (i = 0; i < d->wordsper; i++) { + d->work[i] = 0; + } + ispost = 0; + noprogress = 1; + gotstate = 0; + for (i = 0; i < d->nstates; i++) { + if (ISBSET(css->states, i)) { + for (ca = cnfa->states[i]+1; ca->co != COLORLESS; ca++) { + if (ca->co == co) { + BSET(d->work, ca->to); + gotstate = 1; + if (ca->to == cnfa->post) { + ispost = 1; + } + if (!cnfa->states[ca->to]->co) { + noprogress = 0; + } + FDEBUG(("%d -> %d\n", i, ca->to)); + } + } + } + } + dolacons = (gotstate) ? (cnfa->flags&HASLACONS) : 0; + sawlacons = 0; + while (dolacons) { /* transitive closure */ + dolacons = 0; + for (i = 0; i < d->nstates; i++) { + if (ISBSET(d->work, i)) { + for (ca = cnfa->states[i]+1; ca->co != COLORLESS; ca++) { + if (ca->co <= cnfa->ncolors) { + continue; /* NOTE CONTINUE */ + } + sawlacons = 1; + if (ISBSET(d->work, ca->to)) { + continue; /* NOTE CONTINUE */ + } + if (!lacon(v, cnfa, cp, ca->co)) { + continue; /* NOTE CONTINUE */ + } + BSET(d->work, ca->to); + dolacons = 1; + if (ca->to == cnfa->post) { + ispost = 1; + } + if (!cnfa->states[ca->to]->co) { + noprogress = 0; + } + FDEBUG(("%d :> %d\n", i, ca->to)); + } + } + } + } + if (!gotstate) { + return NULL; + } + h = HASH(d->work, d->wordsper); + + /* + * Next, is that in the cache? + */ + + for (p = d->ssets, i = d->nssused; i > 0; p++, i--) { + if (HIT(h, d->work, p, d->wordsper)) { + FDEBUG(("cached c%d\n", p - d->ssets)); + break; /* NOTE BREAK OUT */ + } + } + if (i == 0) { /* nope, need a new cache entry */ + p = getvacant(v, d, cp, start); + assert(p != css); + for (i = 0; i < d->wordsper; i++) { + p->states[i] = d->work[i]; + } + p->hash = h; + p->flags = (ispost) ? POSTSTATE : 0; + if (noprogress) { + p->flags |= NOPROGRESS; + } + + /* + * lastseen to be dealt with by caller + */ + } + + if (!sawlacons) { /* lookahead conds. 
always cache miss */ + FDEBUG(("c%d[%d]->c%d\n", css - d->ssets, co, p - d->ssets)); + css->outs[co] = p; + css->inchain[co] = p->ins; + p->ins.ss = css; + p->ins.co = (color)co; + } + return p; +} + +/* + - lacon - lookahead-constraint checker for miss() + ^ static int lacon(struct vars *, struct cnfa *, chr *, pcolor); + */ +static int /* predicate: constraint satisfied? */ +lacon( + struct vars *v, + struct cnfa *pcnfa, /* parent cnfa */ + chr *cp, + pcolor co) /* "color" of the lookahead constraint */ +{ + int n; + struct subre *sub; + struct dfa *d; + struct smalldfa sd; + chr *end; + + n = co - pcnfa->ncolors; + assert(n < v->g->nlacons && v->g->lacons != NULL); + FDEBUG(("=== testing lacon %d\n", n)); + sub = &v->g->lacons[n]; + d = newdfa(v, &sub->cnfa, &v->g->cmap, &sd); + if (d == NULL) { + ERR(REG_ESPACE); + return 0; + } + end = longest(v, d, cp, v->stop, (int *)NULL); + freedfa(d); + FDEBUG(("=== lacon %d match %d\n", n, (end != NULL))); + return (sub->subno) ? (end != NULL) : (end == NULL); +} + +/* + - getvacant - get a vacant state set + * This routine clears out the inarcs and outarcs, but does not otherwise + * clear the innards of the state set -- that's up to the caller. + ^ static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *); + */ +static struct sset * +getvacant( + struct vars *v, /* used only for debug flags */ + struct dfa *d, + chr *cp, + chr *start) +{ + int i; + struct sset *ss; + struct sset *p; + struct arcp ap; + struct arcp lastap = {NULL, 0}; /* silence gcc 4 warning */ + color co; + + ss = pickss(v, d, cp, start); + assert(!(ss->flags&LOCKED)); + + /* + * Clear out its inarcs, including self-referential ones. 
+ */ + + ap = ss->ins; + while ((p = ap.ss) != NULL) { + co = ap.co; + FDEBUG(("zapping c%d's %ld outarc\n", p - d->ssets, (long)co)); + p->outs[co] = NULL; + ap = p->inchain[co]; + p->inchain[co].ss = NULL; /* paranoia */ + } + ss->ins.ss = NULL; + + /* + * Take it off the inarc chains of the ssets reached by its outarcs. + */ + + for (i = 0; i < d->ncolors; i++) { + p = ss->outs[i]; + assert(p != ss); /* not self-referential */ + if (p == NULL) { + continue; /* NOTE CONTINUE */ + } + FDEBUG(("del outarc %d from c%d's in chn\n", i, p - d->ssets)); + if (p->ins.ss == ss && p->ins.co == i) { + p->ins = ss->inchain[i]; + } else { + assert(p->ins.ss != NULL); + for (ap = p->ins; ap.ss != NULL && + !(ap.ss == ss && ap.co == i); + ap = ap.ss->inchain[ap.co]) { + lastap = ap; + } + assert(ap.ss != NULL); + lastap.ss->inchain[lastap.co] = ss->inchain[i]; + } + ss->outs[i] = NULL; + ss->inchain[i].ss = NULL; + } + + /* + * If ss was a success state, may need to remember location. + */ + + if ((ss->flags&POSTSTATE) && ss->lastseen != d->lastpost && + (d->lastpost == NULL || d->lastpost < ss->lastseen)) { + d->lastpost = ss->lastseen; + } + + /* + * Likewise for a no-progress state. + */ + + if ((ss->flags&NOPROGRESS) && ss->lastseen != d->lastnopr && + (d->lastnopr == NULL || d->lastnopr < ss->lastseen)) { + d->lastnopr = ss->lastseen; + } + + return ss; +} + +/* + - pickss - pick the next stateset to be used + ^ static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *); + */ +static struct sset * +pickss( + struct vars *v, /* used only for debug flags */ + struct dfa *d, + chr *cp, + chr *start) +{ + int i; + struct sset *ss; + struct sset *end; + chr *ancient; + + /* + * Shortcut for cases where cache isn't full. + */ + + if (d->nssused < d->nssets) { + i = d->nssused; + d->nssused++; + ss = &d->ssets[i]; + FDEBUG(("new c%d\n", i)); + + /* + * Set up innards. 
+ */ + + ss->states = &d->statesarea[i * d->wordsper]; + ss->flags = 0; + ss->ins.ss = NULL; + ss->ins.co = WHITE; /* give it some value */ + ss->outs = &d->outsarea[i * d->ncolors]; + ss->inchain = &d->incarea[i * d->ncolors]; + for (i = 0; i < d->ncolors; i++) { + ss->outs[i] = NULL; + ss->inchain[i].ss = NULL; + } + return ss; + } + + /* + * Look for oldest, or old enough anyway. + */ + + if (cp - start > d->nssets*2/3) { /* oldest 33% are expendable */ + ancient = cp - d->nssets*2/3; + } else { + ancient = start; + } + for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++) { + if ((ss->lastseen == NULL || ss->lastseen < ancient) + && !(ss->flags&LOCKED)) { + d->search = ss + 1; + FDEBUG(("replacing c%d\n", ss - d->ssets)); + return ss; + } + } + for (ss = d->ssets, end = d->search; ss < end; ss++) { + if ((ss->lastseen == NULL || ss->lastseen < ancient) + && !(ss->flags&LOCKED)) { + d->search = ss + 1; + FDEBUG(("replacing c%d\n", ss - d->ssets)); + return ss; + } + } + + /* + * Nobody's old enough?!? -- something's really wrong. + */ + + FDEBUG(("can't find victim to replace!\n")); + assert(NOTREACHED); + ERR(REG_ASSERT); + return d->ssets; +} + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ diff --git a/contrib/hsrex/regerror.c b/contrib/hsrex/regerror.c new file mode 100644 index 0000000..49b6f3e --- /dev/null +++ b/contrib/hsrex/regerror.c @@ -0,0 +1,129 @@ +/* + * regerror - error-code expansion + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. 
+ * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation of + * software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "regguts.h" + +/* + * Unknown-error explanation. + */ + +static char unk[] = "*** unknown regex error code 0x%x ***"; + +/* + * Struct to map among codes, code names, and explanations. 
+ */ + +static struct rerr { + int code; + const char *name; + const char *explain; +} rerrs[] = { + /* The actual table is built from regex.h */ +#include "regerrs.h" + { -1, "", "oops" }, /* explanation special-cased in code */ +}; + +/* + - regerror - the interface to error numbers + */ +/* ARGSUSED */ +size_t /* Actual space needed (including NUL) */ +regerror( + int code, /* Error code, or REG_ATOI or REG_ITOA */ + const regex_t *preg, /* Associated regex_t (unused at present) */ + char *errbuf, /* Result buffer (unless errbuf_size==0) */ + size_t errbuf_size) /* Available space in errbuf, can be 0 */ +{ + struct rerr *r; + const char *msg; + char convbuf[sizeof(unk)+50]; /* 50 = plenty for int */ + size_t len; + int icode; + + switch (code) { + case REG_ATOI: /* Convert name to number */ + for (r = rerrs; r->code >= 0; r++) { + if (strcmp(r->name, errbuf) == 0) { + break; + } + } + sprintf(convbuf, "%d", r->code); /* -1 for unknown */ + msg = convbuf; + break; + case REG_ITOA: /* Convert number to name */ + icode = atoi(errbuf); /* Not our problem if this fails */ + for (r = rerrs; r->code >= 0; r++) { + if (r->code == icode) { + break; + } + } + if (r->code >= 0) { + msg = r->name; + } else { /* Unknown; tell him the number */ + sprintf(convbuf, "REG_%u", (unsigned)icode); + msg = convbuf; + } + break; + default: /* A real, normal error code */ + for (r = rerrs; r->code >= 0; r++) { + if (r->code == code) { + break; + } + } + if (r->code >= 0) { + msg = r->explain; + } else { /* Unknown; say so */ + sprintf(convbuf, unk, code); + msg = convbuf; + } + break; + } + + len = strlen(msg) + 1; /* Space needed, including NUL */ + if (errbuf_size > 0) { + if (errbuf_size > len) { + strcpy(errbuf, msg); + } else { /* Truncate to fit */ + strncpy(errbuf, msg, errbuf_size-1); + errbuf[errbuf_size-1] = '\0'; + } + } + + return len; +} + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ diff --git a/contrib/hsrex/regerrs.h 
b/contrib/hsrex/regerrs.h new file mode 100644 index 0000000..259c0cb --- /dev/null +++ b/contrib/hsrex/regerrs.h @@ -0,0 +1,19 @@ +{ REG_OKAY, "REG_OKAY", "no errors detected" }, +{ REG_NOMATCH, "REG_NOMATCH", "failed to match" }, +{ REG_BADPAT, "REG_BADPAT", "invalid regexp (reg version 0.8)" }, +{ REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element" }, +{ REG_ECTYPE, "REG_ECTYPE", "invalid character class" }, +{ REG_EESCAPE, "REG_EESCAPE", "invalid escape \\ sequence" }, +{ REG_ESUBREG, "REG_ESUBREG", "invalid backreference number" }, +{ REG_EBRACK, "REG_EBRACK", "brackets [] not balanced" }, +{ REG_EPAREN, "REG_EPAREN", "parentheses () not balanced" }, +{ REG_EBRACE, "REG_EBRACE", "braces {} not balanced" }, +{ REG_BADBR, "REG_BADBR", "invalid repetition count(s)" }, +{ REG_ERANGE, "REG_ERANGE", "invalid character range" }, +{ REG_ESPACE, "REG_ESPACE", "out of memory" }, +{ REG_BADRPT, "REG_BADRPT", "quantifier operand invalid" }, +{ REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug" }, +{ REG_INVARG, "REG_INVARG", "invalid argument to regex function" }, +{ REG_MIXED, "REG_MIXED", "character widths of regex and string differ" }, +{ REG_BADOPT, "REG_BADOPT", "invalid embedded option" }, +{ REG_ETOOBIG, "REG_ETOOBIG", "nfa has too many states" }, diff --git a/contrib/hsrex/regex.h b/contrib/hsrex/regex.h new file mode 100644 index 0000000..2ef538a --- /dev/null +++ b/contrib/hsrex/regex.h @@ -0,0 +1,336 @@ +#ifndef _REGEX_H_ +#define _REGEX_H_ /* never again */ +/* + * regular expressions + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. 
+ * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation of + * software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * + * Prototypes etc. marked with "^" within comments get gathered up (and + * possibly edited) by the regfwd program and inserted near the bottom of this + * file. + * + * We offer the option of declaring one wide-character version of the RE + * functions as well as the char versions. To do that, define __REG_WIDE_T to + * the type of wide characters (unfortunately, there is no consensus that + * wchar_t is suitable) and __REG_WIDE_COMPILE and __REG_WIDE_EXEC to the + * names to be used for the compile and execute functions (suggestion: + * re_Xcomp and re_Xexec, where X is a letter suggestive of the wide type, + * e.g. re_ucomp and re_uexec for Unicode). 
For cranky old compilers, it may + * be necessary to do something like: + * #define __REG_WIDE_COMPILE(a,b,c,d) re_Xcomp(a,b,c,d) + * #define __REG_WIDE_EXEC(a,b,c,d,e,f,g) re_Xexec(a,b,c,d,e,f,g) + * rather than just #defining the names as parameterless macros. + * + * For some specialized purposes, it may be desirable to suppress the + * declarations of the "front end" functions, regcomp() and regexec(), or of + * the char versions of the compile and execute functions. To suppress the + * front-end functions, define __REG_NOFRONT. To suppress the char versions, + * define __REG_NOCHAR. + * + * The right place to do those defines (and some others you may want, see + * below) would be <sys/types.h>. If you don't have control of that file, the + * right place to add your own defines to this file is marked below. This is + * normally done automatically, by the makefile and regmkhdr, based on the + * contents of regcustom.h. + */ + +/* + * voodoo for C++ + */ +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Add your own defines, if needed, here. + */ + +/* + * Location where a chunk of regcustom.h is automatically spliced into this + * file (working from its prototype, regproto.h). + */ + +/* --- begin --- */ +/* ensure certain things don't sneak in from system headers */ +#ifdef __REG_WIDE_T +#undef __REG_WIDE_T +#endif +#ifdef __REG_WIDE_COMPILE +#undef __REG_WIDE_COMPILE +#endif +#ifdef __REG_WIDE_EXEC +#undef __REG_WIDE_EXEC +#endif +#ifdef __REG_REGOFF_T +#undef __REG_REGOFF_T +#endif +#ifdef __REG_VOID_T +#undef __REG_VOID_T +#endif +#ifdef __REG_CONST +#undef __REG_CONST +#endif +#ifdef __REG_NOFRONT +#undef __REG_NOFRONT +#endif +#ifdef __REG_NOCHAR +#undef __REG_NOCHAR +#endif +/* interface types */ +#define __REG_WIDE_T Tcl_UniChar +#define __REG_REGOFF_T long /* not really right, but good enough... 
*/ +#define __REG_VOID_T void +#define __REG_CONST const +/* names and declarations */ +#define __REG_WIDE_COMPILE TclReComp +#define __REG_WIDE_EXEC TclReExec +#define __REG_NOFRONT /* don't want regcomp() and regexec() */ +#define __REG_NOCHAR /* or the char versions */ +#define regfree TclReFree +#define regerror TclReError +/* --- end --- */ +#ifdef REGEX_STANDALONE +# undef regfree +# undef regerror +# define regfree re_free +# define regerror re_error +# undef __REG_WIDE_T +# define __REG_WIDE_T wchar_t +# undef __REG_WIDE_COMPILE +# define __REG_WIDE_COMPILE re_wcomp +# undef __REG_WIDE_EXEC +# define __REG_WIDE_EXEC re_wexec +# ifndef REGEX_WCHAR +# undef __REG_NOCHAR +# endif +#endif + +/* + * interface types etc. + */ + +/* + * regoff_t has to be large enough to hold either off_t or ssize_t, and must + * be signed; it's only a guess that long is suitable, so we offer + * <sys/types.h> an override. + */ +#ifdef __REG_REGOFF_T +typedef __REG_REGOFF_T regoff_t; +#else +typedef long regoff_t; +#endif + +/* + * For benefit of old compilers, we offer <sys/types.h> the option of + * overriding the `void' type used to declare nonexistent return types. + */ +#ifdef __REG_VOID_T +typedef __REG_VOID_T re_void; +#else +typedef void re_void; +#endif + +/* + * Also for benefit of old compilers, <sys/types.h> can supply a macro which + * expands to a substitute for `const'. 
+ */ +#ifndef __REG_CONST +#define __REG_CONST const +#endif + + + +/* + * other interface types + */ + +/* the biggie, a compiled RE (or rather, a front end to same) */ +typedef struct { + int re_magic; /* magic number */ + size_t re_nsub; /* number of subexpressions */ + long re_info; /* information about RE */ +#define REG_UBACKREF 000001 +#define REG_ULOOKAHEAD 000002 +#define REG_UBOUNDS 000004 +#define REG_UBRACES 000010 +#define REG_UBSALNUM 000020 +#define REG_UPBOTCH 000040 +#define REG_UBBS 000100 +#define REG_UNONPOSIX 000200 +#define REG_UUNSPEC 000400 +#define REG_UUNPORT 001000 +#define REG_ULOCALE 002000 +#define REG_UEMPTYMATCH 004000 +#define REG_UIMPOSSIBLE 010000 +#define REG_USHORTEST 020000 + int re_csize; /* sizeof(character) */ + char *re_endp; /* backward compatibility kludge */ + /* the rest is opaque pointers to hidden innards */ + char *re_guts; /* `char *' is more portable than `void *' */ + char *re_fns; +} regex_t; + +/* result reporting (may acquire more fields later) */ +typedef struct { + regoff_t rm_so; /* start of substring */ + regoff_t rm_eo; /* end of substring */ +} regmatch_t; + +/* supplementary control and reporting */ +typedef struct { + regmatch_t rm_extend; /* see REG_EXPECT */ +} rm_detail_t; + +/* + * compilation + ^ #ifndef __REG_NOCHAR + ^ int re_comp(regex_t *, __REG_CONST char *, size_t, int); + ^ #endif + ^ #ifndef __REG_NOFRONT + ^ int regcomp(regex_t *, __REG_CONST char *, int); + ^ #endif + ^ #ifdef __REG_WIDE_T + ^ int __REG_WIDE_COMPILE(regex_t *, __REG_CONST __REG_WIDE_T *, size_t, int); + ^ #endif + */ +#define REG_BASIC 000000 /* BREs (convenience) */ +#define REG_EXTENDED 000001 /* EREs */ +#define REG_ADVF 000002 /* advanced features in EREs */ +#define REG_ADVANCED 000003 /* AREs (which are also EREs) */ +#define REG_QUOTE 000004 /* no special characters, none */ +#define REG_NOSPEC REG_QUOTE /* historical synonym */ +#define REG_ICASE 000010 /* ignore case */ +#define REG_NOSUB 000020 /* don't care 
about subexpressions */ +#define REG_EXPANDED 000040 /* expanded format, white space & comments */ +#define REG_NLSTOP 000100 /* \n doesn't match . or [^ ] */ +#define REG_NLANCH 000200 /* ^ matches after \n, $ before */ +#define REG_NEWLINE 000300 /* newlines are line terminators */ +#define REG_PEND 000400 /* ugh -- backward-compatibility hack */ +#define REG_EXPECT 001000 /* report details on partial/limited matches */ +#define REG_BOSONLY 002000 /* temporary kludge for BOS-only matches */ +#define REG_DUMP 004000 /* none of your business :-) */ +#define REG_FAKE 010000 /* none of your business :-) */ +#define REG_PROGRESS 020000 /* none of your business :-) */ + +/* + * execution + ^ #ifndef __REG_NOCHAR + ^ int re_exec(regex_t *, __REG_CONST char *, size_t, + ^ rm_detail_t *, size_t, regmatch_t [], int); + ^ #endif + ^ #ifndef __REG_NOFRONT + ^ int regexec(regex_t *, __REG_CONST char *, size_t, regmatch_t [], int); + ^ #endif + ^ #ifdef __REG_WIDE_T + ^ int __REG_WIDE_EXEC(regex_t *, __REG_CONST __REG_WIDE_T *, size_t, + ^ rm_detail_t *, size_t, regmatch_t [], int); + ^ #endif + */ +#define REG_NOTBOL 0001 /* BOS is not BOL */ +#define REG_NOTEOL 0002 /* EOS is not EOL */ +#define REG_STARTEND 0004 /* backward compatibility kludge */ +#define REG_FTRACE 0010 /* none of your business */ +#define REG_MTRACE 0020 /* none of your business */ +#define REG_SMALL 0040 /* none of your business */ + +/* + * misc generics (may be more functions here eventually) + ^ re_void regfree(regex_t *); + */ + +/* + * error reporting + * Be careful if modifying the list of error codes -- the table used by + * regerror() is generated automatically from this file! + * + * Note that there is no wide-char variant of regerror at this time; what kind + * of character is used for error reports is independent of what kind is used + * in matching. 
+ * + ^ extern size_t regerror(int, __REG_CONST regex_t *, char *, size_t); + */ +#define REG_OKAY 0 /* no errors detected */ +#define REG_NOMATCH 1 /* failed to match */ +#define REG_BADPAT 2 /* invalid regexp */ +#define REG_ECOLLATE 3 /* invalid collating element */ +#define REG_ECTYPE 4 /* invalid character class */ +#define REG_EESCAPE 5 /* invalid escape \ sequence */ +#define REG_ESUBREG 6 /* invalid backreference number */ +#define REG_EBRACK 7 /* brackets [] not balanced */ +#define REG_EPAREN 8 /* parentheses () not balanced */ +#define REG_EBRACE 9 /* braces {} not balanced */ +#define REG_BADBR 10 /* invalid repetition count(s) */ +#define REG_ERANGE 11 /* invalid character range */ +#define REG_ESPACE 12 /* out of memory */ +#define REG_BADRPT 13 /* quantifier operand invalid */ +#define REG_ASSERT 15 /* "can't happen" -- you found a bug */ +#define REG_INVARG 16 /* invalid argument to regex function */ +#define REG_MIXED 17 /* character widths of regex and string differ */ +#define REG_BADOPT 18 /* invalid embedded option */ +#define REG_ETOOBIG 19 /* nfa has too many states */ +/* two specials for debugging and testing */ +#define REG_ATOI 101 /* convert error-code name to number */ +#define REG_ITOA 102 /* convert error-code number to name */ + +/* + * the prototypes, as possibly munched by regfwd + */ +/* =====^!^===== begin forwards =====^!^===== */ +/* automatically gathered by fwd; do not hand-edit */ +/* === regproto.h === */ +#ifndef __REG_NOCHAR +int re_comp(regex_t *, __REG_CONST unsigned char *, size_t, int); +#endif +#ifndef __REG_NOFRONT +int regcomp(regex_t *, __REG_CONST char *, int); +#endif +#ifdef __REG_WIDE_T +MODULE_SCOPE int __REG_WIDE_COMPILE(regex_t *, __REG_CONST __REG_WIDE_T *, size_t, int); +#endif +#ifndef __REG_NOCHAR +int re_exec(regex_t *, __REG_CONST unsigned char *, size_t, rm_detail_t *, size_t, regmatch_t [], int); +#endif +#ifndef __REG_NOFRONT +int regexec(regex_t *, __REG_CONST char *, size_t, regmatch_t [], int); 
+#endif +#ifdef __REG_WIDE_T +MODULE_SCOPE int __REG_WIDE_EXEC(regex_t *, __REG_CONST __REG_WIDE_T *, size_t, rm_detail_t *, size_t, regmatch_t [], int); +#endif +MODULE_SCOPE re_void regfree(regex_t *); +MODULE_SCOPE size_t regerror(int, __REG_CONST regex_t *, char *, size_t); +/* automatically gathered by fwd; do not hand-edit */ +/* =====^!^===== end forwards =====^!^===== */ + +/* + * more C++ voodoo + */ +#ifdef __cplusplus +} +#endif + +#endif diff --git a/contrib/hsrex/regexec.c b/contrib/hsrex/regexec.c new file mode 100644 index 0000000..24edb41 --- /dev/null +++ b/contrib/hsrex/regexec.c @@ -0,0 +1,1215 @@ +/* + * re_*exec and friends - match REs + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation of + * software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "regguts.h" + +/* + * Lazy-DFA representation. + */ + +struct arcp { /* "pointer" to an outarc */ + struct sset *ss; + color co; +}; + +struct sset { /* state set */ + unsigned *states; /* pointer to bitvector */ + unsigned hash; /* hash of bitvector */ +#define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw)) +#define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \ + memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0)) + int flags; +#define STARTER 01 /* the initial state set */ +#define POSTSTATE 02 /* includes the goal state */ +#define LOCKED 04 /* locked in cache */ +#define NOPROGRESS 010 /* zero-progress state set */ + struct arcp ins; /* chain of inarcs pointing here */ + chr *lastseen; /* last entered on arrival here */ + struct sset **outs; /* outarc vector indexed by color */ + struct arcp *inchain; /* chain-pointer vector for outarcs */ +}; + +struct dfa { + int nssets; /* size of cache */ + int nssused; /* how many entries occupied yet */ + int nstates; /* number of states */ + int ncolors; /* length of outarc and inchain vectors */ + int wordsper; /* length of state-set bitvectors */ + struct sset *ssets; /* state-set cache */ + unsigned *statesarea; /* bitvector storage */ + unsigned *work; /* pointer to work area within statesarea */ + struct sset **outsarea; /* outarc-vector storage */ + struct arcp *incarea; /* inchain storage */ + struct cnfa *cnfa; + struct colormap *cm; + chr *lastpost; /* location of 
last cache-flushed success */ + chr *lastnopr; /* location of last cache-flushed NOPROGRESS */ + struct sset *search; /* replacement-search-pointer memory */ + int cptsmalloced; /* were the areas individually malloced? */ + char *mallocarea; /* self, or master malloced area, or NULL */ +}; + +#define WORK 1 /* number of work bitvectors needed */ + +/* + * Setup for non-malloc allocation for small cases. + */ + +#define FEWSTATES 20 /* must be less than UBITS */ +#define FEWCOLORS 15 +struct smalldfa { + struct dfa dfa; + struct sset ssets[FEWSTATES*2]; + unsigned statesarea[FEWSTATES*2 + WORK]; + struct sset *outsarea[FEWSTATES*2 * FEWCOLORS]; + struct arcp incarea[FEWSTATES*2 * FEWCOLORS]; +}; +#define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */ + +/* + * Internal variables, bundled for easy passing around. + */ + +struct vars { + regex_t *re; + struct guts *g; + int eflags; /* copies of arguments */ + size_t nmatch; + regmatch_t *pmatch; + rm_detail_t *details; + chr *start; /* start of string */ + chr *stop; /* just past end of string */ + int err; /* error code if any (0 none) */ + regoff_t *mem; /* memory vector for backtracking */ + struct smalldfa dfa1; + struct smalldfa dfa2; +}; +#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */ +#define ISERR() VISERR(v) +#define VERR(vv,e) (((vv)->err) ? 
(vv)->err : ((vv)->err = (e))) +#define ERR(e) VERR(v, e) /* record an error */ +#define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */ +#define OFF(p) ((p) - v->start) +#define LOFF(p) ((long)OFF(p)) + +/* + * forward declarations + */ +/* =====^!^===== begin forwards =====^!^===== */ +/* automatically gathered by fwd; do not hand-edit */ +/* === regexec.c === */ +int exec(regex_t *, const chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int); +static int find(struct vars *, struct cnfa *, struct colormap *); +static int cfind(struct vars *, struct cnfa *, struct colormap *); +static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **); +static void zapsubs(regmatch_t *, size_t); +static void zapmem(struct vars *, struct subre *); +static void subset(struct vars *, struct subre *, chr *, chr *); +static int dissect(struct vars *, struct subre *, chr *, chr *); +static int condissect(struct vars *, struct subre *, chr *, chr *); +static int altdissect(struct vars *, struct subre *, chr *, chr *); +static int cdissect(struct vars *, struct subre *, chr *, chr *); +static int ccondissect(struct vars *, struct subre *, chr *, chr *); +static int crevdissect(struct vars *, struct subre *, chr *, chr *); +static int cbrdissect(struct vars *, struct subre *, chr *, chr *); +static int caltdissect(struct vars *, struct subre *, chr *, chr *); +/* === rege_dfa.c === */ +static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *); +static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *); +static chr *lastcold(struct vars *, struct dfa *); +static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *); +static void freedfa(struct dfa *); +static unsigned hash(unsigned *, int); +static struct sset *initialize(struct vars *, struct dfa *, chr *); +static struct sset *miss(struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr 
*); +static int lacon(struct vars *, struct cnfa *, chr *, pcolor); +static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *); +static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *); +/* automatically gathered by fwd; do not hand-edit */ +/* =====^!^===== end forwards =====^!^===== */ + +/* + - exec - match regular expression + ^ int exec(regex_t *, const chr *, size_t, rm_detail_t *, + ^ size_t, regmatch_t [], int); + */ +int +exec( + regex_t *re, + const chr *string, + size_t len, + rm_detail_t *details, + size_t nmatch, + regmatch_t pmatch[], + int flags) +{ + AllocVars(v); + int st; + size_t n; + int backref; +#define LOCALMAT 20 + regmatch_t mat[LOCALMAT]; +#define LOCALMEM 40 + regoff_t mem[LOCALMEM]; + + /* + * Sanity checks. + */ + + if (re == NULL || string == NULL || re->re_magic != REMAGIC) { + FreeVars(v); + return REG_INVARG; + } + if (re->re_csize != sizeof(chr)) { + FreeVars(v); + return REG_MIXED; + } + + /* + * Setup. + */ + + v->re = re; + v->g = (struct guts *)re->re_guts; + if ((v->g->cflags&REG_EXPECT) && details == NULL) { + FreeVars(v); + return REG_INVARG; + } + if (v->g->info&REG_UIMPOSSIBLE) { + FreeVars(v); + return REG_NOMATCH; + } + backref = (v->g->info&REG_UBACKREF) ? 1 : 0; + v->eflags = flags; + if (v->g->cflags&REG_NOSUB) { + nmatch = 0; /* override client */ + } + v->nmatch = nmatch; + if (backref) { + /* + * Need work area. + */ + + if (v->g->nsub + 1 <= LOCALMAT) { + v->pmatch = mat; + } else { + v->pmatch = (regmatch_t *) + MALLOC((v->g->nsub + 1) * sizeof(regmatch_t)); + } + if (v->pmatch == NULL) { + FreeVars(v); + return REG_ESPACE; + } + v->nmatch = v->g->nsub + 1; + } else { + v->pmatch = pmatch; + } + v->details = details; + v->start = (chr *)string; + v->stop = (chr *)string + len; + v->err = 0; + if (backref) { + /* + * Need retry memory. 
+ */ + + assert(v->g->ntree >= 0); + n = (size_t)v->g->ntree; + if (n <= LOCALMEM) { + v->mem = mem; + } else { + v->mem = (regoff_t *) MALLOC(n*sizeof(regoff_t)); + } + if (v->mem == NULL) { + if (v->pmatch != pmatch && v->pmatch != mat) { + FREE(v->pmatch); + } + FreeVars(v); + return REG_ESPACE; + } + } else { + v->mem = NULL; + } + + /* + * Do it. + */ + + assert(v->g->tree != NULL); + if (backref) { + st = cfind(v, &v->g->tree->cnfa, &v->g->cmap); + } else { + st = find(v, &v->g->tree->cnfa, &v->g->cmap); + } + + /* + * Copy (portion of) match vector over if necessary. + */ + + if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) { + zapsubs(pmatch, nmatch); + n = (nmatch < v->nmatch) ? nmatch : v->nmatch; + memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t)); + } + + /* + * Clean up. + */ + + if (v->pmatch != pmatch && v->pmatch != mat) { + FREE(v->pmatch); + } + if (v->mem != NULL && v->mem != mem) { + FREE(v->mem); + } + FreeVars(v); + return st; +} + +/* + - find - find a match for the main NFA (no-complications case) + ^ static int find(struct vars *, struct cnfa *, struct colormap *); + */ +static int +find( + struct vars *v, + struct cnfa *cnfa, + struct colormap *cm) +{ + struct dfa *s; + struct dfa *d; + chr *begin; + chr *end = NULL; + chr *cold; + chr *open; /* Open and close of range of possible + * starts */ + chr *close; + int hitend; + int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0; + + /* + * First, a shot with the search RE. 
+ */ + + s = newdfa(v, &v->g->search, cm, &v->dfa1); + assert(!(ISERR() && s != NULL)); + NOERR(); + MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); + cold = NULL; + close = shortest(v, s, v->start, v->start, v->stop, &cold, NULL); + freedfa(s); + NOERR(); + if (v->g->cflags&REG_EXPECT) { + assert(v->details != NULL); + if (cold != NULL) { + v->details->rm_extend.rm_so = OFF(cold); + } else { + v->details->rm_extend.rm_so = OFF(v->stop); + } + v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ + } + if (close == NULL) { /* not found */ + return REG_NOMATCH; + } + if (v->nmatch == 0) { /* found, don't need exact location */ + return REG_OKAY; + } + + /* + * Find starting point and match. + */ + + assert(cold != NULL); + open = cold; + cold = NULL; + MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close))); + d = newdfa(v, cnfa, cm, &v->dfa1); + assert(!(ISERR() && d != NULL)); + NOERR(); + for (begin = open; begin <= close; begin++) { + MDEBUG(("\nfind trying at %ld\n", LOFF(begin))); + if (shorter) { + end = shortest(v, d, begin, begin, v->stop, NULL, &hitend); + } else { + end = longest(v, d, begin, v->stop, &hitend); + } + NOERR(); + if (hitend && cold == NULL) { + cold = begin; + } + if (end != NULL) { + break; /* NOTE BREAK OUT */ + } + } + assert(end != NULL); /* search RE succeeded so loop should */ + freedfa(d); + + /* + * And pin down details. + */ + + assert(v->nmatch > 0); + v->pmatch[0].rm_so = OFF(begin); + v->pmatch[0].rm_eo = OFF(end); + if (v->g->cflags&REG_EXPECT) { + if (cold != NULL) { + v->details->rm_extend.rm_so = OFF(cold); + } else { + v->details->rm_extend.rm_so = OFF(v->stop); + } + v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ + } + if (v->nmatch == 1) { /* no need for submatches */ + return REG_OKAY; + } + + /* + * Submatches. 
+ */ + + zapsubs(v->pmatch, v->nmatch); + return dissect(v, v->g->tree, begin, end); +} + +/* + - cfind - find a match for the main NFA (with complications) + ^ static int cfind(struct vars *, struct cnfa *, struct colormap *); + */ +static int +cfind( + struct vars *v, + struct cnfa *cnfa, + struct colormap *cm) +{ + struct dfa *s; + struct dfa *d; + chr *cold = NULL; /* silence gcc 4 warning */ + int ret; + + s = newdfa(v, &v->g->search, cm, &v->dfa1); + NOERR(); + d = newdfa(v, cnfa, cm, &v->dfa2); + if (ISERR()) { + assert(d == NULL); + freedfa(s); + return v->err; + } + + ret = cfindloop(v, cnfa, cm, d, s, &cold); + + freedfa(d); + freedfa(s); + NOERR(); + if (v->g->cflags&REG_EXPECT) { + assert(v->details != NULL); + if (cold != NULL) { + v->details->rm_extend.rm_so = OFF(cold); + } else { + v->details->rm_extend.rm_so = OFF(v->stop); + } + v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ + } + return ret; +} + +/* + - cfindloop - the heart of cfind + ^ static int cfindloop(struct vars *, struct cnfa *, struct colormap *, + ^ struct dfa *, struct dfa *, chr **); + */ +static int +cfindloop( + struct vars *v, + struct cnfa *cnfa, + struct colormap *cm, + struct dfa *d, + struct dfa *s, + chr **coldp) /* where to put coldstart pointer */ +{ + chr *begin; + chr *end; + chr *cold; + chr *open; /* Open and close of range of possible + * starts */ + chr *close; + chr *estart; + chr *estop; + int er; + int shorter = v->g->tree->flags&SHORTER; + int hitend; + + assert(d != NULL && s != NULL); + cold = NULL; + close = v->start; + do { + MDEBUG(("\ncsearch at %ld\n", LOFF(close))); + close = shortest(v, s, close, close, v->stop, &cold, NULL); + if (close == NULL) { + break; /* NOTE BREAK */ + } + assert(cold != NULL); + open = cold; + cold = NULL; + MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close))); + for (begin = open; begin <= close; begin++) { + MDEBUG(("\ncfind trying at %ld\n", LOFF(begin))); + estart = begin; + estop = v->stop; + for (;;) { + if 
(shorter) { + end = shortest(v, d, begin, estart, estop, NULL, &hitend); + } else { + end = longest(v, d, begin, estop, &hitend); + } + if (hitend && cold == NULL) { + cold = begin; + } + if (end == NULL) { + break; /* NOTE BREAK OUT */ + } + + MDEBUG(("tentative end %ld\n", LOFF(end))); + zapsubs(v->pmatch, v->nmatch); + zapmem(v, v->g->tree); + er = cdissect(v, v->g->tree, begin, end); + if (er == REG_OKAY) { + if (v->nmatch > 0) { + v->pmatch[0].rm_so = OFF(begin); + v->pmatch[0].rm_eo = OFF(end); + } + *coldp = cold; + return REG_OKAY; + } + if (er != REG_NOMATCH) { + ERR(er); + return er; + } + if ((shorter) ? end == estop : end == begin) { + /* + * No point in trying again. + */ + + *coldp = cold; + return REG_NOMATCH; + } + + /* + * Go around and try again + */ + + if (shorter) { + estart = end + 1; + } else { + estop = end - 1; + } + } + } + } while (close < v->stop); + + *coldp = cold; + return REG_NOMATCH; +} + +/* + - zapsubs - initialize the subexpression matches to "no match" + ^ static void zapsubs(regmatch_t *, size_t); + */ +static void +zapsubs( + regmatch_t *p, + size_t n) +{ + size_t i; + + for (i = n-1; i > 0; i--) { + p[i].rm_so = -1; + p[i].rm_eo = -1; + } +} + +/* + - zapmem - initialize the retry memory of a subtree to zeros + ^ static void zapmem(struct vars *, struct subre *); + */ +static void +zapmem( + struct vars *v, + struct subre *t) +{ + if (t == NULL) { + return; + } + + assert(v->mem != NULL); + v->mem[t->retry] = 0; + if (t->op == '(') { + assert(t->subno > 0); + v->pmatch[t->subno].rm_so = -1; + v->pmatch[t->subno].rm_eo = -1; + } + + if (t->left != NULL) { + zapmem(v, t->left); + } + if (t->right != NULL) { + zapmem(v, t->right); + } +} + +/* + - subset - set any subexpression relevant to a successful subre + ^ static void subset(struct vars *, struct subre *, chr *, chr *); + */ +static void +subset( + struct vars *v, + struct subre *sub, + chr *begin, + chr *end) +{ + int n = sub->subno; + + assert(n > 0); + if ((size_t)n >= 
v->nmatch) { + return; + } + + MDEBUG(("setting %d\n", n)); + v->pmatch[n].rm_so = OFF(begin); + v->pmatch[n].rm_eo = OFF(end); +} + +/* + - dissect - determine subexpression matches (uncomplicated case) + ^ static int dissect(struct vars *, struct subre *, chr *, chr *); + */ +static int /* regexec return code */ +dissect( + struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + assert(t != NULL); + MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end))); + + switch (t->op) { + case '=': /* terminal node */ + assert(t->left == NULL && t->right == NULL); + return REG_OKAY; /* no action, parent did the work */ + break; + case '|': /* alternation */ + assert(t->left != NULL); + return altdissect(v, t, begin, end); + break; + case 'b': /* back ref -- shouldn't be calling us! */ + return REG_ASSERT; + break; + case '.': /* concatenation */ + assert(t->left != NULL && t->right != NULL); + return condissect(v, t, begin, end); + break; + case '(': /* capturing */ + assert(t->left != NULL && t->right == NULL); + assert(t->subno > 0); + subset(v, t, begin, end); + return dissect(v, t->left, begin, end); + break; + default: + return REG_ASSERT; + break; + } +} + +/* + - condissect - determine concatenation subexpression matches (uncomplicated) + ^ static int condissect(struct vars *, struct subre *, chr *, chr *); + */ +static int /* regexec return code */ +condissect( + struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + struct dfa *d2; + chr *mid; + int i; + int shorter = (t->left->flags&SHORTER) ? 1 : 0; + chr *stop = (shorter) ? 
end : begin; + + assert(t->op == '.'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->right != NULL && t->right->cnfa.nstates > 0); + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); + NOERR(); + d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2); + if (ISERR()) { + assert(d2 == NULL); + freedfa(d); + return v->err; + } + + /* + * Pick a tentative midpoint. + */ + + if (shorter) { + mid = shortest(v, d, begin, begin, end, NULL, NULL); + } else { + mid = longest(v, d, begin, end, NULL); + } + if (mid == NULL) { + freedfa(d); + freedfa(d2); + return REG_ASSERT; + } + MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); + + /* + * Iterate until satisfaction or failure. + */ + + while (longest(v, d2, mid, end, NULL) != end) { + /* + * That midpoint didn't work, find a new one. + */ + + if (mid == stop) { + /* + * All possibilities exhausted! + */ + + MDEBUG(("no midpoint!\n")); + freedfa(d); + freedfa(d2); + return REG_ASSERT; + } + if (shorter) { + mid = shortest(v, d, begin, mid+1, end, NULL, NULL); + } else { + mid = longest(v, d, begin, mid-1, NULL); + } + if (mid == NULL) { + /* + * Failed to find a new one! + */ + + MDEBUG(("failed midpoint!\n")); + freedfa(d); + freedfa(d2); + return REG_ASSERT; + } + MDEBUG(("new midpoint %ld\n", LOFF(mid))); + } + + /* + * Satisfaction. 
+ */ + + MDEBUG(("successful\n")); + freedfa(d); + freedfa(d2); + i = dissect(v, t->left, begin, mid); + if (i != REG_OKAY) { + return i; + } + return dissect(v, t->right, mid, end); +} + +/* + - altdissect - determine alternative subexpression matches (uncomplicated) + ^ static int altdissect(struct vars *, struct subre *, chr *, chr *); + */ +static int /* regexec return code */ +altdissect( + struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + int i; + + assert(t != NULL); + assert(t->op == '|'); + + for (i = 0; t != NULL; t = t->right, i++) { + MDEBUG(("trying %dth\n", i)); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); + if (ISERR()) { + return v->err; + } + if (longest(v, d, begin, end, NULL) == end) { + MDEBUG(("success\n")); + freedfa(d); + return dissect(v, t->left, begin, end); + } + freedfa(d); + } + return REG_ASSERT; /* none of them matched?!? */ +} + +/* + - cdissect - determine subexpression matches (with complications) + * The retry memory stores the offset of the trial midpoint from begin, plus 1 + * so that 0 uniquely means "clean slate". + ^ static int cdissect(struct vars *, struct subre *, chr *, chr *); + */ +static int /* regexec return code */ +cdissect( + struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + int er; + + assert(t != NULL); + MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op)); + + switch (t->op) { + case '=': /* terminal node */ + assert(t->left == NULL && t->right == NULL); + return REG_OKAY; /* no action, parent did the work */ + break; + case '|': /* alternation */ + assert(t->left != NULL); + return caltdissect(v, t, begin, end); + break; + case 'b': /* back ref -- shouldn't be calling us! 
*/ + assert(t->left == NULL && t->right == NULL); + return cbrdissect(v, t, begin, end); + break; + case '.': /* concatenation */ + assert(t->left != NULL && t->right != NULL); + return ccondissect(v, t, begin, end); + break; + case '(': /* capturing */ + assert(t->left != NULL && t->right == NULL); + assert(t->subno > 0); + er = cdissect(v, t->left, begin, end); + if (er == REG_OKAY) { + subset(v, t, begin, end); + } + return er; + break; + default: + return REG_ASSERT; + break; + } +} + +/* + - ccondissect - concatenation subexpression matches (with complications) + * The retry memory stores the offset of the trial midpoint from begin, plus 1 + * so that 0 uniquely means "clean slate". + ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *); + */ +static int /* regexec return code */ +ccondissect( + struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + struct dfa *d2; + chr *mid; + int er; + + assert(t->op == '.'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->right != NULL && t->right->cnfa.nstates > 0); + + if (t->left->flags&SHORTER) { /* reverse scan */ + return crevdissect(v, t, begin, end); + } + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) { + return v->err; + } + d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) { + freedfa(d); + return v->err; + } + MDEBUG(("cconcat %d\n", t->retry)); + + /* + * Pick a tentative midpoint. + */ + + if (v->mem[t->retry] == 0) { + mid = longest(v, d, begin, end, NULL); + if (mid == NULL) { + freedfa(d); + freedfa(d2); + return REG_NOMATCH; + } + MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); + v->mem[t->retry] = (mid - begin) + 1; + } else { + mid = begin + (v->mem[t->retry] - 1); + MDEBUG(("working midpoint %ld\n", LOFF(mid))); + } + + /* + * Iterate until satisfaction or failure. + */ + + for (;;) { + /* + * Try this midpoint on for size. 
+ */ + + er = cdissect(v, t->left, begin, mid); + if ((er == REG_OKAY) && (longest(v, d2, mid, end, NULL) == end) + && (er = cdissect(v, t->right, mid, end)) == REG_OKAY) { + break; /* NOTE BREAK OUT */ + } + if ((er != REG_OKAY) && (er != REG_NOMATCH)) { + freedfa(d); + freedfa(d2); + return er; + } + + /* + * That midpoint didn't work, find a new one. + */ + + if (mid == begin) { + /* + * All possibilities exhausted. + */ + + MDEBUG(("%d no midpoint\n", t->retry)); + freedfa(d); + freedfa(d2); + return REG_NOMATCH; + } + mid = longest(v, d, begin, mid-1, NULL); + if (mid == NULL) { + /* + * Failed to find a new one. + */ + + MDEBUG(("%d failed midpoint\n", t->retry)); + freedfa(d); + freedfa(d2); + return REG_NOMATCH; + } + MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); + v->mem[t->retry] = (mid - begin) + 1; + zapmem(v, t->left); + zapmem(v, t->right); + } + + /* + * Satisfaction. + */ + + MDEBUG(("successful\n")); + freedfa(d); + freedfa(d2); + return REG_OKAY; +} + +/* + - crevdissect - determine backref shortest-first subexpression matches + * The retry memory stores the offset of the trial midpoint from begin, plus 1 + * so that 0 uniquely means "clean slate". + ^ static int crevdissect(struct vars *, struct subre *, chr *, chr *); + */ +static int /* regexec return code */ +crevdissect( + struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + struct dfa *d2; + chr *mid; + int er; + + assert(t->op == '.'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->right != NULL && t->right->cnfa.nstates > 0); + assert(t->left->flags&SHORTER); + + /* + * Concatenation -- need to split the substring between parts. 
+ */ + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) { + return v->err; + } + d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) { + freedfa(d); + return v->err; + } + MDEBUG(("crev %d\n", t->retry)); + + /* + * Pick a tentative midpoint. + */ + + if (v->mem[t->retry] == 0) { + mid = shortest(v, d, begin, begin, end, NULL, NULL); + if (mid == NULL) { + freedfa(d); + freedfa(d2); + return REG_NOMATCH; + } + MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); + v->mem[t->retry] = (mid - begin) + 1; + } else { + mid = begin + (v->mem[t->retry] - 1); + MDEBUG(("working midpoint %ld\n", LOFF(mid))); + } + + /* + * Iterate until satisfaction or failure. + */ + + for (;;) { + /* + * Try this midpoint on for size. + */ + + er = cdissect(v, t->left, begin, mid); + if ((er == REG_OKAY) && (longest(v, d2, mid, end, NULL) == end) + && (er = cdissect(v, t->right, mid, end)) == REG_OKAY) { + break; /* NOTE BREAK OUT */ + } + if (er != REG_OKAY && er != REG_NOMATCH) { + freedfa(d); + freedfa(d2); + return er; + } + + /* + * That midpoint didn't work, find a new one. + */ + + if (mid == end) { + /* + * All possibilities exhausted. + */ + + MDEBUG(("%d no midpoint\n", t->retry)); + freedfa(d); + freedfa(d2); + return REG_NOMATCH; + } + mid = shortest(v, d, begin, mid+1, end, NULL, NULL); + if (mid == NULL) { + /* + * Failed to find a new one. + */ + + MDEBUG(("%d failed midpoint\n", t->retry)); + freedfa(d); + freedfa(d2); + return REG_NOMATCH; + } + MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); + v->mem[t->retry] = (mid - begin) + 1; + zapmem(v, t->left); + zapmem(v, t->right); + } + + /* + * Satisfaction. 
+ */ + + MDEBUG(("successful\n")); + freedfa(d); + freedfa(d2); + return REG_OKAY; +} + +/* + - cbrdissect - determine backref subexpression matches + ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *); + */ +static int /* regexec return code */ +cbrdissect( + struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + int i; + int n = t->subno; + size_t len; + chr *paren; + chr *p; + chr *stop; + int min = t->min; + int max = t->max; + + assert(t != NULL); + assert(t->op == 'b'); + assert(n >= 0); + assert((size_t)n < v->nmatch); + + MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max)); + + if (v->pmatch[n].rm_so == -1) { + return REG_NOMATCH; + } + paren = v->start + v->pmatch[n].rm_so; + len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; + + /* + * No room to maneuver -- retries are pointless. + */ + + if (v->mem[t->retry]) { + return REG_NOMATCH; + } + v->mem[t->retry] = 1; + + /* + * Special-case zero-length string. + */ + + if (len == 0) { + if (begin == end) { + return REG_OKAY; + } + return REG_NOMATCH; + } + + /* + * And too-short string. + */ + + assert(end >= begin); + if ((size_t)(end - begin) < len) { + return REG_NOMATCH; + } + stop = end - len; + + /* + * Count occurrences. + */ + + i = 0; + for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) { + if ((*v->g->compare)(paren, p, len) != 0) { + break; + } + i++; + } + MDEBUG(("cbackref found %d\n", i)); + + /* + * And sort it out. + */ + + if (p != end) { /* didn't consume all of it */ + return REG_NOMATCH; + } + if (min <= i && (i <= max || max == INFINITY)) { + return REG_OKAY; + } + return REG_NOMATCH; /* out of range */ +} + +/* + - caltdissect - determine alternative subexpression matches (w. 
complications) + ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *); + */ +static int /* regexec return code */ +caltdissect( + struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + int er; +#define UNTRIED 0 /* not yet tried at all */ +#define TRYING 1 /* top matched, trying submatches */ +#define TRIED 2 /* top didn't match or submatches exhausted */ + + if (t == NULL) { + return REG_NOMATCH; + } + assert(t->op == '|'); + if (v->mem[t->retry] == TRIED) { + return caltdissect(v, t->right, begin, end); + } + + MDEBUG(("calt n%d\n", t->retry)); + assert(t->left != NULL); + + if (v->mem[t->retry] == UNTRIED) { + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) { + return v->err; + } + if (longest(v, d, begin, end, NULL) != end) { + freedfa(d); + v->mem[t->retry] = TRIED; + return caltdissect(v, t->right, begin, end); + } + freedfa(d); + MDEBUG(("calt matched\n")); + v->mem[t->retry] = TRYING; + } + + er = cdissect(v, t->left, begin, end); + if (er != REG_NOMATCH) { + return er; + } + + v->mem[t->retry] = TRIED; + return caltdissect(v, t->right, begin, end); +} + +#include "rege_dfa.c" + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ diff --git a/contrib/hsrex/regfree.c b/contrib/hsrex/regfree.c new file mode 100644 index 0000000..b0aaa70 --- /dev/null +++ b/contrib/hsrex/regfree.c @@ -0,0 +1,60 @@ +/* + * regfree - free an RE + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. 
+ * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation of + * software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You might think that this could be incorporated into regcomp.c, and that + * would be a reasonable idea... except that this is a generic function (with + * a generic name), applicable to all compiled REs regardless of the size of + * their characters, whereas the stuff in regcomp.c gets compiled once per + * character size. + */ + +#include "regguts.h" + +/* + - regfree - free an RE (generic function, punts to RE-specific function) + * + * Ignoring invocation with NULL is a convenience. 
+ */ +void +regfree( + regex_t *re) +{ + if (re == NULL) { + return; + } + (*((struct fns *)re->re_fns)->free)(re); +} + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ diff --git a/contrib/hsrex/regguts.h b/contrib/hsrex/regguts.h new file mode 100644 index 0000000..67e3d03 --- /dev/null +++ b/contrib/hsrex/regguts.h @@ -0,0 +1,428 @@ +/* + * Internal interface definitions, etc., for the reg package + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation of + * software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Environmental customization. 
It should not (I hope) be necessary to alter + * the file you are now reading -- regcustom.h should handle it all, given + * care here and elsewhere. + */ +#include "regcustom.h" + +/* + * Things that regcustom.h might override. + */ + +/* standard header files (NULL is a reasonable indicator for them) */ +#ifndef NULL +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <limits.h> +#include <string.h> +#endif + +/* assertions */ +#ifndef assert +#ifndef REG_DEBUG +#ifndef NDEBUG +#define NDEBUG /* no assertions */ +#endif +#endif /* !REG_DEBUG */ +#include <assert.h> +#endif + +/* voids */ +#ifndef VOID +#define VOID void /* for function return values */ +#endif +#ifndef DISCARD +#define DISCARD void /* for throwing values away */ +#endif +#ifndef PVOID +#define PVOID void * /* generic pointer */ +#endif +#ifndef VS +#define VS(x) ((void*)(x)) /* cast something to generic ptr */ +#endif +#ifndef NOPARMS +#define NOPARMS void /* for empty parm lists */ +#endif + +/* const */ +#ifndef CONST +#define CONST const /* for old compilers, might be empty */ +#endif + +/* function-pointer declarator */ +#ifndef FUNCPTR +#if __STDC__ >= 1 +#define FUNCPTR(name, args) (*name)args +#else +#define FUNCPTR(name, args) (*name)() +#endif +#endif + +/* memory allocation */ +#ifndef MALLOC +#define MALLOC(n) malloc(n) +#endif +#ifndef REALLOC +#define REALLOC(p, n) realloc(VS(p), n) +#endif +#ifndef FREE +#define FREE(p) free(VS(p)) +#endif + +/* want size of a char in bits, and max value in bounded quantifiers */ +#ifndef CHAR_BIT +#include <limits.h> +#endif +#ifndef _POSIX2_RE_DUP_MAX +#define _POSIX2_RE_DUP_MAX 255 /* normally from <limits.h> */ +#endif + +/* + * misc + */ + +#define NOTREACHED 0 +#define xxx 1 + +#define DUPMAX _POSIX2_RE_DUP_MAX +#define INFINITY (DUPMAX+1) + +#define REMAGIC 0xfed7 /* magic number for main struct */ + +/* + * debugging facilities + */ +#ifdef REG_DEBUG +/* FDEBUG does finite-state tracing */ +#define FDEBUG(arglist) { if 
(v->eflags®_FTRACE) printf arglist; } +/* MDEBUG does higher-level tracing */ +#define MDEBUG(arglist) { if (v->eflags®_MTRACE) printf arglist; } +#else +#define FDEBUG(arglist) {} +#define MDEBUG(arglist) {} +#endif + +/* + * bitmap manipulation + */ +#define UBITS (CHAR_BIT * sizeof(unsigned)) +#define BSET(uv, sn) ((uv)[(sn)/UBITS] |= (unsigned)1 << ((sn)%UBITS)) +#define ISBSET(uv, sn) ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS))) + +/* + * We dissect a chr into byts for colormap table indexing. Here we define a + * byt, which will be the same as a byte on most machines... The exact size of + * a byt is not critical, but about 8 bits is good, and extraction of 8-bit + * chunks is sometimes especially fast. + */ + +#ifndef BYTBITS +#define BYTBITS 8 /* bits in a byt */ +#endif +#define BYTTAB (1<<BYTBITS) /* size of table with one entry per byt value */ +#define BYTMASK (BYTTAB-1) /* bit mask for byt */ +#define NBYTS ((CHRBITS+BYTBITS-1)/BYTBITS) +/* the definition of GETCOLOR(), below, assumes NBYTS <= 4 */ + +/* + * As soon as possible, we map chrs into equivalence classes -- "colors" -- + * which are of much more manageable number. + */ + +typedef short color; /* colors of characters */ +typedef int pcolor; /* what color promotes to */ +#define COLORLESS (-1) /* impossible color */ +#define WHITE 0 /* default color, parent of all others */ + +/* + * A colormap is a tree -- more precisely, a DAG -- indexed at each level by a + * byt of the chr, to map the chr to a color efficiently. Because lower + * sections of the tree can be shared, it can exploit the usual sparseness of + * such a mapping table. The tree is always NBYTS levels deep (in the past it + * was shallower during construction but was "filled" to full depth at the end + * of that); areas that are unaltered as yet point to "fill blocks" which are + * entirely WHITE in color. 
+ */ + +/* the tree itself */ +struct colors { + color ccolor[BYTTAB]; +}; +struct ptrs { + union tree *pptr[BYTTAB]; +}; +union tree { + struct colors colors; + struct ptrs ptrs; +}; +#define tcolor colors.ccolor +#define tptr ptrs.pptr + +/* Internal per-color descriptor structure for the color machinery */ +struct colordesc { + uchr nchrs; /* number of chars of this color */ + color sub; /* open subcolor (if any); free chain ptr */ +#define NOSUB COLORLESS + struct arc *arcs; /* color chain */ + int flags; +#define FREECOL 01 /* currently free */ +#define PSEUDO 02 /* pseudocolor, no real chars */ +#define UNUSEDCOLOR(cd) ((cd)->flags&FREECOL) + union tree *block; /* block of solid color, if any */ +}; + +/* the color map itself */ +struct colormap { + int magic; +#define CMMAGIC 0x876 + struct vars *v; /* for compile error reporting */ + size_t ncds; /* number of colordescs */ + size_t max; /* highest in use */ + color free; /* beginning of free chain (if non-0) */ + struct colordesc *cd; +#define CDEND(cm) (&(cm)->cd[(cm)->max + 1]) +#define NINLINECDS ((size_t)10) + struct colordesc cdspace[NINLINECDS]; + union tree tree[NBYTS]; /* tree top, plus fill blocks */ +}; + +/* optimization magic to do fast chr->color mapping */ +#define B0(c) ((c) & BYTMASK) +#define B1(c) (((c)>>BYTBITS) & BYTMASK) +#define B2(c) (((c)>>(2*BYTBITS)) & BYTMASK) +#define B3(c) (((c)>>(3*BYTBITS)) & BYTMASK) +#if NBYTS == 1 +#define GETCOLOR(cm, c) ((cm)->tree->tcolor[B0(c)]) +#endif +/* beware, for NBYTS>1, GETCOLOR() is unsafe -- 2nd arg used repeatedly */ +#if NBYTS == 2 +#define GETCOLOR(cm, c) ((cm)->tree->tptr[B1(c)]->tcolor[B0(c)]) +#endif +#if NBYTS == 4 +#define GETCOLOR(cm, c) ((cm)->tree->tptr[B3(c)]->tptr[B2(c)]->tptr[B1(c)]->tcolor[B0(c)]) +#endif + +/* + * Interface definitions for locale-interface functions in locale.c. + */ + +/* Representation of a set of characters. 
*/ +struct cvec { + int nchrs; /* number of chrs */ + int chrspace; /* number of chrs possible */ + chr *chrs; /* pointer to vector of chrs */ + int nranges; /* number of ranges (chr pairs) */ + int rangespace; /* number of chrs possible */ + chr *ranges; /* pointer to vector of chr pairs */ +}; + +/* + * definitions for non-deterministic finite autmaton (NFA) internal + * representation + * + * Having a "from" pointer within each arc may seem redundant, but it saves a + * lot of hassle. + */ + +struct state; + +struct arc { + int type; +#define ARCFREE '\0' + color co; + struct state *from; /* where it's from (and contained within) */ + struct state *to; /* where it's to */ + struct arc *outchain; /* *from's outs chain or free chain */ +#define freechain outchain + struct arc *inchain; /* *to's ins chain */ + struct arc *colorchain; /* color's arc chain */ + struct arc *colorchainRev; /* back-link in color's arc chain */ +}; + +struct arcbatch { /* for bulk allocation of arcs */ + struct arcbatch *next; +#define ABSIZE 10 + struct arc a[ABSIZE]; +}; + +struct state { + int no; +#define FREESTATE (-1) + char flag; /* marks special states */ + int nins; /* number of inarcs */ + struct arc *ins; /* chain of inarcs */ + int nouts; /* number of outarcs */ + struct arc *outs; /* chain of outarcs */ + struct arc *free; /* chain of free arcs */ + struct state *tmp; /* temporary for traversal algorithms */ + struct state *next; /* chain for traversing all */ + struct state *prev; /* back chain */ + struct arcbatch oas; /* first arcbatch, avoid malloc in easy case */ + int noas; /* number of arcs used in first arcbatch */ +}; + +struct nfa { + struct state *pre; /* pre-initial state */ + struct state *init; /* initial state */ + struct state *final; /* final state */ + struct state *post; /* post-final state */ + int nstates; /* for numbering states */ + struct state *states; /* state-chain header */ + struct state *slast; /* tail of the chain */ + struct state *free; /* 
free list */ + struct colormap *cm; /* the color map */ + color bos[2]; /* colors, if any, assigned to BOS and BOL */ + color eos[2]; /* colors, if any, assigned to EOS and EOL */ + size_t size; /* Current NFA size; differs from nstates as + * it also counts the number of states created + * by children of this state. */ + struct vars *v; /* simplifies compile error reporting */ + struct nfa *parent; /* parent NFA, if any */ +}; + +/* + * definitions for compacted NFA + */ + +struct carc { + color co; /* COLORLESS is list terminator */ + int to; /* state number */ +}; + +struct cnfa { + int nstates; /* number of states */ + int ncolors; /* number of colors */ + int flags; +#define HASLACONS 01 /* uses lookahead constraints */ + int pre; /* setup state number */ + int post; /* teardown state number */ + color bos[2]; /* colors, if any, assigned to BOS and BOL */ + color eos[2]; /* colors, if any, assigned to EOS and EOL */ + struct carc **states; /* vector of pointers to outarc lists */ + struct carc *arcs; /* the area for the lists */ +}; +#define ZAPCNFA(cnfa) ((cnfa).nstates = 0) +#define NULLCNFA(cnfa) ((cnfa).nstates == 0) + +/* + * Used to limit the maximum NFA size to something sane. [Bug 1810264] + */ + +#ifndef REG_MAX_STATES +# define REG_MAX_STATES 100000 +#endif + +/* + * subexpression tree + */ + +struct subre { + char op; /* '|', '.' 
(concat), 'b' (backref), '(', + * '=' */ + char flags; +#define LONGER 01 /* prefers longer match */ +#define SHORTER 02 /* prefers shorter match */ +#define MIXED 04 /* mixed preference below */ +#define CAP 010 /* capturing parens below */ +#define BACKR 020 /* back reference below */ +#define INUSE 0100 /* in use in final tree */ +#define LOCAL 03 /* bits which may not propagate up */ +#define LMIX(f) ((f)<<2) /* LONGER -> MIXED */ +#define SMIX(f) ((f)<<1) /* SHORTER -> MIXED */ +#define UP(f) (((f)&~LOCAL) | (LMIX(f) & SMIX(f) & MIXED)) +#define MESSY(f) ((f)&(MIXED|CAP|BACKR)) +#define PREF(f) ((f)&LOCAL) +#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2)) +#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) + short retry; /* index into retry memory */ + int subno; /* subexpression number (for 'b' and '(') */ + short min; /* min repetitions, for backref only */ + short max; /* max repetitions, for backref only */ + struct subre *left; /* left child, if any (also freelist chain) */ + struct subre *right; /* right child, if any */ + struct state *begin; /* outarcs from here... */ + struct state *end; /* ...ending in inarcs here */ + struct cnfa cnfa; /* compacted NFA, if any */ + struct subre *chain; /* for bookkeeping and error cleanup */ +}; + +/* + * table of function pointers for generic manipulation functions. A regex_t's + * re_fns points to one of these. 
+ */ + +struct fns { + VOID FUNCPTR(free, (regex_t *)); +}; + +/* + * the insides of a regex_t, hidden behind a void * + */ + +struct guts { + int magic; +#define GUTSMAGIC 0xfed9 + int cflags; /* copy of compile flags */ + long info; /* copy of re_info */ + size_t nsub; /* copy of re_nsub */ + struct subre *tree; + struct cnfa search; /* for fast preliminary search */ + int ntree; + struct colormap cmap; + int FUNCPTR(compare, (CONST chr *, CONST chr *, size_t)); + struct subre *lacons; /* lookahead-constraint vector */ + int nlacons; /* size of lacons */ +}; + +/* + * Magic for allocating a variable workspace. This default version is + * stack-hungry. + */ + +#ifndef AllocVars +#define AllocVars(vPtr) \ + struct vars var; \ + register struct vars *vPtr = &var +#endif +#ifndef FreeVars +#define FreeVars(vPtr) ((void) 0) +#endif + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ diff --git a/src/Makefile.am b/src/Makefile.am index 5b2572e..b850905 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -13,7 +13,7 @@ include $(top_srcdir)/contrib/scintilla.am # FIXME: Common flags should be in configure.ac AM_CFLAGS = -std=gnu11 -Wall -Wno-initializer-overrides -Wno-unused-value -AM_CPPFLAGS += -I$(top_srcdir)/contrib/rb3ptr +AM_CPPFLAGS += -I$(top_srcdir)/contrib/rb3ptr -I$(top_srcdir)/contrib/hsrex AM_LDFLAGS = if STATIC_EXECUTABLES @@ -57,7 +57,8 @@ libsciteco_base_la_SOURCES = main.c sciteco.h list.h \ # NOTE: We cannot link in Scintilla (static library) into # a libtool convenience library libsciteco_base_la_LIBADD = $(LIBSCITECO_INTERFACE) \ - $(top_builddir)/contrib/rb3ptr/librb3ptr.la + $(top_builddir)/contrib/rb3ptr/librb3ptr.la \ + $(top_builddir)/contrib/hsrex/libhswrex.la if REPLACE_MALLOC libsciteco_base_la_LIBADD += $(top_builddir)/contrib/dlmalloc/libdlmalloc.la endif diff --git a/src/search.c b/src/search.c index 01c598e..81d2074 100644 --- a/src/search.c +++ b/src/search.c @@ -24,6 +24,13 @@ #include 
<glib.h> #include <glib/gprintf.h> +/* should always be Henry Spencer's version from contrib/hsrex */ +#define REGEX_STANDALONE +//#define REGEX_WCHAR +#include <regalone.h> +#include <regex.h> +G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(regex_t, regfree); + #include "sciteco.h" #include "string-utils.h" #include "expressions.h" @@ -463,53 +470,38 @@ teco_pattern2regexp(teco_string_t *pattern, guint codepage, gboolean single_expr } static gboolean -teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error) +teco_do_search(regex_t *re, gsize from, gsize to, gint *count, GError **error) { - g_autoptr(GMatchInfo) info = NULL; - const gchar *buffer = (const gchar *)teco_interface_ssm(SCI_GETCHARACTERPOINTER, 0, 0); - GError *tmp_error = NULL; - - /* - * NOTE: The return boolean does NOT signal whether an error was generated. - */ - g_regex_match_full(re, buffer, (gssize)to, from, 0, &info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } + regmatch_t info = {.rm_so = from, .rm_eo = to}; + /* FIXME: avoid moving the gap here */ + const guchar *buffer = (const guchar *)teco_interface_ssm(SCI_GETCHARACTERPOINTER, 0, 0); gint matched_from = -1, matched_to = -1; if (*count >= 0) { - while (g_match_info_matches(info) && --(*count)) { - /* - * NOTE: The return boolean does NOT signal whether an error was generated. 
- */ - g_match_info_next(info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } - } - - if (!*count) + gint rc; + while ((rc = re_exec(re, buffer+from, to-from, NULL, 1, &info, REG_NOTEOL | REG_NOTBOL)) == REG_OKAY && --(*count)) + from += info.rm_eo; + if (rc == REG_OKAY) { /* successful */ - g_match_info_fetch_pos(info, 0, - &matched_from, &matched_to); + matched_from = from+info.rm_so; + matched_to = from+info.rm_eo; + } else if (rc != REG_NOMATCH) { + // FIXME: Use regerror() + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED, + "Error executing regular expression"); + return FALSE; + } } else { /* only keep the last `count' matches, in a circular stack */ - typedef struct { - gint from, to; - } teco_range_t; - - gsize matched_size = sizeof(teco_range_t) * -*count; + gsize matched_size = sizeof(regmatch_t) * -*count; /* * matched_size could overflow. * NOTE: Glib 2.48 has g_size_checked_mul() which uses * compiler intrinsics. */ - if (matched_size / sizeof(teco_range_t) != -*count) + if (matched_size / sizeof(regmatch_t) != -*count) /* guaranteed to fail either teco_memory_check() or g_malloc() */ matched_size = G_MAXSIZE; @@ -522,32 +514,29 @@ teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error) if (!teco_memory_check(matched_size, error)) return FALSE; - g_autofree teco_range_t *matched = g_malloc(matched_size); + g_autofree regmatch_t *matched = g_malloc(matched_size); gint matched_total = 0, i = 0; - while (g_match_info_matches(info)) { - g_match_info_fetch_pos(info, 0, - &matched[i].from, &matched[i].to); - - /* - * NOTE: The return boolean does NOT signal whether an error was generated. 
- */ - g_match_info_next(info, &tmp_error); - if (tmp_error) { - g_propagate_error(error, tmp_error); - return FALSE; - } - + gint rc; + while ((rc = re_exec(re, buffer+from, to-from, NULL, 1, &matched[i], REG_NOTEOL | REG_NOTBOL | REG_STARTEND)) == REG_OKAY) { + matched[i].rm_so += from; + matched[i].rm_eo += from; + from = matched[i].rm_eo; i = ++matched_total % -(*count); } *count = MIN(*count + matched_total, 0); - if (!*count) { - /* successful -> i points to stack bottom */ - matched_from = matched[i].from; - matched_to = matched[i].to; + if (rc != REG_NOMATCH) { + // FIXME: Use regerror() + g_set_error_literal(error, TECO_ERROR, TECO_ERROR_FAILED, + "Error executing regular expression"); + return FALSE; } + + /* successful -> i points to stack bottom */ + matched_from = matched[i].rm_so; + matched_to = matched[i].rm_eo; } if (matched_from >= 0 && matched_to >= 0) @@ -560,14 +549,11 @@ teco_do_search(GRegex *re, gint from, gint to, gint *count, GError **error) static gboolean teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gsize new_chars, GError **error) { - /* FIXME: Should G_REGEX_OPTIMIZE be added under certain circumstances? */ - GRegexCompileFlags flags = G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_DOTALL; + gint flags = REG_EXTENDED | REG_ICASE; /* this is set in teco_state_search_initial() */ - if (ctx->expectstring.machine.codepage != SC_CP_UTF8) { - /* single byte encoding */ - flags |= G_REGEX_RAW; - } else if (!teco_string_validate_utf8(str)) { + if (ctx->expectstring.machine.codepage == SC_CP_UTF8 && + !teco_string_validate_utf8(str)) { /* * While SciTECO code is always guaranteed to be in valid UTF-8, * the result of string building may not (eg. if ^EQq inserts garbage). 
@@ -588,7 +574,7 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs !search_reg->vtable->set_integer(search_reg, TECO_FAILURE, error)) return FALSE; - g_autoptr(GRegex) re = NULL; + g_auto(regex_t) re = {0}; teco_string_t pattern = *str; g_autofree gchar *re_pattern; /* NOTE: teco_pattern2regexp() modifies str pointer */ @@ -602,10 +588,18 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs if (!*re_pattern) goto failure; /* - * FIXME: Should we propagate at least some of the errors? + * FIXME: We don't have to escape null characters in re_pattern. */ - re = g_regex_new(re_pattern, flags, 0, NULL); - if (!re) +#if 0 + gint rc = ctx->expectstring.machine.codepage == SC_CP_UTF8 + ? re_wcomp(&re, re_pattern, strlen(re_pattern), flags) + : re_comp(&re, re_pattern, strlen(re_pattern), flags); +#endif + // FIXME: Apparently this is the ASCII-only version, while re_wcomp() is the widechar version + // which expects UTF-32. + // This means that teco_pattern2regexp() would have to return an UTF-32 version. 
+ gint rc = re_comp(&re, re_pattern, strlen(re_pattern), flags); + if (rc) goto failure; if (!teco_qreg_current && @@ -616,7 +610,7 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs gint count = teco_search_parameters.count; - if (!teco_do_search(re, teco_search_parameters.from, teco_search_parameters.to, &count, error)) + if (!teco_do_search(&re, teco_search_parameters.from, teco_search_parameters.to, &count, error)) return FALSE; if (teco_search_parameters.to_buffer && count) { @@ -631,12 +625,12 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs teco_buffer_edit(buffer); if (buffer == teco_search_parameters.to_buffer) { - if (!teco_do_search(re, 0, teco_search_parameters.dot, &count, error)) + if (!teco_do_search(&re, 0, teco_search_parameters.dot, &count, error)) return FALSE; break; } - if (!teco_do_search(re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), + if (!teco_do_search(&re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), &count, error)) return FALSE; } while (count); @@ -646,14 +640,14 @@ teco_state_search_process(teco_machine_main_t *ctx, const teco_string_t *str, gs teco_buffer_edit(buffer); if (buffer == teco_search_parameters.to_buffer) { - if (!teco_do_search(re, teco_search_parameters.dot, + if (!teco_do_search(&re, teco_search_parameters.dot, teco_interface_ssm(SCI_GETLENGTH, 0, 0), &count, error)) return FALSE; break; } - if (!teco_do_search(re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), + if (!teco_do_search(&re, 0, teco_interface_ssm(SCI_GETLENGTH, 0, 0), &count, error)) return FALSE; } while (count); |