diff options
author | nyamatongwe <devnull@localhost> | 2010-10-22 09:33:37 +1100 |
---|---|---|
committer | nyamatongwe <devnull@localhost> | 2010-10-22 09:33:37 +1100 |
commit | bddf4c06876ffd6bef0beede0be36e93031ea542 (patch) | |
tree | c54b2190efc552778ea4446cf9cd6f796165c307 /src | |
parent | 1f4d63e74f6c1f942e5602ecaec82188a814ce10 (diff) | |
download | scintilla-mirror-bddf4c06876ffd6bef0beede0be36e93031ea542.tar.gz |
Addition of '?' regular expression operator to indicate 0 or 1 occurrences
and non-greedy matches.
Diffstat (limited to 'src')
-rw-r--r-- | src/RESearch.cxx | 93 |
1 files changed, 62 insertions, 31 deletions
diff --git a/src/RESearch.cxx b/src/RESearch.cxx index 3baa37c91..f26375fe5 100644 --- a/src/RESearch.cxx +++ b/src/RESearch.cxx @@ -17,6 +17,7 @@ * Put all global/static variables into an object so this code can be * used from multiple threads, etc. * Some extensions by Philippe Lhoste PhiLho(a)GMX.net + * '?' extensions by Michael Mullin masmullin@gmail.com * * These routines are the PUBLIC DOMAIN equivalents of regex * routines as found in 4.nBSD UN*X, with minor extensions. @@ -53,7 +54,7 @@ * Regular Expressions: * * [1] char matches itself, unless it is a special - * character (metachar): . \ [ ] * + ^ $ + * character (metachar): . \ [ ] * + ? ^ $ * and ( ) if posix option. * * [2] . matches any character. @@ -65,12 +66,12 @@ * regex searches are made line per line * (stripped of end-of-line chars). * - if not in posix mode, when followed by a - * left or right round bracket (see [7]); - * - when followed by a digit 1 to 9 (see [8]); + * left or right round bracket (see [8]); + * - when followed by a digit 1 to 9 (see [9]); * - when followed by a left or right angle bracket - * (see [9]); - * - when followed by d, D, s, S, w or W (see [10]); - * - when followed by x and two hexa digits (see [11]. + * (see [10]); + * - when followed by d, D, s, S, w or W (see [11]); + * - when followed by x and two hexa digits (see [12]. * Backslash is used as an escape character for all * other meta-characters, and itself. * @@ -101,23 +102,28 @@ * [a-zA-Z] any alpha * * [5] * any regular expression form [1] to [4] - * (except [7], [8] and [9] forms of [3]), + * (except [8], [9] and [10] forms of [3]), * followed by closure char (*) * matches zero or more matches of that form. * * [6] + same as [5], except it matches one or more. - * Both [5] and [6] are greedy (they match as much as possible). * - * [7] a regular expression in the form [1] to [12], enclosed + * [5-6] Both [5] and [6] are greedy (they match as much as possible). + * Unless they are followed by the 'lazy' quantifier (?) + * In which case both [5] and [6] try to match as little as possible + * + * [7] ? same as [5] except it matches zero or one. + * + * [8] a regular expression in the form [1] to [13], enclosed * as \(form\) (or (form) with posix flag) matches what * form matches. The enclosure creates a set of tags, - * used for [8] and for pattern substitution. + * used for [9] and for pattern substitution. * The tagged forms are numbered starting from 1. * - * [8] a \ followed by a digit 1 to 9 matches whatever a - * previously tagged regular expression ([7]) matched. + * [9] a \ followed by a digit 1 to 9 matches whatever a + * previously tagged regular expression ([8]) matched. * - * [9] \< a regular expression starting with a \< construct + * [10] \< a regular expression starting with a \< construct * \> and/or ending with a \> construct, restricts the * pattern matching to the beginning of a word, and/or * the end of a word. A word is defined to be a character @@ -126,7 +132,7 @@ * by user setting. The word must also be preceded and/or * followed by any character outside those mentioned. * - * [10] \l a backslash followed by d, D, s, S, w or W, + * [11] \l a backslash followed by d, D, s, S, w or W, * becomes a character class (both inside and * outside sets []). * d: decimal digits @@ -136,16 +142,16 @@ * w: alphanumeric & underscore (changed by user setting) * W: any char except alphanumeric & underscore (see above) * - * [11] \xHH a backslash followed by x and two hexa digits, + * [12] \xHH a backslash followed by x and two hexa digits, * becomes the character whose Ascii code is equal * to these digits. If not followed by two digits, * it is 'x' char itself. * - * [12] a composite regular expression xy where x and y - * are in the form [1] to [11] matches the longest + * [13] a composite regular expression xy where x and y + * are in the form [1] to [12] matches the longest * match of x followed by a match for y. * - * [13] ^ a regular expression starting with a ^ character + * [14] ^ a regular expression starting with a ^ character * $ and/or ending with a $ character, restricts the * pattern matching to the beginning of the line, * or the end of line. [anchors] Elsewhere in the @@ -226,6 +232,8 @@ using namespace Scintilla; #define EOW 9 #define REF 10 #define CLO 11 +#define CLQ 12 /* 0 to 1 closure */ +#define LCLO 13 /* lazy closure */ #define END 0 @@ -356,8 +364,8 @@ static int GetHexaChar(unsigned char hd1, unsigned char hd2) { * or -1 for a char class. In this case, bittab is changed. */ int RESearch::GetBackslashExpression( - const char *pattern, - int &incr) { + const char *pattern, + int &incr) { // Since error reporting is primitive and messages are not used anyway, // I choose to interpret unexpected syntax in a logical way instead // of reporting errors. Otherwise, we can stick on, eg., PCRE behavior. @@ -593,10 +601,11 @@ const char *RESearch::Compile(const char *pattern, int length, bool caseSensitiv case '*': /* match 0 or more... */ case '+': /* match 1 or more... */ + case '?': if (p == pattern) return badpat("Empty closure"); lp = sp; /* previous opcode */ - if (*lp == CLO) /* equivalence... */ + if (*lp == CLO || *lp == LCLO) /* equivalence... */ break; switch (*lp) { @@ -618,9 +627,13 @@ const char *RESearch::Compile(const char *pattern, int length, bool caseSensitiv *mp++ = END; *mp++ = END; sp = mp; + while (--mp > lp) *mp = mp[-1]; - *mp = CLO; + if (*p == '?') *mp = CLQ; + else if (*(p+1) == '?') *mp = LCLO; + else *mp = CLO; + mp = sp; break; @@ -845,6 +858,7 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) { int bp; /* beginning of subpat... */ int ep; /* ending of subpat... */ int are; /* to save the line ptr. */ + int llp; /* lazy lp for LCLO */ while ((op = *ap++) != END) switch (op) { @@ -879,7 +893,7 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) { case EOT: eopat[*ap++] = lp; break; - case BOW: + case BOW: if ((lp!=bol && iswordc(ci.CharAt(lp-1))) || !iswordc(ci.CharAt(lp))) return NOTFOUND; break; @@ -895,18 +909,27 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) { if (ci.CharAt(bp++) != ci.CharAt(lp++)) return NOTFOUND; break; + case LCLO: + case CLQ: case CLO: are = lp; switch (*ap) { case ANY: - while (lp < endp) + if (op == CLO || op == LCLO) + while (lp < endp) + lp++; + else if (lp < endp) lp++; + n = ANYSKIP; break; case CHR: c = *(ap+1); - while ((lp < endp) && (c == ci.CharAt(lp))) + if (op == CLO || op == LCLO) + while ((lp < endp) && (c == ci.CharAt(lp))) + lp++; + else if ((lp < endp) && (c == ci.CharAt(lp))) lp++; n = CHRSKIP; break; @@ -920,15 +943,23 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) { //re_fail("closure: bad nfa.", *ap); return NOTFOUND; } - ap += n; - while (lp >= are) { - if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND) - return e; - --lp; + llp = lp; + e = NOTFOUND; + while (llp >= are) { + int q; + if ((q = PMatch(ci, llp, endp, ap)) != NOTFOUND) { + e = q; + lp = llp; + if (op != LCLO) return e; + } + if (*ap == END) return e; + --llp; } - return NOTFOUND; + if (*ap == EOT) + PMatch(ci, lp, endp, ap); + return e; default: //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op)); return NOTFOUND; |