aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authornyamatongwe <devnull@localhost>2010-10-22 09:33:37 +1100
committernyamatongwe <devnull@localhost>2010-10-22 09:33:37 +1100
commitbddf4c06876ffd6bef0beede0be36e93031ea542 (patch)
treec54b2190efc552778ea4446cf9cd6f796165c307 /src
parent1f4d63e74f6c1f942e5602ecaec82188a814ce10 (diff)
downloadscintilla-mirror-bddf4c06876ffd6bef0beede0be36e93031ea542.tar.gz
Addition of '?' regular expression operator to indicate 0 or 1 occurrences
and non-greedy matches.
Diffstat (limited to 'src')
-rw-r--r--src/RESearch.cxx93
1 files changed, 62 insertions, 31 deletions
diff --git a/src/RESearch.cxx b/src/RESearch.cxx
index 3baa37c91..f26375fe5 100644
--- a/src/RESearch.cxx
+++ b/src/RESearch.cxx
@@ -17,6 +17,7 @@
* Put all global/static variables into an object so this code can be
* used from multiple threads, etc.
* Some extensions by Philippe Lhoste PhiLho(a)GMX.net
+ * '?' extensions by Michael Mullin masmullin@gmail.com
*
* These routines are the PUBLIC DOMAIN equivalents of regex
* routines as found in 4.nBSD UN*X, with minor extensions.
@@ -53,7 +54,7 @@
* Regular Expressions:
*
* [1] char matches itself, unless it is a special
- * character (metachar): . \ [ ] * + ^ $
+ * character (metachar): . \ [ ] * + ? ^ $
* and ( ) if posix option.
*
* [2] . matches any character.
@@ -65,12 +66,12 @@
* regex searches are made line per line
* (stripped of end-of-line chars).
* - if not in posix mode, when followed by a
- * left or right round bracket (see [7]);
- * - when followed by a digit 1 to 9 (see [8]);
+ * left or right round bracket (see [8]);
+ * - when followed by a digit 1 to 9 (see [9]);
* - when followed by a left or right angle bracket
- * (see [9]);
- * - when followed by d, D, s, S, w or W (see [10]);
- * - when followed by x and two hexa digits (see [11].
+ * (see [10]);
+ * - when followed by d, D, s, S, w or W (see [11]);
+ * - when followed by x and two hexa digits (see [12].
* Backslash is used as an escape character for all
* other meta-characters, and itself.
*
@@ -101,23 +102,28 @@
* [a-zA-Z] any alpha
*
* [5] * any regular expression form [1] to [4]
- * (except [7], [8] and [9] forms of [3]),
+ * (except [8], [9] and [10] forms of [3]),
* followed by closure char (*)
* matches zero or more matches of that form.
*
* [6] + same as [5], except it matches one or more.
- * Both [5] and [6] are greedy (they match as much as possible).
*
- * [7] a regular expression in the form [1] to [12], enclosed
+ * [5-6] Both [5] and [6] are greedy (they match as much as possible).
+ * Unless they are followed by the 'lazy' quantifier (?)
+ * In which case both [5] and [6] try to match as little as possible
+ *
+ * [7] ? same as [5] except it matches zero or one.
+ *
+ * [8] a regular expression in the form [1] to [13], enclosed
* as \(form\) (or (form) with posix flag) matches what
* form matches. The enclosure creates a set of tags,
- * used for [8] and for pattern substitution.
+ * used for [9] and for pattern substitution.
* The tagged forms are numbered starting from 1.
*
- * [8] a \ followed by a digit 1 to 9 matches whatever a
- * previously tagged regular expression ([7]) matched.
+ * [9] a \ followed by a digit 1 to 9 matches whatever a
+ * previously tagged regular expression ([8]) matched.
*
- * [9] \< a regular expression starting with a \< construct
+ * [10] \< a regular expression starting with a \< construct
* \> and/or ending with a \> construct, restricts the
* pattern matching to the beginning of a word, and/or
* the end of a word. A word is defined to be a character
@@ -126,7 +132,7 @@
* by user setting. The word must also be preceded and/or
* followed by any character outside those mentioned.
*
- * [10] \l a backslash followed by d, D, s, S, w or W,
+ * [11] \l a backslash followed by d, D, s, S, w or W,
* becomes a character class (both inside and
* outside sets []).
* d: decimal digits
@@ -136,16 +142,16 @@
* w: alphanumeric & underscore (changed by user setting)
* W: any char except alphanumeric & underscore (see above)
*
- * [11] \xHH a backslash followed by x and two hexa digits,
+ * [12] \xHH a backslash followed by x and two hexa digits,
* becomes the character whose Ascii code is equal
* to these digits. If not followed by two digits,
* it is 'x' char itself.
*
- * [12] a composite regular expression xy where x and y
- * are in the form [1] to [11] matches the longest
+ * [13] a composite regular expression xy where x and y
+ * are in the form [1] to [12] matches the longest
* match of x followed by a match for y.
*
- * [13] ^ a regular expression starting with a ^ character
+ * [14] ^ a regular expression starting with a ^ character
* $ and/or ending with a $ character, restricts the
* pattern matching to the beginning of the line,
* or the end of line. [anchors] Elsewhere in the
@@ -226,6 +232,8 @@ using namespace Scintilla;
#define EOW 9
#define REF 10
#define CLO 11
+#define CLQ 12 /* 0 to 1 closure */
+#define LCLO 13 /* lazy closure */
#define END 0
@@ -356,8 +364,8 @@ static int GetHexaChar(unsigned char hd1, unsigned char hd2) {
* or -1 for a char class. In this case, bittab is changed.
*/
int RESearch::GetBackslashExpression(
- const char *pattern,
- int &incr) {
+ const char *pattern,
+ int &incr) {
// Since error reporting is primitive and messages are not used anyway,
// I choose to interpret unexpected syntax in a logical way instead
// of reporting errors. Otherwise, we can stick on, eg., PCRE behavior.
@@ -593,10 +601,11 @@ const char *RESearch::Compile(const char *pattern, int length, bool caseSensitiv
case '*': /* match 0 or more... */
case '+': /* match 1 or more... */
+ case '?':
if (p == pattern)
return badpat("Empty closure");
lp = sp; /* previous opcode */
- if (*lp == CLO) /* equivalence... */
+ if (*lp == CLO || *lp == LCLO) /* equivalence... */
break;
switch (*lp) {
@@ -618,9 +627,13 @@ const char *RESearch::Compile(const char *pattern, int length, bool caseSensitiv
*mp++ = END;
*mp++ = END;
sp = mp;
+
while (--mp > lp)
*mp = mp[-1];
- *mp = CLO;
+ if (*p == '?') *mp = CLQ;
+ else if (*(p+1) == '?') *mp = LCLO;
+ else *mp = CLO;
+
mp = sp;
break;
@@ -845,6 +858,7 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
int bp; /* beginning of subpat... */
int ep; /* ending of subpat... */
int are; /* to save the line ptr. */
+ int llp; /* lazy lp for LCLO */
while ((op = *ap++) != END)
switch (op) {
@@ -879,7 +893,7 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
case EOT:
eopat[*ap++] = lp;
break;
- case BOW:
+ case BOW:
if ((lp!=bol && iswordc(ci.CharAt(lp-1))) || !iswordc(ci.CharAt(lp)))
return NOTFOUND;
break;
@@ -895,18 +909,27 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
if (ci.CharAt(bp++) != ci.CharAt(lp++))
return NOTFOUND;
break;
+ case LCLO:
+ case CLQ:
case CLO:
are = lp;
switch (*ap) {
case ANY:
- while (lp < endp)
+ if (op == CLO || op == LCLO)
+ while (lp < endp)
+ lp++;
+ else if (lp < endp)
lp++;
+
n = ANYSKIP;
break;
case CHR:
c = *(ap+1);
- while ((lp < endp) && (c == ci.CharAt(lp)))
+ if (op == CLO || op == LCLO)
+ while ((lp < endp) && (c == ci.CharAt(lp)))
+ lp++;
+ else if ((lp < endp) && (c == ci.CharAt(lp)))
lp++;
n = CHRSKIP;
break;
@@ -920,15 +943,23 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
//re_fail("closure: bad nfa.", *ap);
return NOTFOUND;
}
-
ap += n;
- while (lp >= are) {
- if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
- return e;
- --lp;
+ llp = lp;
+ e = NOTFOUND;
+ while (llp >= are) {
+ int q;
+ if ((q = PMatch(ci, llp, endp, ap)) != NOTFOUND) {
+ e = q;
+ lp = llp;
+ if (op != LCLO) return e;
+ }
+ if (*ap == END) return e;
+ --llp;
}
- return NOTFOUND;
+ if (*ap == EOT)
+ PMatch(ci, lp, endp, ap);
+ return e;
default:
//re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
return NOTFOUND;