diff options
| -rw-r--r-- | doc/ScintillaHistory.html | 2 | ||||
| -rw-r--r-- | src/RESearch.cxx | 93 | 
2 files changed, 64 insertions, 31 deletions
| diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html index 0c268be02..13ab7954b 100644 --- a/doc/ScintillaHistory.html +++ b/doc/ScintillaHistory.html @@ -355,6 +355,8 @@  	<td>Jordan Russell</td>  	<td>Farshid Lashkari</td>  	<td>Sam Rawlins</td> +      </tr><tr> +	<td>Michael Mullin</td>      </tr>      </table>      <p> diff --git a/src/RESearch.cxx b/src/RESearch.cxx index 3baa37c91..f26375fe5 100644 --- a/src/RESearch.cxx +++ b/src/RESearch.cxx @@ -17,6 +17,7 @@   * Put all global/static variables into an object so this code can be   * used from multiple threads, etc.   * Some extensions by Philippe Lhoste PhiLho(a)GMX.net + * '?' extensions by Michael Mullin masmullin@gmail.com   *   * These routines are the PUBLIC DOMAIN equivalents of regex   * routines as found in 4.nBSD UN*X, with minor extensions. @@ -53,7 +54,7 @@   * Regular Expressions:   *   *      [1]     char    matches itself, unless it is a special - *                      character (metachar): . \ [ ] * + ^ $ + *                      character (metachar): . \ [ ] * + ? ^ $   *                      and ( ) if posix option.   *   *      [2]     .       matches any character. @@ -65,12 +66,12 @@   *                      regex searches are made line per line   *                      (stripped of end-of-line chars).   *                      - if not in posix mode, when followed by a - *                      left or right round bracket (see [7]); - *                      - when followed by a digit 1 to 9 (see [8]); + *                      left or right round bracket (see [8]); + *                      - when followed by a digit 1 to 9 (see [9]);   *                      - when followed by a left or right angle bracket - *                      (see [9]); - *                      - when followed by d, D, s, S, w or W (see [10]); - *                      - when followed by x and two hexa digits (see [11]. + *                      (see [10]); + *                      - when followed by d, D, s, S, w or W (see [11]); + *                      - when followed by x and two hexa digits (see [12].   *                      Backslash is used as an escape character for all   *                      other meta-characters, and itself.   * @@ -101,23 +102,28 @@   *                              [a-zA-Z] any alpha   *   *      [5]     *       any regular expression form [1] to [4] - *                      (except [7], [8] and [9] forms of [3]), + *                      (except [8], [9] and [10] forms of [3]),   *                      followed by closure char (*)   *                      matches zero or more matches of that form.   *   *      [6]     +       same as [5], except it matches one or more. - *                      Both [5] and [6] are greedy (they match as much as possible).   * - *      [7]             a regular expression in the form [1] to [12], enclosed + *      [5-6]           Both [5] and [6] are greedy (they match as much as possible). + *                      Unless they are followed by the 'lazy' quantifier (?) + *                      In which case both [5] and [6] try to match as little as possible + * + *      [7]     ?       same as [5] except it matches zero or one. + * + *      [8]             a regular expression in the form [1] to [13], enclosed   *                      as \(form\) (or (form) with posix flag) matches what   *                      form matches. The enclosure creates a set of tags, - *                      used for [8] and for pattern substitution. + *                      used for [9] and for pattern substitution.   *                      The tagged forms are numbered starting from 1.   * - *      [8]             a \ followed by a digit 1 to 9 matches whatever a - *                      previously tagged regular expression ([7]) matched. + *      [9]             a \ followed by a digit 1 to 9 matches whatever a + *                      previously tagged regular expression ([8]) matched.   * - *      [9]     \<      a regular expression starting with a \< construct + *      [10]    \<      a regular expression starting with a \< construct   *              \>      and/or ending with a \> construct, restricts the   *                      pattern matching to the beginning of a word, and/or   *                      the end of a word. A word is defined to be a character @@ -126,7 +132,7 @@   *                      by user setting. The word must also be preceded and/or   *                      followed by any character outside those mentioned.   * - *      [10]    \l      a backslash followed by d, D, s, S, w or W, + *      [11]    \l      a backslash followed by d, D, s, S, w or W,   *                      becomes a character class (both inside and   *                      outside sets []).   *                        d: decimal digits @@ -136,16 +142,16 @@   *                        w: alphanumeric & underscore (changed by user setting)   *                        W: any char except alphanumeric & underscore (see above)   * - *      [11]    \xHH    a backslash followed by x and two hexa digits, + *      [12]    \xHH    a backslash followed by x and two hexa digits,   *                      becomes the character whose Ascii code is equal   *                      to these digits. If not followed by two digits,   *                      it is 'x' char itself.   * - *      [12]            a composite regular expression xy where x and y - *                      are in the form [1] to [11] matches the longest + *      [13]            a composite regular expression xy where x and y + *                      are in the form [1] to [12] matches the longest   *                      match of x followed by a match for y.   * - *      [13]    ^       a regular expression starting with a ^ character + *      [14]    ^       a regular expression starting with a ^ character   *              $       and/or ending with a $ character, restricts the   *                      pattern matching to the beginning of the line,   *                      or the end of line. [anchors] Elsewhere in the @@ -226,6 +232,8 @@ using namespace Scintilla;  #define EOW     9  #define REF     10  #define CLO     11 +#define CLQ     12 /* 0 to 1 closure */ +#define LCLO    13 /* lazy closure */  #define END     0 @@ -356,8 +364,8 @@ static int GetHexaChar(unsigned char hd1, unsigned char hd2) {   * or -1 for a char class. In this case, bittab is changed.   */  int RESearch::GetBackslashExpression( -		const char *pattern, -		int &incr) { +    const char *pattern, +    int &incr) {  	// Since error reporting is primitive and messages are not used anyway,  	// I choose to interpret unexpected syntax in a logical way instead  	// of reporting errors. Otherwise, we can stick on, eg., PCRE behavior. @@ -593,10 +601,11 @@ const char *RESearch::Compile(const char *pattern, int length, bool caseSensitiv  		case '*':               /* match 0 or more... */  		case '+':               /* match 1 or more... */ +		case '?':  			if (p == pattern)  				return badpat("Empty closure");  			lp = sp;		/* previous opcode */ -			if (*lp == CLO)		/* equivalence... */ +			if (*lp == CLO || *lp == LCLO)		/* equivalence... */  				break;  			switch (*lp) { @@ -618,9 +627,13 @@ const char *RESearch::Compile(const char *pattern, int length, bool caseSensitiv  			*mp++ = END;  			*mp++ = END;  			sp = mp; +  			while (--mp > lp)  				*mp = mp[-1]; -			*mp = CLO; +			if (*p == '?')          *mp = CLQ; +			else if (*(p+1) == '?') *mp = LCLO; +			else                    *mp = CLO; +  			mp = sp;  			break; @@ -845,6 +858,7 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {  	int bp;		/* beginning of subpat... */  	int ep;		/* ending of subpat...    */  	int are;	/* to save the line ptr.  */ +	int llp;	/* lazy lp for LCLO       */  	while ((op = *ap++) != END)  		switch (op) { @@ -879,7 +893,7 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {  		case EOT:  			eopat[*ap++] = lp;  			break; - 		case BOW: +		case BOW:  			if ((lp!=bol && iswordc(ci.CharAt(lp-1))) || !iswordc(ci.CharAt(lp)))  				return NOTFOUND;  			break; @@ -895,18 +909,27 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {  				if (ci.CharAt(bp++) != ci.CharAt(lp++))  					return NOTFOUND;  			break; +		case LCLO: +		case CLQ:  		case CLO:  			are = lp;  			switch (*ap) {  			case ANY: -				while (lp < endp) +				if (op == CLO || op == LCLO) +					while (lp < endp) +						lp++; +				else if (lp < endp)  					lp++; +  				n = ANYSKIP;  				break;  			case CHR:  				c = *(ap+1); -				while ((lp < endp) && (c == ci.CharAt(lp))) +				if (op == CLO || op == LCLO) +					while ((lp < endp) && (c == ci.CharAt(lp))) +						lp++; +				else if ((lp < endp) && (c == ci.CharAt(lp)))  					lp++;  				n = CHRSKIP;  				break; @@ -920,15 +943,23 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {  				//re_fail("closure: bad nfa.", *ap);  				return NOTFOUND;  			} -  			ap += n; -			while (lp >= are) { -				if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND) -					return e; -				--lp; +			llp = lp; +			e = NOTFOUND; +			while (llp >= are) { +				int q; +				if ((q = PMatch(ci, llp, endp, ap)) != NOTFOUND) { +					e = q; +					lp = llp; +					if (op != LCLO) return e; +				} +				if (*ap == END) return e; +				--llp;  			} -			return NOTFOUND; +			if (*ap == EOT) +				PMatch(ci, lp, endp, ap); +			return e;  		default:  			//re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));  			return NOTFOUND; | 
