2 files changed, 156 insertions, 161 deletions
diff --git a/src/RESearch.cxx b/src/RESearch.cxx
index f1fda7304..00e183060 100644
--- a/src/RESearch.cxx
+++ b/src/RESearch.cxx
@@ -4,7 +4,7 @@
  **/
 
 /*
- * regex - Regular expression pattern matching  and replacement
+ * regex - Regular expression pattern matching and replacement
  *
  * By:  Ozan S. Yigit (oz)
  *      Dept. of Computer Science
@@ -15,7 +15,7 @@
  * Removed all use of register.
  * Converted to modern function prototypes.
  * Put all global/static variables into an object so this code can be
- * used from multiple threads etc.
+ * used from multiple threads, etc.
  *
  * These routines are the PUBLIC DOMAIN equivalents of regex
  * routines as found in 4.nBSD UN*X, with minor extensions.
@@ -30,64 +30,62 @@
  * Modification history removed.
  *
  * Interfaces:
- *      RESearch::Compile:        compile a regular expression into a NFA.
+ *  RESearch::Compile:      compile a regular expression into a NFA.
  *
- *			char *RESearch::Compile(s)
- *			char *s;
+ *          const char *RESearch::Compile(const char *pat, int length,
+ *                                        bool caseSensitive, bool posix)
  *
- *      RESearch::Execute:        execute the NFA to match a pattern.
+ * Returns a short error string if they fail.
  *
- *			int RESearch::Execute(s)
- *			char *s;
+ *  RESearch::Execute:      execute the NFA to match a pattern.
  *
- *	RESearch::ModifyWord		change RESearch::Execute's understanding of what a "word"
- *			looks like (for \< and \>) by adding into the
- *			hidden word-syntax table.
+ *          int RESearch::Execute(characterIndexer &ci, int lp, int endp)
  *
- *			void RESearch::ModifyWord(s)
- *			char *s;
+ *  RESearch::Substitute:   substitute the matched portions in a new string.
  *
- *      RESearch::Substitute:	substitute the matched portions in a new string.
+ *          int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst)
  *
- *			int RESearch::Substitute(src, dst)
- *			char *src;
- *			char *dst;
+ *  re_fail:                failure routine for RESearch::Execute. (no longer used)
  *
- *	re_fail:	failure routine for RESearch::Execute.
- *
- *			void re_fail(msg, op)
- *			char *msg;
- *			char op;
+ *          void re_fail(char *msg, char op)
  *
  * Regular Expressions:
  *
  *      [1]     char    matches itself, unless it is a special
  *                      character (metachar): . \ [ ] * + ^ $
+ *                      and ( ) if posix option.
  *
  *      [2]     .       matches any character.
  *
- *      [3]     \       matches the character following it, except
- *			when followed by a left or right round bracket,
- *			a digit 1 to 9 or a left or right angle bracket.
- *			(see [7], [8] and [9])
- *			It is used as an escape character for all
- *			other meta-characters, and itself. When used
- *			in a set ([4]), it is treated as an ordinary
- *			character.
+ *      [3]     \       matches the character following it, except:
+ *                      - \a, \b, \f, \n, \t, \v match the
+ *                      corresponding C escape char;
+ *                      - if not in posix mode, when followed by a
+ *                      left or right round bracket (see [7]);
+ *                      - when followed by a digit 1 to 9 (see [8]);
+ *                      - when followed by a left or right angle bracket
+ *                      (see [9]).
+ *                      It is used as an escape character for all
+ *                      other meta-characters, and itself. When used
+ *                      in a set ([4]), it is treated as an ordinary
+ *                      character (except for escape chars).
  *
  *      [4]     [set]   matches one of the characters in the set.
  *                      If the first character in the set is "^",
  *                      it matches a character NOT in the set, i.e.
- *			complements the set. A shorthand S-E is
- *			used to specify a set of characters S upto
- *			E, inclusive. The special characters "]" and
- *			"-" have no special meaning if they appear
- *			as the first chars in the set.
+ *                      complements the set. A shorthand S-E (start-end)
+ *                      is used to specify a set of characters S upto
+ *                      E, inclusive. The special characters "]" and
+ *                      "-" have no special meaning if they appear
+ *                      as the first chars in the set. To include both,
+ *                      put - first: [-]A-Z]:
+ *                      [-]|] matches these 2 chars,
+ *                      []-|] matches from ] to | chars.
  *                      examples:        match:
  *
  *                              [a-z]    any lowercase alpha
  *
- *                              [^]-]    any char except ] and -
+ *                              [^-]]    any char except - and ]
  *
  *                              [^A-Z]   any char except uppercase
  *                                       alpha
@@ -101,77 +99,77 @@
  *      [6]     +       same as [5], except it matches one or more.
  *
  *      [7]             a regular expression in the form [1] to [10], enclosed
- *                      as \(form\) matches what form matches. The enclosure
- *                      creates a set of tags, used for [8] and for
- *                      pattern substution. The tagged forms are numbered
- *			starting from 1.
+ *                      as \(form\) (or (form) with posix flag) matches what
+ *                      form matches. The enclosure creates a set of tags,
+ *                      used for [8] and for pattern substitution.
+ *                      The tagged forms are numbered starting from 1.
  *
  *      [8]             a \ followed by a digit 1 to 9 matches whatever a
  *                      previously tagged regular expression ([7]) matched.
  *
- *	[9]	\<	a regular expression starting with a \< construct
- *		\>	and/or ending with a \> construct, restricts the
- *			pattern matching to the beginning of a word, and/or
- *			the end of a word. A word is defined to be a character
- *			string beginning and/or ending with the characters
- *			A-Z a-z 0-9 and _. It must also be preceded and/or
- *			followed by any character outside those mentioned.
+ *      [9]     \<      a regular expression starting with a \< construct
+ *              \>      and/or ending with a \> construct, restricts the
+ *                      pattern matching to the beginning of a word, and/or
+ *                      the end of a word. A word is defined to be a character
+ *                      string beginning and/or ending with the characters
+ *                      A-Z a-z 0-9 and _. It must also be preceded and/or
+ *                      followed by any character outside those mentioned.
  *
  *      [10]            a composite regular expression xy where x and y
  *                      are in the form [1] to [10] matches the longest
  *                      match of x followed by a match for y.
  *
- *      [11]	^	a regular expression starting with a ^ character
- *		$	and/or ending with a $ character, restricts the
+ *      [11]    ^       a regular expression starting with a ^ character
+ *              $       and/or ending with a $ character, restricts the
  *                      pattern matching to the beginning of the line,
  *                      or the end of line. [anchors] Elsewhere in the
- *			pattern, ^ and $ are treated as ordinary characters.
+ *                      pattern, ^ and $ are treated as ordinary characters.
  *
  *
  * Acknowledgements:
  *
- *	HCR's Hugh Redelmeier has been most helpful in various
- *	stages of development. He convinced me to include BOW
- *	and EOW constructs, originally invented by Rob Pike at
- *	the University of Toronto.
+ *  HCR's Hugh Redelmeier has been most helpful in various
+ *  stages of development. He convinced me to include BOW
+ *  and EOW constructs, originally invented by Rob Pike at
+ *  the University of Toronto.
  *
  * References:
- *              Software tools			Kernighan & Plauger
+ *              Software tools                  Kernighan & Plauger
  *              Software tools in Pascal        Kernighan & Plauger
  *              Grep [rsx-11 C dist]            David Conroy
- *		ed - text editor		Un*x Programmer's Manual
- *		Advanced editing on Un*x	B. W. Kernighan
- *		RegExp routines			Henry Spencer
+ *              ed - text editor                Un*x Programmer's Manual
+ *              Advanced editing on Un*x        B. W. Kernighan
+ *              RegExp routines                 Henry Spencer
  *
  * Notes:
  *
  *	This implementation uses a bit-set representation for character
  *	classes for speed and compactness. Each character is represented
- *	by one bit in a 128-bit block. Thus, CCL always takes a
- *	constant 16 bytes in the internal nfa, and RESearch::Execute does a single
+ *	by one bit in a 256-bit block. Thus, CCL always takes a
+ *	constant 32 bytes in the internal nfa, and RESearch::Execute does a single
  *	bit comparison to locate the character in the set.
  *
  * Examples:
  *
- *	pattern:	foo*.*
- *	compile:	CHR f CHR o CLO CHR o END CLO ANY END END
- *	matches:	fo foo fooo foobar fobar foxx ...
+ *  pattern:    foo*.*
+ *  compile:    CHR f CHR o CLO CHR o END CLO ANY END END
+ *  matches:    fo foo fooo foobar fobar foxx ...
  *
- *	pattern:	fo[ob]a[rz]
- *	compile:	CHR f CHR o CCL bitset CHR a CCL bitset END
- *	matches:	fobar fooar fobaz fooaz
+ *  pattern:    fo[ob]a[rz]
+ *  compile:    CHR f CHR o CCL bitset CHR a CCL bitset END
+ *  matches:    fobar fooar fobaz fooaz
  *
- *	pattern:	foo\\+
- *	compile:	CHR f CHR o CHR o CHR \ CLO CHR \ END END
- *	matches:	foo\ foo\\ foo\\\  ...
+ *  pattern:    foo\\+
+ *  compile:    CHR f CHR o CHR o CHR \ CLO CHR \ END END
+ *  matches:    foo\ foo\\ foo\\\  ...
  *
- *	pattern:	\(foo\)[1-3]\1	(same as foo[1-3]foo)
- *	compile:	BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
- *	matches:	foo1foo foo2foo foo3foo
+ *  pattern:    \(foo\)[1-3]\1  (same as foo[1-3]foo)
+ *  compile:    BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
+ *  matches:    foo1foo foo2foo foo3foo
  *
- *	pattern:	\(fo.*\)-\1
- *	compile:	BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
- *	matches:	foo-foo fo-fo fob-fob foobar-foobar ...
+ *  pattern:    \(fo.*\)-\1
+ *  compile:    BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
+ *  matches:    foo-foo fo-fo fob-fob foobar-foobar ...
  */
 
 #include "CharClassify.h"
@@ -192,8 +190,8 @@
 #define EOL     5
 #define BOT     6
 #define EOT     7
-#define BOW	8
-#define EOW	9
+#define BOW     8
+#define EOW     9
 #define REF     10
 #define CLO     11
 
@@ -203,17 +201,15 @@
  * The following defines are not meant to be changeable.
  * They are for readability only.
  */
-#define BLKIND	0370
-#define BITIND	07
-
-#define ASCIIB	0177
+#define BLKIND  0370
+#define BITIND  07
 
 const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
 
 #define badpat(x)	(*nfa = END, x)
 
 /*
- * character classification table for word boundary operators BOW
+ * Character classification table for word boundary operators BOW
  * and EOW is passed in by the creator of this object (Scintilla
  * Document). The Document default state is that word chars are:
  * 0-9,a-z, A-Z and _
@@ -229,7 +225,7 @@ RESearch::~RESearch() {
 }
 
 void RESearch::Init() {
-	sta = NOP;               	/* status of lastpat */
+	sta = NOP;                  /* status of lastpat */
 	bol = 0;
 	for (int i=0; i<MAXTAG; i++)
 		pat[i] = 0;
@@ -299,15 +295,15 @@ const char escapeValue(char ch) {
 
 const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {
 	char *mp=nfa;          /* nfa pointer       */
-	char *lp;              /* saved pointer..   */
-	char *sp=nfa;          /* another one..     */
+	char *lp;              /* saved pointer     */
+	char *sp=nfa;          /* another one       */
 	char *mpMax = mp + MAXNFA - BITBLK - 10;
 
 	int tagi = 0;          /* tag stack index   */
 	int tagc = 1;          /* actual tag count  */
 
 	int n;
-	char mask;		/* xor mask -CCL/NCL */
+	char mask;             /* xor mask -CCL/NCL */
 	int c1, c2;
 
 	if (!pat || !length)
@@ -317,18 +313,18 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
 			return badpat("No previous regular expression");
 	sta = NOP;
 
-	const char *p=pat;               /* pattern pointer   */
+	const char *p=pat;     /* pattern pointer   */
 	for (int i=0; i<length; i++, p++) {
 		if (mp > mpMax)
 			return badpat("Pattern too long");
 		lp = mp;
 		switch(*p) {
 
-		case '.':               /* match any char..  */
+		case '.':               /* match any char  */
 			*mp++ = ANY;
 			break;
 
-		case '^':               /* match beginning.. */
+		case '^':               /* match beginning */
 			if (p == pat)
 				*mp++ = BOL;
 			else {
@@ -337,7 +333,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
 			}
 			break;
 
-		case '$':               /* match endofline.. */
+		case '$':               /* match endofline */
 			if (!*(p+1))
 				*mp++ = EOL;
 			else {
@@ -346,7 +342,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
 			}
 			break;
 
-		case '[':               /* match char class..*/
+		case '[':               /* match char class */
 			*mp++ = CCL;
 
 			i++;
@@ -357,7 +353,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
 			} else
 				mask = 0;
 
-			if (*p == '-') {		/* real dash */
+			if (*p == '-') {	/* real dash */
 				i++;
 				ChSet(*p++);
 			}
@@ -398,12 +394,12 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
 
 			break;
 
-		case '*':               /* match 0 or more.. */
-		case '+':               /* match 1 or more.. */
+		case '*':               /* match 0 or more... */
+		case '+':               /* match 1 or more... */
 			if (p == pat)
 				return badpat("Empty closure");
 			lp = sp;		/* previous opcode */
-			if (*lp == CLO)		/* equivalence..   */
+			if (*lp == CLO)		/* equivalence... */
 				break;
 			switch(*lp) {
 
@@ -431,7 +427,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
 			mp = sp;
 			break;
 
-		case '\\':              /* tags, backrefs .. */
+		case '\\':              /* tags, backrefs... */
 			i++;
 			switch(*++p) {
 
@@ -497,7 +493,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
 			}
 			break;
 
-		default :               /* an ordinary char  */
+		default :               /* an ordinary char */
 			if (posix && *p == '(') {
 				if (tagc < MAXTAG) {
 					tagstk[++tagi] = tagc;
@@ -538,23 +534,23 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
 
 /*
  * RESearch::Execute:
- * 	execute nfa to find a match.
- *
- *	special cases: (nfa[0])
- *		BOL
- *			Match only once, starting from the
- *			beginning.
- *		CHR
- *			First locate the character without
- *			calling PMatch, and if found, call
- *			PMatch for the remaining string.
- *		END
- *			RESearch::Compile failed, poor luser did not
- *			check for it. Fail fast.
- *
- *	If a match is found, bopat[0] and eopat[0] are set
- *	to the beginning and the end of the matched fragment,
- *	respectively.
+ *   execute nfa to find a match.
+ *
+ *  special cases: (nfa[0])
+ *      BOL
+ *          Match only once, starting from the
+ *          beginning.
+ *      CHR
+ *          First locate the character without
+ *          calling PMatch, and if found, call
+ *          PMatch for the remaining string.
+ *      END
+ *          RESearch::Compile failed, poor luser did not
+ *          check for it. Fail fast.
+ *
+ *  If a match is found, bopat[0] and eopat[0] are set
+ *  to the beginning and the end of the matched fragment,
+ *  respectively.
  *
  */
 
@@ -585,7 +581,7 @@ int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
 		c = *(ap+1);
 		while ((lp < endp) && (ci.CharAt(lp) != c))
 			lp++;
-		if (lp >= endp)		/* if EOS, fail, else fall thru. */
+		if (lp >= endp)	/* if EOS, fail, else fall thru. */
 			return 0;
 	default:			/* regular matching all the way. */
 		while (lp < endp) {
@@ -609,51 +605,50 @@ int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
 /*
  * PMatch: internal routine for the hard part
  *
- * 	This code is partly snarfed from an early grep written by
- *	David Conroy. The backref and tag stuff, and various other
- *	innovations are by oz.
- *
- *	special case optimizations: (nfa[n], nfa[n+1])
- *		CLO ANY
- *			We KNOW .* will match everything upto the
- *			end of line. Thus, directly go to the end of
- *			line, without recursive PMatch calls. As in
- *			the other closure cases, the remaining pattern
- *			must be matched by moving backwards on the
- *			string recursively, to find a match for xy
- *			(x is ".*" and y is the remaining pattern)
- *			where the match satisfies the LONGEST match for
- *			x followed by a match for y.
- *		CLO CHR
- *			We can again scan the string forward for the
- *			single char and at the point of failure, we
- *			execute the remaining nfa recursively, same as
- *			above.
- *
- *	At the end of a successful match, bopat[n] and eopat[n]
- *	are set to the beginning and end of subpatterns matched
- *	by tagged expressions (n = 1 to 9).
- *
+ *  This code is partly snarfed from an early grep written by
+ *  David Conroy. The backref and tag stuff, and various other
+ *  innovations are by oz.
+ *
+ *  special case optimizations: (nfa[n], nfa[n+1])
+ *      CLO ANY
+ *          We KNOW .* will match everything upto the
+ *          end of line. Thus, directly go to the end of
+ *          line, without recursive PMatch calls. As in
+ *          the other closure cases, the remaining pattern
+ *          must be matched by moving backwards on the
+ *          string recursively, to find a match for xy
+ *          (x is ".*" and y is the remaining pattern)
+ *          where the match satisfies the LONGEST match for
+ *          x followed by a match for y.
+ *      CLO CHR
+ *          We can again scan the string forward for the
+ *          single char and at the point of failure, we
+ *          execute the remaining nfa recursively, same as
+ *          above.
+ *
+ *  At the end of a successful match, bopat[n] and eopat[n]
+ *  are set to the beginning and end of subpatterns matched
+ *  by tagged expressions (n = 1 to 9).
  */
 
 extern void re_fail(char *,char);
 
-#define isinset(x,y) 	((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
+#define isinset(x,y)	((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
 
 /*
  * skip values for CLO XXX to skip past the closure
  */
 
-#define ANYSKIP	2 	/* [CLO] ANY END ...	     */
-#define CHRSKIP	3	/* [CLO] CHR chr END ...     */
-#define CCLSKIP 34	/* [CLO] CCL 32bytes END ... */
+#define ANYSKIP 2 	/* [CLO] ANY END          */
+#define CHRSKIP 3	/* [CLO] CHR chr END      */
+#define CCLSKIP 34	/* [CLO] CCL 32 bytes END */
 
 int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
 	int op, c, n;
-	int e;		/* extra pointer for CLO */
-	int bp;		/* beginning of subpat.. */
-	int ep;		/* ending of subpat..	 */
-	int are;			/* to save the line ptr. */
+	int e;		/* extra pointer for CLO  */
+	int bp;		/* beginning of subpat... */
+	int ep;		/* ending of subpat...    */
+	int are;	/* to save the line ptr.  */
 
 	while ((op = *ap++) != END)
 		switch(op) {
@@ -745,13 +740,13 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
 
 /*
  * RESearch::Substitute:
- *	substitute the matched portions of the src in dst.
+ *  substitute the matched portions of the src in dst.
  *
- *	&	substitute the entire matched pattern.
+ *  &    substitute the entire matched pattern.
  *
- *	\digit	substitute a subpattern, with the given	tag number.
- *		Tags are numbered from 1 to 9. If the particular
- *		tagged subpattern does not exist, null is substituted.
+ *  \digit  substitute a subpattern, with the given tag number.
+ *      Tags are numbered from 1 to 9. If the particular
+ *      tagged subpattern does not exist, null is substituted.
  */
 int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
 	char c;
diff --git a/src/RESearch.h b/src/RESearch.h
index 25205951f..aa8557918 100644
--- a/src/RESearch.h
+++ b/src/RESearch.h
@@ -29,11 +29,7 @@ class RESearch {
 public:
 	RESearch(CharClassify *charClassTable);
 	~RESearch();
-	void Init();
-	void Clear();
 	bool GrabMatches(CharacterIndexer &ci);
-	void ChSet(char c);
-	void ChSetWithCase(char c, bool caseSensitive);
 	const char *Compile(const char *pat, int length, bool caseSensitive, bool posix);
 	int Execute(CharacterIndexer &ci, int lp, int endp);
 	int Substitute(CharacterIndexer &ci, char *src, char *dst);
@@ -47,14 +43,18 @@ public:
 	char *pat[MAXTAG];
 
 private:
+	void Init();
+	void Clear();
+	void ChSet(char c);
+	void ChSetWithCase(char c, bool caseSensitive);
+
 	int PMatch(CharacterIndexer &ci, int lp, int endp, char *ap);
 
 	int bol;
-	int  tagstk[MAXTAG];             /* subpat tag stack..*/
-	char nfa[MAXNFA];		/* automaton..       */
+	int  tagstk[MAXTAG]; /* subpat tag stack */
+	char nfa[MAXNFA];    /* automaton */
 	int sta;
-	char bittab[BITBLK];		/* bit table for CCL */
-						/* pre-set bits...   */
+	char bittab[BITBLK]; /* bit table for CCL pre-set bits */
 	int failure;
 	CharClassify *charClass;
 	bool iswordc(unsigned char x) {