diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/RESearch.cxx | 301 | ||||
| -rw-r--r-- | src/RESearch.h | 16 | 
2 files changed, 156 insertions, 161 deletions
| diff --git a/src/RESearch.cxx b/src/RESearch.cxx index f1fda7304..00e183060 100644 --- a/src/RESearch.cxx +++ b/src/RESearch.cxx @@ -4,7 +4,7 @@   **/  /* - * regex - Regular expression pattern matching  and replacement + * regex - Regular expression pattern matching and replacement   *   * By:  Ozan S. Yigit (oz)   *      Dept. of Computer Science @@ -15,7 +15,7 @@   * Removed all use of register.   * Converted to modern function prototypes.   * Put all global/static variables into an object so this code can be - * used from multiple threads etc. + * used from multiple threads, etc.   *   * These routines are the PUBLIC DOMAIN equivalents of regex   * routines as found in 4.nBSD UN*X, with minor extensions. @@ -30,64 +30,62 @@   * Modification history removed.   *   * Interfaces: - *      RESearch::Compile:        compile a regular expression into a NFA. + *  RESearch::Compile:      compile a regular expression into a NFA.   * - *			char *RESearch::Compile(s) - *			char *s; + *          const char *RESearch::Compile(const char *pat, int length, + *                                        bool caseSensitive, bool posix)   * - *      RESearch::Execute:        execute the NFA to match a pattern. + * Returns a short error string if they fail.   * - *			int RESearch::Execute(s) - *			char *s; + *  RESearch::Execute:      execute the NFA to match a pattern.   * - *	RESearch::ModifyWord		change RESearch::Execute's understanding of what a "word" - *			looks like (for \< and \>) by adding into the - *			hidden word-syntax table. + *          int RESearch::Execute(characterIndexer &ci, int lp, int endp)   * - *			void RESearch::ModifyWord(s) - *			char *s; + *  RESearch::Substitute:   substitute the matched portions in a new string.   * - *      RESearch::Substitute:	substitute the matched portions in a new string. + *          int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst)   * - *			int RESearch::Substitute(src, dst) - *			char *src; - *			char *dst; + *  re_fail:                failure routine for RESearch::Execute. (no longer used)   * - *	re_fail:	failure routine for RESearch::Execute. - * - *			void re_fail(msg, op) - *			char *msg; - *			char op; + *          void re_fail(char *msg, char op)   *   * Regular Expressions:   *   *      [1]     char    matches itself, unless it is a special   *                      character (metachar): . \ [ ] * + ^ $ + *                      and ( ) if posix option.   *   *      [2]     .       matches any character.   * - *      [3]     \       matches the character following it, except - *			when followed by a left or right round bracket, - *			a digit 1 to 9 or a left or right angle bracket. - *			(see [7], [8] and [9]) - *			It is used as an escape character for all - *			other meta-characters, and itself. When used - *			in a set ([4]), it is treated as an ordinary - *			character. + *      [3]     \       matches the character following it, except: + *                      - \a, \b, \f, \n, \t, \v match the + *                      corresponding C escape char; + *                      - if not in posix mode, when followed by a + *                      left or right round bracket (see [7]); + *                      - when followed by a digit 1 to 9 (see [8]); + *                      - when followed by a left or right angle bracket + *                      (see [9]). + *                      It is used as an escape character for all + *                      other meta-characters, and itself. When used + *                      in a set ([4]), it is treated as an ordinary + *                      character (except for escape chars).   *   *      [4]     [set]   matches one of the characters in the set.   *                      If the first character in the set is "^",   *                      it matches a character NOT in the set, i.e. - *			complements the set. A shorthand S-E is - *			used to specify a set of characters S upto - *			E, inclusive. The special characters "]" and - *			"-" have no special meaning if they appear - *			as the first chars in the set. + *                      complements the set. A shorthand S-E (start-end) + *                      is used to specify a set of characters S upto + *                      E, inclusive. The special characters "]" and + *                      "-" have no special meaning if they appear + *                      as the first chars in the set. To include both, + *                      put - first: [-]A-Z]: + *                      [-]|] matches these 2 chars, + *                      []-|] matches from ] to | chars.   *                      examples:        match:   *   *                              [a-z]    any lowercase alpha   * - *                              [^]-]    any char except ] and - + *                              [^-]]    any char except - and ]   *   *                              [^A-Z]   any char except uppercase   *                                       alpha @@ -101,77 +99,77 @@   *      [6]     +       same as [5], except it matches one or more.   *   *      [7]             a regular expression in the form [1] to [10], enclosed - *                      as \(form\) matches what form matches. The enclosure - *                      creates a set of tags, used for [8] and for - *                      pattern substution. The tagged forms are numbered - *			starting from 1. + *                      as \(form\) (or (form) with posix flag) matches what + *                      form matches. The enclosure creates a set of tags, + *                      used for [8] and for pattern substitution. + *                      The tagged forms are numbered starting from 1.   *   *      [8]             a \ followed by a digit 1 to 9 matches whatever a   *                      previously tagged regular expression ([7]) matched.   * - *	[9]	\<	a regular expression starting with a \< construct - *		\>	and/or ending with a \> construct, restricts the - *			pattern matching to the beginning of a word, and/or - *			the end of a word. A word is defined to be a character - *			string beginning and/or ending with the characters - *			A-Z a-z 0-9 and _. It must also be preceded and/or - *			followed by any character outside those mentioned. + *      [9]     \<      a regular expression starting with a \< construct + *              \>      and/or ending with a \> construct, restricts the + *                      pattern matching to the beginning of a word, and/or + *                      the end of a word. A word is defined to be a character + *                      string beginning and/or ending with the characters + *                      A-Z a-z 0-9 and _. It must also be preceded and/or + *                      followed by any character outside those mentioned.   *   *      [10]            a composite regular expression xy where x and y   *                      are in the form [1] to [10] matches the longest   *                      match of x followed by a match for y.   * - *      [11]	^	a regular expression starting with a ^ character - *		$	and/or ending with a $ character, restricts the + *      [11]    ^       a regular expression starting with a ^ character + *              $       and/or ending with a $ character, restricts the   *                      pattern matching to the beginning of the line,   *                      or the end of line. [anchors] Elsewhere in the - *			pattern, ^ and $ are treated as ordinary characters. + *                      pattern, ^ and $ are treated as ordinary characters.   *   *   * Acknowledgements:   * - *	HCR's Hugh Redelmeier has been most helpful in various - *	stages of development. He convinced me to include BOW - *	and EOW constructs, originally invented by Rob Pike at - *	the University of Toronto. + *  HCR's Hugh Redelmeier has been most helpful in various + *  stages of development. He convinced me to include BOW + *  and EOW constructs, originally invented by Rob Pike at + *  the University of Toronto.   *   * References: - *              Software tools			Kernighan & Plauger + *              Software tools                  Kernighan & Plauger   *              Software tools in Pascal        Kernighan & Plauger   *              Grep [rsx-11 C dist]            David Conroy - *		ed - text editor		Un*x Programmer's Manual - *		Advanced editing on Un*x	B. W. Kernighan - *		RegExp routines			Henry Spencer + *              ed - text editor                Un*x Programmer's Manual + *              Advanced editing on Un*x        B. W. Kernighan + *              RegExp routines                 Henry Spencer   *   * Notes:   *   *	This implementation uses a bit-set representation for character   *	classes for speed and compactness. Each character is represented - *	by one bit in a 128-bit block. Thus, CCL always takes a - *	constant 16 bytes in the internal nfa, and RESearch::Execute does a single + *	by one bit in a 256-bit block. Thus, CCL always takes a + *	constant 32 bytes in the internal nfa, and RESearch::Execute does a single   *	bit comparison to locate the character in the set.   *   * Examples:   * - *	pattern:	foo*.* - *	compile:	CHR f CHR o CLO CHR o END CLO ANY END END - *	matches:	fo foo fooo foobar fobar foxx ... + *  pattern:    foo*.* + *  compile:    CHR f CHR o CLO CHR o END CLO ANY END END + *  matches:    fo foo fooo foobar fobar foxx ...   * - *	pattern:	fo[ob]a[rz] - *	compile:	CHR f CHR o CCL bitset CHR a CCL bitset END - *	matches:	fobar fooar fobaz fooaz + *  pattern:    fo[ob]a[rz] + *  compile:    CHR f CHR o CCL bitset CHR a CCL bitset END + *  matches:    fobar fooar fobaz fooaz   * - *	pattern:	foo\\+ - *	compile:	CHR f CHR o CHR o CHR \ CLO CHR \ END END - *	matches:	foo\ foo\\ foo\\\  ... + *  pattern:    foo\\+ + *  compile:    CHR f CHR o CHR o CHR \ CLO CHR \ END END + *  matches:    foo\ foo\\ foo\\\  ...   * - *	pattern:	\(foo\)[1-3]\1	(same as foo[1-3]foo) - *	compile:	BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END - *	matches:	foo1foo foo2foo foo3foo + *  pattern:    \(foo\)[1-3]\1  (same as foo[1-3]foo) + *  compile:    BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END + *  matches:    foo1foo foo2foo foo3foo   * - *	pattern:	\(fo.*\)-\1 - *	compile:	BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END - *	matches:	foo-foo fo-fo fob-fob foobar-foobar ... + *  pattern:    \(fo.*\)-\1 + *  compile:    BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END + *  matches:    foo-foo fo-fo fob-fob foobar-foobar ...   */  #include "CharClassify.h" @@ -192,8 +190,8 @@  #define EOL     5  #define BOT     6  #define EOT     7 -#define BOW	8 -#define EOW	9 +#define BOW     8 +#define EOW     9  #define REF     10  #define CLO     11 @@ -203,17 +201,15 @@   * The following defines are not meant to be changeable.   * They are for readability only.   */ -#define BLKIND	0370 -#define BITIND	07 - -#define ASCIIB	0177 +#define BLKIND  0370 +#define BITIND  07  const char bitarr[] = {1,2,4,8,16,32,64,'\200'};  #define badpat(x)	(*nfa = END, x)  /* - * character classification table for word boundary operators BOW + * Character classification table for word boundary operators BOW   * and EOW is passed in by the creator of this object (Scintilla   * Document). The Document default state is that word chars are:   * 0-9,a-z, A-Z and _ @@ -229,7 +225,7 @@ RESearch::~RESearch() {  }  void RESearch::Init() { -	sta = NOP;               	/* status of lastpat */ +	sta = NOP;                  /* status of lastpat */  	bol = 0;  	for (int i=0; i<MAXTAG; i++)  		pat[i] = 0; @@ -299,15 +295,15 @@ const char escapeValue(char ch) {  const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {  	char *mp=nfa;          /* nfa pointer       */ -	char *lp;              /* saved pointer..   */ -	char *sp=nfa;          /* another one..     */ +	char *lp;              /* saved pointer     */ +	char *sp=nfa;          /* another one       */  	char *mpMax = mp + MAXNFA - BITBLK - 10;  	int tagi = 0;          /* tag stack index   */  	int tagc = 1;          /* actual tag count  */  	int n; -	char mask;		/* xor mask -CCL/NCL */ +	char mask;             /* xor mask -CCL/NCL */  	int c1, c2;  	if (!pat || !length) @@ -317,18 +313,18 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b  			return badpat("No previous regular expression");  	sta = NOP; -	const char *p=pat;               /* pattern pointer   */ +	const char *p=pat;     /* pattern pointer   */  	for (int i=0; i<length; i++, p++) {  		if (mp > mpMax)  			return badpat("Pattern too long");  		lp = mp;  		switch(*p) { -		case '.':               /* match any char..  */ +		case '.':               /* match any char  */  			*mp++ = ANY;  			break; -		case '^':               /* match beginning.. */ +		case '^':               /* match beginning */  			if (p == pat)  				*mp++ = BOL;  			else { @@ -337,7 +333,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b  			}  			break; -		case '$':               /* match endofline.. */ +		case '$':               /* match endofline */  			if (!*(p+1))  				*mp++ = EOL;  			else { @@ -346,7 +342,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b  			}  			break; -		case '[':               /* match char class..*/ +		case '[':               /* match char class */  			*mp++ = CCL;  			i++; @@ -357,7 +353,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b  			} else  				mask = 0; -			if (*p == '-') {		/* real dash */ +			if (*p == '-') {	/* real dash */  				i++;  				ChSet(*p++);  			} @@ -398,12 +394,12 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b  			break; -		case '*':               /* match 0 or more.. */ -		case '+':               /* match 1 or more.. */ +		case '*':               /* match 0 or more... */ +		case '+':               /* match 1 or more... */  			if (p == pat)  				return badpat("Empty closure");  			lp = sp;		/* previous opcode */ -			if (*lp == CLO)		/* equivalence..   */ +			if (*lp == CLO)		/* equivalence... */  				break;  			switch(*lp) { @@ -431,7 +427,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b  			mp = sp;  			break; -		case '\\':              /* tags, backrefs .. */ +		case '\\':              /* tags, backrefs... */  			i++;  			switch(*++p) { @@ -497,7 +493,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b  			}  			break; -		default :               /* an ordinary char  */ +		default :               /* an ordinary char */  			if (posix && *p == '(') {  				if (tagc < MAXTAG) {  					tagstk[++tagi] = tagc; @@ -538,23 +534,23 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b  /*   * RESearch::Execute: - * 	execute nfa to find a match. - * - *	special cases: (nfa[0]) - *		BOL - *			Match only once, starting from the - *			beginning. - *		CHR - *			First locate the character without - *			calling PMatch, and if found, call - *			PMatch for the remaining string. - *		END - *			RESearch::Compile failed, poor luser did not - *			check for it. Fail fast. - * - *	If a match is found, bopat[0] and eopat[0] are set - *	to the beginning and the end of the matched fragment, - *	respectively. + *   execute nfa to find a match. + * + *  special cases: (nfa[0]) + *      BOL + *          Match only once, starting from the + *          beginning. + *      CHR + *          First locate the character without + *          calling PMatch, and if found, call + *          PMatch for the remaining string. + *      END + *          RESearch::Compile failed, poor luser did not + *          check for it. Fail fast. + * + *  If a match is found, bopat[0] and eopat[0] are set + *  to the beginning and the end of the matched fragment, + *  respectively.   *   */ @@ -585,7 +581,7 @@ int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {  		c = *(ap+1);  		while ((lp < endp) && (ci.CharAt(lp) != c))  			lp++; -		if (lp >= endp)		/* if EOS, fail, else fall thru. */ +		if (lp >= endp)	/* if EOS, fail, else fall thru. */  			return 0;  	default:			/* regular matching all the way. */  		while (lp < endp) { @@ -609,51 +605,50 @@ int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {  /*   * PMatch: internal routine for the hard part   * - * 	This code is partly snarfed from an early grep written by - *	David Conroy. The backref and tag stuff, and various other - *	innovations are by oz. - * - *	special case optimizations: (nfa[n], nfa[n+1]) - *		CLO ANY - *			We KNOW .* will match everything upto the - *			end of line. Thus, directly go to the end of - *			line, without recursive PMatch calls. As in - *			the other closure cases, the remaining pattern - *			must be matched by moving backwards on the - *			string recursively, to find a match for xy - *			(x is ".*" and y is the remaining pattern) - *			where the match satisfies the LONGEST match for - *			x followed by a match for y. - *		CLO CHR - *			We can again scan the string forward for the - *			single char and at the point of failure, we - *			execute the remaining nfa recursively, same as - *			above. - * - *	At the end of a successful match, bopat[n] and eopat[n] - *	are set to the beginning and end of subpatterns matched - *	by tagged expressions (n = 1 to 9). - * + *  This code is partly snarfed from an early grep written by + *  David Conroy. The backref and tag stuff, and various other + *  innovations are by oz. + * + *  special case optimizations: (nfa[n], nfa[n+1]) + *      CLO ANY + *          We KNOW .* will match everything upto the + *          end of line. Thus, directly go to the end of + *          line, without recursive PMatch calls. As in + *          the other closure cases, the remaining pattern + *          must be matched by moving backwards on the + *          string recursively, to find a match for xy + *          (x is ".*" and y is the remaining pattern) + *          where the match satisfies the LONGEST match for + *          x followed by a match for y. + *      CLO CHR + *          We can again scan the string forward for the + *          single char and at the point of failure, we + *          execute the remaining nfa recursively, same as + *          above. + * + *  At the end of a successful match, bopat[n] and eopat[n] + *  are set to the beginning and end of subpatterns matched + *  by tagged expressions (n = 1 to 9).   */  extern void re_fail(char *,char); -#define isinset(x,y) 	((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND]) +#define isinset(x,y)	((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])  /*   * skip values for CLO XXX to skip past the closure   */ -#define ANYSKIP	2 	/* [CLO] ANY END ...	     */ -#define CHRSKIP	3	/* [CLO] CHR chr END ...     */ -#define CCLSKIP 34	/* [CLO] CCL 32bytes END ... */ +#define ANYSKIP 2 	/* [CLO] ANY END          */ +#define CHRSKIP 3	/* [CLO] CHR chr END      */ +#define CCLSKIP 34	/* [CLO] CCL 32 bytes END */  int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {  	int op, c, n; -	int e;		/* extra pointer for CLO */ -	int bp;		/* beginning of subpat.. */ -	int ep;		/* ending of subpat..	 */ -	int are;			/* to save the line ptr. */ +	int e;		/* extra pointer for CLO  */ +	int bp;		/* beginning of subpat... */ +	int ep;		/* ending of subpat...    */ +	int are;	/* to save the line ptr.  */  	while ((op = *ap++) != END)  		switch(op) { @@ -745,13 +740,13 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {  /*   * RESearch::Substitute: - *	substitute the matched portions of the src in dst. + *  substitute the matched portions of the src in dst.   * - *	&	substitute the entire matched pattern. + *  &    substitute the entire matched pattern.   * - *	\digit	substitute a subpattern, with the given	tag number. - *		Tags are numbered from 1 to 9. If the particular - *		tagged subpattern does not exist, null is substituted. + *  \digit  substitute a subpattern, with the given tag number. + *      Tags are numbered from 1 to 9. If the particular + *      tagged subpattern does not exist, null is substituted.   */  int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {  	char c; diff --git a/src/RESearch.h b/src/RESearch.h index 25205951f..aa8557918 100644 --- a/src/RESearch.h +++ b/src/RESearch.h @@ -29,11 +29,7 @@ class RESearch {  public:  	RESearch(CharClassify *charClassTable);  	~RESearch(); -	void Init(); -	void Clear();  	bool GrabMatches(CharacterIndexer &ci); -	void ChSet(char c); -	void ChSetWithCase(char c, bool caseSensitive);  	const char *Compile(const char *pat, int length, bool caseSensitive, bool posix);  	int Execute(CharacterIndexer &ci, int lp, int endp);  	int Substitute(CharacterIndexer &ci, char *src, char *dst); @@ -47,14 +43,18 @@ public:  	char *pat[MAXTAG];  private: +	void Init(); +	void Clear(); +	void ChSet(char c); +	void ChSetWithCase(char c, bool caseSensitive); +  	int PMatch(CharacterIndexer &ci, int lp, int endp, char *ap);  	int bol; -	int  tagstk[MAXTAG];             /* subpat tag stack..*/ -	char nfa[MAXNFA];		/* automaton..       */ +	int  tagstk[MAXTAG]; /* subpat tag stack */ +	char nfa[MAXNFA];    /* automaton */  	int sta; -	char bittab[BITBLK];		/* bit table for CCL */ -						/* pre-set bits...   */ +	char bittab[BITBLK]; /* bit table for CCL pre-set bits */  	int failure;  	CharClassify *charClass;  	bool iswordc(unsigned char x) { | 
