diff options
Diffstat (limited to 'src/RESearch.cxx')
| -rw-r--r-- | src/RESearch.cxx | 844 | 
1 files changed, 844 insertions, 0 deletions
| diff --git a/src/RESearch.cxx b/src/RESearch.cxx new file mode 100644 index 000000000..a9b0d264f --- /dev/null +++ b/src/RESearch.cxx @@ -0,0 +1,844 @@ +// Scintilla source code edit control +/** @file RESearch.cxx + ** Regular expression search library. + **/ + +/* + * regex - Regular expression pattern matching  and replacement + * + * By:  Ozan S. Yigit (oz) + *      Dept. of Computer Science + *      York University + * + * Translation to C++ by Neil Hodgson neilh@scintilla.org + * Removed all use of register. + * Converted to modern function prototypes. + * Put all global/static variables into an object so this code can be  + * used from multiple threads etc. + * + * These routines are the PUBLIC DOMAIN equivalents of regex + * routines as found in 4.nBSD UN*X, with minor extensions. + * + * These routines are derived from various implementations found + * in software tools books, and Conroy's grep. They are NOT derived + * from licensed/restricted software. + * For more interesting/academic/complicated implementations, + * see Henry Spencer's regexp routines, or GNU Emacs pattern + * matching module. + * + * Modification history: + * + * $Log$ + * Revision 1.1  2001/04/04 12:52:44  nyamatongwe + * Moved to public domain regular expresion implementation. + * + * Revision 1.4  1991/10/17  03:56:42  oz + * miscellaneous changes, small cleanups etc. + * + * Revision 1.3  1989/04/01  14:18:09  oz + * Change all references to a dfa: this is actually an nfa. + * + * Revision 1.2  88/08/28  15:36:04  oz + * Use a complement bitmap to represent NCL. + * This removes the need to have seperate  + * code in the PMatch case block - it is  + * just CCL code now. + *  + * Use the actual CCL code in the CLO + * section of PMatch. No need for a recursive + * PMatch call. + *  + * Use a bitmap table to set char bits in an + * 8-bit chunk. + *  + * Interfaces: + *      RESearch::Compile:        compile a regular expression into a NFA. + * + *			char *RESearch::Compile(s) + *			char *s; + * + *      RESearch::Execute:        execute the NFA to match a pattern. + * + *			int RESearch::Execute(s) + *			char *s; + * + *	RESearch::ModifyWord		change RESearch::Execute's understanding of what a "word" + *			looks like (for \< and \>) by adding into the + *			hidden word-syntax table. + * + *			void RESearch::ModifyWord(s) + *			char *s; + * + *      RESearch::Substitute:	substitute the matched portions in a new string. + * + *			int RESearch::Substitute(src, dst) + *			char *src; + *			char *dst; + * + *	re_fail:	failure routine for RESearch::Execute. + * + *			void re_fail(msg, op) + *			char *msg; + *			char op; + *   + * Regular Expressions: + * + *      [1]     char    matches itself, unless it is a special + *                      character (metachar): . \ [ ] * + ^ $ + * + *      [2]     .       matches any character. + * + *      [3]     \       matches the character following it, except + *			when followed by a left or right round bracket, + *			a digit 1 to 9 or a left or right angle bracket.  + *			(see [7], [8] and [9]) + *			It is used as an escape character for all  + *			other meta-characters, and itself. When used + *			in a set ([4]), it is treated as an ordinary + *			character. + * + *      [4]     [set]   matches one of the characters in the set. + *                      If the first character in the set is "^", + *                      it matches a character NOT in the set, i.e.  + *			complements the set. A shorthand S-E is  + *			used to specify a set of characters S upto  + *			E, inclusive. The special characters "]" and  + *			"-" have no special meaning if they appear  + *			as the first chars in the set. + *                      examples:        match: + * + *                              [a-z]    any lowercase alpha + * + *                              [^]-]    any char except ] and - + * + *                              [^A-Z]   any char except uppercase + *                                       alpha + * + *                              [a-zA-Z] any alpha + * + *      [5]     *       any regular expression form [1] to [4], followed by + *                      closure char (*) matches zero or more matches of + *                      that form. + * + *      [6]     +       same as [5], except it matches one or more. + * + *      [7]             a regular expression in the form [1] to [10], enclosed + *                      as \(form\) matches what form matches. The enclosure + *                      creates a set of tags, used for [8] and for + *                      pattern substution. The tagged forms are numbered + *			starting from 1. + * + *      [8]             a \ followed by a digit 1 to 9 matches whatever a + *                      previously tagged regular expression ([7]) matched. + * + *	[9]	\<	a regular expression starting with a \< construct + *		\>	and/or ending with a \> construct, restricts the + *			pattern matching to the beginning of a word, and/or + *			the end of a word. A word is defined to be a character + *			string beginning and/or ending with the characters + *			A-Z a-z 0-9 and _. It must also be preceded and/or + *			followed by any character outside those mentioned. + * + *      [10]            a composite regular expression xy where x and y + *                      are in the form [1] to [10] matches the longest + *                      match of x followed by a match for y. + * + *      [11]	^	a regular expression starting with a ^ character + *		$	and/or ending with a $ character, restricts the + *                      pattern matching to the beginning of the line, + *                      or the end of line. [anchors] Elsewhere in the + *			pattern, ^ and $ are treated as ordinary characters. + * + * + * Acknowledgements: + * + *	HCR's Hugh Redelmeier has been most helpful in various + *	stages of development. He convinced me to include BOW + *	and EOW constructs, originally invented by Rob Pike at + *	the University of Toronto. + * + * References: + *              Software tools			Kernighan & Plauger + *              Software tools in Pascal        Kernighan & Plauger + *              Grep [rsx-11 C dist]            David Conroy + *		ed - text editor		Un*x Programmer's Manual + *		Advanced editing on Un*x	B. W. Kernighan + *		RegExp routines			Henry Spencer + * + * Notes: + * + *	This implementation uses a bit-set representation for character + *	classes for speed and compactness. Each character is represented  + *	by one bit in a 128-bit block. Thus, CCL always takes a  + *	constant 16 bytes in the internal nfa, and RESearch::Execute does a single + *	bit comparison to locate the character in the set. + * + * Examples: + * + *	pattern:	foo*.* + *	compile:	CHR f CHR o CLO CHR o END CLO ANY END END + *	matches:	fo foo fooo foobar fobar foxx ... + * + *	pattern:	fo[ob]a[rz]	 + *	compile:	CHR f CHR o CCL bitset CHR a CCL bitset END + *	matches:	fobar fooar fobaz fooaz + * + *	pattern:	foo\\+ + *	compile:	CHR f CHR o CHR o CHR \ CLO CHR \ END END + *	matches:	foo\ foo\\ foo\\\  ... + * + *	pattern:	\(foo\)[1-3]\1	(same as foo[1-3]foo) + *	compile:	BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END + *	matches:	foo1foo foo2foo foo3foo + * + *	pattern:	\(fo.*\)-\1 + *	compile:	BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END + *	matches:	foo-foo fo-fo fob-fob foobar-foobar ... + */ + +#include "RESearch.h" + +#define EXTEND + +#define OKP     1 +#define NOP     0 + +#define CHR     1 +#define ANY     2 +#define CCL     3 +#define BOL     4 +#define EOL     5 +#define BOT     6 +#define EOT     7 +#define BOW	8 +#define EOW	9 +#define REF     10 +#define CLO     11 + +#define END     0 + +/* + * The following defines are not meant to be changeable. + * They are for readability only. + */ +#define BLKIND	0170 +#define BITIND	07 + +#define ASCIIB	0177 + +const char bitarr[] = {1,2,4,8,16,32,64,'\200'}; + +#define badpat(x)	(*nfa = END, x) +  +RESearch::RESearch() { +	sta = NOP;               	/* status of lastpat */ +	bol = 0; +	for (int i=0; i<BITBLK; i++) +		bittab[i] = 0; +} + +void RESearch::ChSet(char c) { +	bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND]; +} + +const char *RESearch::Compile(char *pat) { +	char *p;               /* pattern pointer   */ +	char *mp=nfa;          /* nfa pointer       */ +	char *lp;              /* saved pointer..   */ +	char *sp=nfa;          /* another one..     */ + +	int tagi = 0;          /* tag stack index   */ +	int tagc = 1;          /* actual tag count  */ + +	int n; +	char mask;		/* xor mask -CCL/NCL */ +	int c1, c2; +		 +	if (!pat || !*pat) +		if (sta) +			return 0; +		else +			return badpat("No previous regular expression"); +	sta = NOP; + +	for (p = pat; *p; p++) { +		lp = mp; +		switch(*p) { + +		case '.':               /* match any char..  */ +			*mp++ = ANY; +			break; + +		case '^':               /* match beginning.. */ +			if (p == pat) +				*mp++ = BOL; +			else { +				*mp++ = CHR; +				*mp++ = *p; +			} +			break; + +		case '$':               /* match endofline.. */ +			if (!*(p+1)) +				*mp++ = EOL; +			else { +				*mp++ = CHR; +				*mp++ = *p; +			} +			break; + +		case '[':               /* match char class..*/ +			*mp++ = CCL; + +			if (*++p == '^') { +				mask = '\377';	 +				p++; +			} +			else +				mask = 0; + +			if (*p == '-')		/* real dash */ +				ChSet(*p++); +			if (*p == ']')		/* real brac */ +				ChSet(*p++); +			while (*p && *p != ']') { +				if (*p == '-' && *(p+1) && *(p+1) != ']') { +					p++; +					c1 = *(p-2) + 1; +					c2 = *p++; +					while (c1 <= c2) +						ChSet(static_cast<char>(c1++)); +				} +#ifdef EXTEND +				else if (*p == '\\' && *(p+1)) { +					p++; +					ChSet(*p++); +				} +#endif +				else +					ChSet(*p++); +			} +			if (!*p) +				return badpat("Missing ]"); + +			for (n = 0; n < BITBLK; bittab[n++] = (char) 0) +				*mp++ = static_cast<char>(mask ^ bittab[n]); +	 +			break; + +		case '*':               /* match 0 or more.. */ +		case '+':               /* match 1 or more.. */ +			if (p == pat) +				return badpat("Empty closure"); +			lp = sp;		/* previous opcode */ +			if (*lp == CLO)		/* equivalence..   */ +				break; +			switch(*lp) { + +			case BOL: +			case BOT: +			case EOT: +			case BOW: +			case EOW: +			case REF: +				return badpat("Illegal closure"); +			default: +				break; +			} + +			if (*p == '+') +				for (sp = mp; lp < sp; lp++) +					*mp++ = *lp; + +			*mp++ = END; +			*mp++ = END; +			sp = mp; +			while (--mp > lp) +				*mp = mp[-1]; +			*mp = CLO; +			mp = sp; +			break; + +		case '\\':              /* tags, backrefs .. */ +			switch(*++p) { + +			case '(': +				if (tagc < MAXTAG) { +					tagstk[++tagi] = tagc; +					*mp++ = BOT; +					*mp++ = static_cast<char>(tagc++); +				} +				else +					return badpat("Too many \\(\\) pairs"); +				break; +			case ')': +				if (*sp == BOT) +					return badpat("Null pattern inside \\(\\)"); +				if (tagi > 0) { +					*mp++ = static_cast<char>(EOT); +					*mp++ = static_cast<char>(tagstk[tagi--]); +				} +				else +					return badpat("Unmatched \\)"); +				break; +			case '<': +				*mp++ = BOW; +				break; +			case '>': +				if (*sp == BOW) +					return badpat("Null pattern inside \\<\\>"); +				*mp++ = EOW; +				break; +			case '1': +			case '2': +			case '3': +			case '4': +			case '5': +			case '6': +			case '7': +			case '8': +			case '9': +				n = *p-'0'; +				if (tagi > 0 && tagstk[tagi] == n) +					return badpat("Cyclical reference"); +				if (tagc > n) { +					*mp++ = static_cast<char>(REF); +					*mp++ = static_cast<char>(n); +				} +				else +					return badpat("Undetermined reference"); +				break; +#ifdef EXTEND +			case 'a': +				*mp++ = CHR; +				*mp++ = '\a'; +				break; +			case 'b': +				*mp++ = CHR; +				*mp++ = '\b'; +				break; +			case 'n': +				*mp++ = CHR; +				*mp++ = '\n'; +				break; +			case 'f': +				*mp++ = CHR; +				*mp++ = '\f'; +				break; +			case 'r': +				*mp++ = CHR; +				*mp++ = '\r'; +				break; +			case 't': +				*mp++ = CHR; +				*mp++ = '\t'; +				break; +			case 'v': +				*mp++ = CHR; +				*mp++ = '\v'; +				break; +#endif +			default: +				*mp++ = CHR; +				*mp++ = *p; +			} +			break; + +		default :               /* an ordinary char  */ +			*mp++ = CHR; +			*mp++ = *p; +			break; +		} +		sp = lp; +	} +	if (tagi > 0) +		return badpat("Unmatched \\("); +	*mp = END; +	sta = OKP; +	return 0; +} + +/* + * RESearch::Execute: + * 	execute nfa to find a match. + * + *	special cases: (nfa[0])	 + *		BOL + *			Match only once, starting from the + *			beginning. + *		CHR + *			First locate the character without + *			calling PMatch, and if found, call + *			PMatch for the remaining string. + *		END + *			RESearch::Compile failed, poor luser did not + *			check for it. Fail fast. + * + *	If a match is found, bopat[0] and eopat[0] are set + *	to the beginning and the end of the matched fragment, + *	respectively. + * + */ + +int RESearch::Execute(CharacterIndexer &ci, int lp) { +	char c; +	int ep = 0; +	char *ap = nfa; + +	bol = lp; +	failure = 0; + +	for (int i=0;i<MAXTAG;i++) { +		bopat[i] = NOTFOUND; +		eopat[i] = NOTFOUND; +	} + +	switch(*ap) { + +	case BOL:			/* anchored: match from BOL only */ +		ep = PMatch(ci, lp, ap); +		break; +	case CHR:			/* ordinary char: locate it fast */ +		c = *(ap+1); +		while (ci.CharAt(lp) && ci.CharAt(lp) != c) +			lp++; +		if (!ci.CharAt(lp))		/* if EOS, fail, else fall thru. */ +			return 0; +	default:			/* regular matching all the way. */ +		while (ci.CharAt(lp)) { +			ep = PMatch(ci, lp, ap); +			if (ep != NOTFOUND) +				break; +			lp++; +		} +		break; +	case END:			/* munged automaton. fail always */ +		return 0; +	} +	if (ep == NOTFOUND) +		return 0; + +	bopat[0] = lp; +	eopat[0] = ep; +	return 1; +} + +/*  + * PMatch: internal routine for the hard part + * + * 	This code is partly snarfed from an early grep written by + *	David Conroy. The backref and tag stuff, and various other + *	innovations are by oz. + * + *	special case optimizations: (nfa[n], nfa[n+1]) + *		CLO ANY + *			We KNOW .* will match everything upto the + *			end of line. Thus, directly go to the end of + *			line, without recursive PMatch calls. As in + *			the other closure cases, the remaining pattern + *			must be matched by moving backwards on the + *			string recursively, to find a match for xy + *			(x is ".*" and y is the remaining pattern) + *			where the match satisfies the LONGEST match for + *			x followed by a match for y. + *		CLO CHR + *			We can again scan the string forward for the + *			single char and at the point of failure, we + *			execute the remaining nfa recursively, same as + *			above. + * + *	At the end of a successful match, bopat[n] and eopat[n] + *	are set to the beginning and end of subpatterns matched + *	by tagged expressions (n = 1 to 9).	 + * + */ + +extern void re_fail(char *,char); + +/* + * character classification table for word boundary operators BOW + * and EOW. the reason for not using ctype macros is that we can + * let the user add into our own table. see RESearch::ModifyWord. This table + * is not in the bitset form, since we may wish to extend it in the + * future for other character classifications.  + * + *	TRUE for 0-9 A-Z a-z _ + */ +static char chrtyp[MAXCHR] = { +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  +	0, 0, 0, 0, 0, 0, 0, 0, 1, 1,  +	1, 1, 1, 1, 1, 1, 1, 1, 0, 0,  +	0, 0, 0, 0, 0, 1, 1, 1, 1, 1,  +	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  +	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  +	1, 0, 0, 0, 0, 1, 0, 1, 1, 1,  +	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  +	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  +	1, 1, 1, 0, 0, 0, 0, 0 +	}; + +#define inascii(x)	(0177&(x)) +#define iswordc(x) 	chrtyp[inascii(x)] +#define isinset(x,y) 	((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND]) + +/* + * skip values for CLO XXX to skip past the closure + */ + +#define ANYSKIP	2 	/* [CLO] ANY END ...	     */ +#define CHRSKIP	3	/* [CLO] CHR chr END ...     */ +#define CCLSKIP 18	/* [CLO] CCL 16bytes END ... */ + +int RESearch::PMatch(CharacterIndexer &ci, int lp, char *ap) { +	int op, c, n; +	int e;		/* extra pointer for CLO */ +	int bp;		/* beginning of subpat.. */ +	int ep;		/* ending of subpat..	 */ +	int are;			/* to save the line ptr. */ + +	while ((op = *ap++) != END) +		switch(op) { + +		case CHR: +			if (ci.CharAt(lp++) != *ap++) +				return NOTFOUND; +			break; +		case ANY: +			if (!ci.CharAt(lp++)) +				return NOTFOUND; +			break; +		case CCL: +			c = ci.CharAt(lp++); +			if (!isinset(ap,c)) +				return NOTFOUND; +			ap += BITBLK; +			break; +		case BOL: +			if (lp != bol) +				return NOTFOUND; +			break; +		case EOL: +			if (ci.CharAt(lp)) +				return NOTFOUND; +			break; +		case BOT: +			bopat[*ap++] = lp; +			break; +		case EOT: +			eopat[*ap++] = lp; +			break; + 		case BOW: +			if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp))) +				return NOTFOUND; +			break; +		case EOW: +			if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp))) +				return NOTFOUND; +			break; +		case REF: +			n = *ap++; +			bp = bopat[n]; +			ep = eopat[n]; +			while (bp < ep) +				if (ci.CharAt(bp++) != ci.CharAt(lp++)) +					return NOTFOUND; +			break; +		case CLO: +			are = lp; +			switch(*ap) { + +			case ANY: +				while (ci.CharAt(lp)) +					lp++; +				n = ANYSKIP; +				break; +			case CHR: +				c = *(ap+1); +				while (ci.CharAt(lp) && c == ci.CharAt(lp)) +					lp++; +				n = CHRSKIP; +				break; +			case CCL: +				while (((c = ci.CharAt(lp)) != 0) && isinset(ap+1,c)) +					lp++; +				n = CCLSKIP; +				break; +			default: +				failure = true; +				//re_fail("closure: bad nfa.", *ap); +				return NOTFOUND; +			} + +			ap += n; + +			while (lp >= are) { +				if ((e = PMatch(ci, lp, ap)) != NOTFOUND) +					return e; +				--lp; +			} +			return NOTFOUND; +		default: +			//re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op)); +			return NOTFOUND; +		} +	return lp; +} + +/* + * RESearch::ModifyWord: + *	add new characters into the word table to change RESearch::Execute's + *	understanding of what a word should look like. Note that we + *	only accept additions into the word definition. + * + *	If the string parameter is 0 or null string, the table is + *	reset back to the default containing A-Z a-z 0-9 _. [We use + *	the compact bitset representation for the default table] + */ + +static char deftab[16] = {	 +	0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',   +	'\376', '\377', '\377', 007  +};  + +void RESearch::ModifyWord(char *s) { +	int i; + +	if (!s || !*s) { +		for (i = 0; i < MAXCHR; i++) +			if (!isinset(deftab,i)) +				iswordc(i) = 0; +	} +	else +		while(*s) +			iswordc(*s++) = 1; +} + +/* + * RESearch::Substitute: + *	substitute the matched portions of the src in dst. + * + *	&	substitute the entire matched pattern. + * + *	\digit	substitute a subpattern, with the given	tag number. + *		Tags are numbered from 1 to 9. If the particular + *		tagged subpattern does not exist, null is substituted. + */ +int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) { +	char c; +	int  pin; +	int bp; +	int ep; + +	if (!*src || !bopat[0]) +		return 0; + +	while ((c = *src++) != 0) { +		switch(c) { + +		case '&': +			pin = 0; +			break; + +		case '\\': +			c = *src++; +			if (c >= '0' && c <= '9') { +				pin = c - '0'; +				break; +			} +			 +		default: +			*dst++ = c; +			continue; +		} + +		if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) { +			while (ci.CharAt(bp) && bp < ep) +				*dst++ = ci.CharAt(bp++); +			if (bp < ep) +				return 0; +		} +	} +	*dst = (char) 0; +	return 1; +} +			 +#ifdef DEBUG +/* + * symbolic - produce a symbolic dump of the nfa + */ +void symbolic(char *s) { +	printf("pattern: %s\n", s); +	printf("nfacode:\n"); +	nfadump(nfa); +} + +static void nfadump(char *ap) { +	int n; + +	while (*ap != END) +		switch(*ap++) { +		case CLO: +			printf("CLOSURE"); +			nfadump(ap); +			switch(*ap) { +			case CHR: +				n = CHRSKIP; +				break; +			case ANY: +				n = ANYSKIP; +				break; +			case CCL: +				n = CCLSKIP; +				break; +			} +			ap += n; +			break; +		case CHR: +			printf("\tCHR %c\n",*ap++); +			break; +		case ANY: +			printf("\tANY .\n"); +			break; +		case BOL: +			printf("\tBOL -\n"); +			break; +		case EOL: +			printf("\tEOL -\n"); +			break; +		case BOT: +			printf("BOT: %d\n",*ap++); +			break; +		case EOT: +			printf("EOT: %d\n",*ap++); +			break; +		case BOW: +			printf("BOW\n"); +			break; +		case EOW: +			printf("EOW\n"); +			break; +		case REF: +			printf("REF: %d\n",*ap++); +			break; +		case CCL: +			printf("\tCCL ["); +			for (n = 0; n < MAXCHR; n++) +				if (isinset(ap,(char)n)) { +					if (n < ' ') +						printf("^%c", n ^ 0x040); +					else +						printf("%c", n); +				} +			printf("]\n"); +			ap += BITBLK; +			break; +		default: +			printf("bad nfa. opcode %o\n", ap[-1]); +			exit(1); +			break; +		} +} +#endif | 
