diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/Document.cxx | 56 | ||||
| -rw-r--r-- | src/Document.h | 2 | ||||
| -rw-r--r-- | src/PosRegExp.cxx | 1192 | ||||
| -rw-r--r-- | src/RESearch.cxx | 844 | ||||
| -rw-r--r-- | src/RESearch.h | 54 | 
5 files changed, 934 insertions, 1214 deletions
| diff --git a/src/Document.cxx b/src/Document.cxx index ac9c381a0..ba7fa4911 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -16,7 +16,10 @@  #include "SVector.h"  #include "CellBuffer.h"  #include "Document.h" -#include "PosRegExp.h" +#include "RESearch.h" + +void re_fail(char *,char) { +}  // This is ASCII specific but is safe with chars >= 0x80  inline bool isspacechar(unsigned char ch) { @@ -743,10 +746,6 @@ bool Document::IsWordAt(int start, int end) {  	return IsWordStartAt(start) && IsWordEndAt(end);  } -char Document::DocCharAt(int pos, void *param) { -	return reinterpret_cast<Document*>(param)->CharAt(pos); -} -  // The comparison and case changing functions here assume ASCII  // or extended ASCII such as the normal Windows code page. @@ -764,6 +763,22 @@ static inline char MakeLowerCase(char ch) {  		return static_cast<char>(ch - 'A' + 'a');  } +// Define a way for the Regular Expression code to access the document +class DocumentIndexer : public CharacterIndexer { +	Document *pdoc; +	int end; +public:  +	DocumentIndexer(Document *pdoc_, int end_) :  +		pdoc(pdoc_), end(end_) { +	} +	virtual char CharAt(int index) { +		if (index < 0 || index >= end) +			return 0; +		else  +			return pdoc->CharAt(index); +	} +}; +  // Find text in document, supporting both forward and backward  // searches (just pass minPos > maxPos to do a backward search)  // Has not been tested with backwards DBCS searches yet. @@ -771,11 +786,11 @@ long Document::FindText(int minPos, int maxPos, const char *s,                          bool caseSensitive, bool word, bool wordStart, bool regExp,  			int *length) {  	if (regExp) { -		char *pat = new char[strlen(s) + 4]; +		char *pat = new char[strlen(s) + 1];  		if (!pat)  			return -1; -		strcpy(pat, "/"); +		pat[0] = '\0';  		int startPos;  		int endPos; @@ -791,25 +806,26 @@ long Document::FindText(int minPos, int maxPos, const char *s,  		startPos = MovePositionOutsideChar(startPos, 1, false);  		endPos = MovePositionOutsideChar(endPos, 1, false); +		DocumentIndexer di(this, endPos); +		RESearch re;  		strcat(pat, s); -		strcat(pat, "/"); -		PosRegExp re; -		if (!caseSensitive) -			strcat(pat, "i"); -		if (!re.SetExpr(pat)) { +		const char *errmsg = re.Compile(pat); +		if (errmsg) {  			delete []pat;  			return -1;  		} -		re.param = this; -		re.CharAt = DocCharAt; -		SMatches matches; -		if (!re.Parse(startPos, 0, endPos, &matches)) { -			delete []pat; -			return -1; +		// Find a variable in a property file: \$([A-Za-z0-9_.]+) +		int success = re.Execute(di, startPos); +		int pos = -1; +		int lenRet = 0; +		if (success) { +			pos = re.bopat[0]; +			lenRet = re.eopat[0] - re.bopat[0];  		} -		*length = matches.e[0] - matches.s[0];  		delete []pat; -		return matches.s[0]; +		*length = lenRet; +		return pos; +  	} else {  		bool forward = minPos <= maxPos; diff --git a/src/Document.h b/src/Document.h index cfb131608..b52036513 100644 --- a/src/Document.h +++ b/src/Document.h @@ -196,8 +196,6 @@ public:  	const WatcherWithUserData *GetWatchers() const { return watchers; }  	int GetLenWatchers() const { return lenWatchers; } -	static char DocCharAt(int pos, void *param); -	  	bool IsWordPartSeparator(char ch);  	int WordPartLeft(int pos);  	int WordPartRight(int pos); diff --git a/src/PosRegExp.cxx b/src/PosRegExp.cxx deleted file mode 100644 index eb240e2fb..000000000 --- a/src/PosRegExp.cxx +++ /dev/null @@ -1,1192 +0,0 @@ -#include <string.h> -#include <stdio.h> -#include <ctype.h> -#include <malloc.h> - -#include "PosRegExp.h" - -#ifdef _MSC_VER -#pragma warning(disable: 4244) -#endif -#ifdef __BORLANDC__ -// Too much effort to to cean this code up so just ignore badly  -// bracketed initialisers, conversions losing significant digits,  -// and values assigned but not used. -#pragma warn -pin -#pragma warn -sig -#pragma warn -aus -#endif -//Up: /[A-Z \x80-\x9f \xf0 ]/x -//Lo: /[a-z \xa0-\xaf \xe0-\xef \xf1 ]/x -//Wd: /[\d _ A-Z a-z \xa0-\xaf \xe0-\xf1 \x80-\x9f]/x -//*   // Dos866 -SCharData UCData  = {0x0, 0x0, 0x7fffffe, 0x0, 0xffffffff, 0x0, 0x0, 0x10000}, -                     LCData  = {0x0, 0x0, 0x0, 0x7fffffe, 0x0, 0xffff, 0x0, 0x2ffff}, -                     WdData  = {0x0, 0x3ff0000, 0x87fffffe, 0x7fffffe, 0xffffffff, 0xffff, 0x0, 0x3ffff}, -                     DigData = {0x0, 0x3ff0000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; -/*/   // cp1251 -SCharData UCData  = {0x0, 0x0, 0x7fffffe, 0x0, 0x0, 0x0, 0xffffffff, 0x0}, -          LCData  = {0x0, 0x0, 0x0, 0x7fffffe, 0x0, 0x0, 0x0, 0xffffffff}, -          WdData  = {0x0, 0x3ff0000, 0x87fffffe, 0x7fffffe, 0x0, 0x0, 0xffffffff, 0xffffffff}, -          DigData = {0x0, 0x3ff0000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; -//*/ - -/////////////////////////////////////////////// - -int GetNumber(int *str,int s,int e) { -	int r = 1, num = 0; -	if (e < s) return -1; -	for(int i = e-1; i >= s; i--) { -		if (str[i] > '9' || str[i] < '0') return -1; -		num += (str[i] - 0x30)*r; -		r *= 10; -	}; -	return num; -	/* -	char tmp[20]; -	double Res; -	  if (e == s) return -1; -	  for (int i = s;i < e;i++) -	    tmp[i-s] = (char)Str[i]; -	  tmp[e-s] = 0; -	  GetNumber(tmp,&Res); -	  return (int)Res; -	*/ -}; - -bool IsDigit(char Symb) { -	return DigData.GetBit(Symb); -}; -bool IsWord(char Symb) { -	return WdData.GetBit(Symb); -}; -bool IsUpperCase(char Symb) { -	return UCData.GetBit(Symb); -}; -bool IsLowerCase(char Symb) { -	return LCData.GetBit(Symb); -}; -char LowCase(char Chr) { -	if (UCData.GetBit(Chr)) -		return Chr+0x20; -	return Chr; -}; - -/////////////////////////////////////////////// - -SRegInfo::SRegInfo() { -	Next = Parent = 0; -	un.Param = 0; -	Op = ReEmpty; -}; -SRegInfo::~SRegInfo() { -	if (Next) delete Next; -	if (un.Param) -		switch(Op) { -		case ReEnum: -		case ReNEnum: -			delete un.ChrClass; -			break; -		default: -			if (Op > ReBlockOps && Op < ReSymbolOps || Op == ReBrackets) -				delete un.Param; -			break; -		}; -}; - -/////////////////////////////////////////////// - -void SCharData::SetBit(unsigned char Bit) { -	int p = Bit/8; -	CArr[p] |= (1 << Bit%8); -}; -void SCharData::ClearBit(unsigned char Bit) { -	int p = Bit/8; -	CArr[p] &= ~(1 << Bit%8); -}; -bool SCharData::GetBit(unsigned char Bit) { -	int p = (unsigned char)Bit/8; -	return (CArr[p] & (1 << Bit%8))!=0; -}; - -///////////////////////////////////////////////////////////////// -//////////////////////  RegExp Class  /////////////////////////// -///////////////////////////////////////////////////////////////// - -PosRegExp::PosRegExp() { -	Info = 0; -	Exprn = 0; -	NoMoves = false; -	Error = true; -	FirstChar = 0; -	CurMatch = 0; -}; -PosRegExp::~PosRegExp() { -	if (Info) delete Info; -}; - -bool PosRegExp::SetExpr(const char *Expr) { -	if (!this) return false; -	Error = true; -	CurMatch = 0; -	if (SetExprLow(Expr)) Error = false; -	return !Error; -}; -bool PosRegExp::isok() { -	return !Error; -}; - - -bool PosRegExp::SetExprLow(const char *Expr) { -	int Len = strlen(Expr); -	bool  Ok = false; -	int i,j,s = 0,pos,tmp; -	int EnterBr = 0,EnterGr = 0,EnterFg = 0; - -	if (Info) delete Info; -	Info = new SRegInfo; -	Exprn = new int[Len]; - -	NoCase = false; -	Extend = false; -	if (Expr[0] == '/') s++; -	else return false; - -	for (i = Len; i > 0 && !Ok;i--) -		if (Expr[i] == '/') { -			Len = i-s; -			Ok = true; -			for (int j = i+1; Expr[j]; j++) { -				if (Expr[j] == 'i') NoCase = true; -				if (Expr[j] == 'x') Extend = true; -			}; -		}; -	if (!Ok) return false; - -	//////////////////////////////// -	for (j = 0,pos = 0; j < Len; j++,pos++) { -		if (Extend && Expr[j+s] == ' ') { -			pos--; -			continue; -		}; - -		Exprn[pos] = (int)(unsigned char)Expr[j+s]; - -		if (Expr[j+s] == BackSlash) { -			switch (Expr[j+s+1]) { -			case 'd': -				Exprn[pos] = ReDigit; -				break; -			case 'D': -				Exprn[pos] = ReNDigit; -				break; -			case 'w': -				Exprn[pos] = ReWordSymb; -				break; -			case 'W': -				Exprn[pos] = ReNWordSymb; -				break; -			case 's': -				Exprn[pos] = ReWSpace; -				break; -			case 'S': -				Exprn[pos] = ReNWSpace; -				break; -			case 'u': -				Exprn[pos] = ReUCase; -				break; -			case 'l': -				Exprn[pos] = ReNUCase; -				break; -			case 't': -				Exprn[pos] = '\t'; -				break; -			case 'n': -				Exprn[pos] = '\n'; -				break; -			case 'r': -				Exprn[pos] = '\r'; -				break; -			case 'b': -				Exprn[pos] = ReWBound; -				break; -			case 'B': -				Exprn[pos] = ReNWBound; -				break; -			case 'c': -				Exprn[pos] = RePreNW; -				break; -			case 'm': -				Exprn[pos] = ReStart; -				break; -			case 'M': -				Exprn[pos] = ReEnd; -				break; -			case 'x': -				tmp = toupper(Expr[j+s+2])-0x30; -				tmp = (tmp>9?tmp-7:tmp)<<4; -				tmp += (toupper(Expr[j+s+3])-0x30)>9?toupper(Expr[j+s+3])-0x37:(toupper(Expr[j+s+3])-0x30); -				Exprn[pos] = tmp; -				j+=2; -				break; -			case 'y': -				tmp = Expr[j+s+2] - 0x30; -				if (tmp >= 0 && tmp <= 9) { -					if (tmp == 1) { -						tmp = 10 + Expr[j+s+3] - 0x30; -						if (tmp >= 10 && tmp <= 19) j++; -						else tmp = 1; -					}; -					Exprn[pos] = ReBkTrace + tmp; -					j++; -					break; -				}; -			default: -				tmp = Expr[j+s+1] - 0x30; -				if (tmp >= 0 && tmp <= 9) { -					if (tmp == 1) { -						tmp = 10 + Expr[j+s+2] - 0x30; -						if (tmp >= 10 && tmp <= 19) j++; -						else tmp = 1; -					}; -					Exprn[pos] = ReBkBrack + tmp; -					break; -				} else -					Exprn[pos] = Expr[j+s+1]; -				break; -			}; -			j++; -			continue; -		}; -		if (Expr[j+s] == ']') { -			Exprn[pos] = ReEnumE; -			if (EnterFg || !EnterGr) return false; -			EnterGr--; -		}; -		if (Expr[j+s] == '-' && EnterGr) Exprn[pos] = ReFrToEnum; - -		if (EnterGr) continue; - -		if (Expr[j+s] == '[' && Expr[j+s+1] == '^') { -			Exprn[pos] = ReNEnumS; -			if (EnterFg) return false; -			EnterGr++; -			j++; -			continue; -		}; -		if (Expr[j+s] == '*' && Expr[j+s+1] == '?') { -			Exprn[pos] = ReNGMul; -			j++; -			continue; -		}; -		if (Expr[j+s] == '+' && Expr[j+s+1] == '?') { -			Exprn[pos] = ReNGPlus; -			j++; -			continue; -		}; -		if (Expr[j+s] == '?' && Expr[j+s+1] == '?') { -			Exprn[pos] = ReNGQuest; -			j++; -			continue; -		}; -		if (Expr[j+s] == '?' && Expr[j+s+1] == '#' && -		        Expr[j+s+2]>='0' && Expr[j+s+2]<='9') { -			Exprn[pos] = ReBehind+Expr[j+s+2]-0x30; -			j+=2; -			continue; -		}; -		if (Expr[j+s] == '?' && Expr[j+s+1] == '~' && -		        Expr[j+s+2]>='0' && Expr[j+s+2]<='9') { -			Exprn[pos] = ReNBehind+Expr[j+s+2]-0x30; -			j+=2; -			continue; -		}; -		if (Expr[j+s] == '?' && Expr[j+s+1] == '=') { -			Exprn[pos] = ReAhead; -			j++; -			continue; -		}; -		if (Expr[j+s] == '?' && Expr[j+s+1] == '!') { -			Exprn[pos] = ReNAhead; -			j++; -			continue; -		}; - -		if (Expr[j+s] == '(') { -			Exprn[pos] = ReLBrack; -			if (EnterFg) return false; -			EnterBr++; -		}; -		if (Expr[j+s] == ')') { -			Exprn[pos] = ReRBrack; -			if (!EnterBr || EnterFg) return false; -			EnterBr--; -		}; -		if (Expr[j+s] == '[') { -			Exprn[pos] = ReEnumS; -			if (EnterFg) return false; -			EnterGr++; -		}; -		if (Expr[j+s] == '{') { -			Exprn[pos] = ReRangeS; -			if (EnterFg) return false; -			EnterFg++; -		}; -		if (Expr[j+s] == '}' && Expr[j+s+1] == '?') { -			Exprn[pos] = ReNGRangeE; -			if (!EnterFg) return false; -			EnterFg--; -			j++; -			continue; -		}; -		if (Expr[j+s] == '}') { -			Exprn[pos] = ReRangeE; -			if (!EnterFg) return false; -			EnterFg--; -		}; - -		if (Expr[j+s] == '^') Exprn[pos] = ReSoL; -		if (Expr[j+s] == '$') Exprn[pos] = ReEoL; -		if (Expr[j+s] == '.') Exprn[pos] = ReAnyChr; -		if (Expr[j+s] == '*') Exprn[pos] = ReMul; -		if (Expr[j+s] == '+') Exprn[pos] = RePlus; -		if (Expr[j+s] == '?') Exprn[pos] = ReQuest; -		if (Expr[j+s] == '|') Exprn[pos] = ReOr; -	}; -	if (EnterGr || EnterBr || EnterFg) return false; - -	Info->Op = ReBrackets; -	Info->un.Param = new SRegInfo; -	Info->s = CurMatch++; - -	if (!SetStructs(Info->un.Param,0,pos)) return false; -	Optimize(); -	delete Exprn; -	return true; -}; - -void PosRegExp::Optimize() { -	PRegInfo Next = Info; -	FirstChar = 0; -	while(Next) { -		if (Next->Op == ReBrackets || Next->Op == RePlus  || Next->Op == ReNGPlus) { -			Next = Next->un.Param; -			continue; -		}; -		if (Next->Op == ReSymb) { -			if (Next->un.Symb & 0xFF00 &&  Next->un.Symb != ReSoL && Next->un.Symb != ReWBound) -				break; -			FirstChar = Next->un.Symb; -			break; -		}; -		break; -	}; -}; - -bool PosRegExp::SetStructs(PRegInfo &re,int start,int end) { -	PRegInfo Next,Prev,Prev2; -	int comma,st,en,ng,i, j,k; -	int EnterBr; -	bool Add; - -	if (end - start < 0) return false; -	Next = re; -	for (i = start; i < end; i++) { -		Add = false; -		// Ops -		if (Exprn[i] > ReBlockOps && Exprn[i] < ReSymbolOps) { -			Next->un.Param = 0; -			Next->Op = (EOps)Exprn[i]; -			Add = true; -		}; -		// {n,m} -		if (Exprn[i] == ReRangeS) { -			st = i; -			en = -1; -			comma = -1; -			ng = 0; -			for (j = i;j < end;j++) { -				if (Exprn[j] == ReNGRangeE) { -					en = j; -					ng = 1; -					break; -				}; -				if (Exprn[j] == ReRangeE) { -					en = j; -					break; -				}; -				if ((char)Exprn[j] == ',') -					comma = j; -			}; -			if (en == -1) return false; -			if (comma == -1) comma = en; -			Next->s = (char)GetNumber(Exprn,st+1,comma); -			if (comma != en) -				Next->e = (char)GetNumber(Exprn,comma+1,en); -			else -				Next->e = Next->s; -			Next->un.Param = 0; -			Next->Op = ng?ReNGRangeNM:ReRangeNM; -			if (en-comma == 1) { -				Next->e = -1; -				Next->Op = ng?ReNGRangeN:ReRangeN; -			}; -			i=j; -			Add = true; -		}; -		// [] [^] -		if (Exprn[i] == ReEnumS || Exprn[i] == ReNEnumS) { -			Next->Op = (Exprn[i] == ReEnumS)?ReEnum:ReNEnum; -			for (j = i+1;j < end;j++) { -				if (Exprn[j] == ReEnumE) -					break; -			}; -			if (j == end) return false; -			Next->un.ChrClass = new SCharData; -			memset(Next->un.ChrClass, 0, 32); -			for (j = i+1;Exprn[j] != ReEnumE;j++) { -				if (Exprn[j+1] == ReFrToEnum) { -					for (i = (Exprn[j]&0xFF); i < (Exprn[j+2]&0xFF);i++) -						Next->un.ChrClass->SetBit(i&0xFF); -					j++; -					continue; -				}; -				switch(Exprn[j]) { -				case ReDigit: -					for (k = 0x30;k < 0x40;k++) -						if (IsDigit((char)k)) -							Next->un.ChrClass->SetBit(k); -					break; -				case ReNDigit: -					for (k = 0x30;k < 0x40;k++) -						if (!IsDigit((char)k)) -							Next->un.ChrClass->SetBit(k); -					Next->un.ChrClass->ClearBit(0x0a); -					Next->un.ChrClass->ClearBit(0x0d); -					break; -				case ReWordSymb: -					for (k = 0;k < 256;k++) -						if (IsWord((char)k)) -							Next->un.ChrClass->SetBit(k); -					break; -				case ReNWordSymb: -					for (k = 0;k < 256;k++) -						if (!IsWord((char)k)) -							Next->un.ChrClass->SetBit(k); -					Next->un.ChrClass->ClearBit(0x0a); -					Next->un.ChrClass->ClearBit(0x0d); -					break; -				case ReWSpace: -					Next->un.ChrClass->SetBit(0x20); -					Next->un.ChrClass->SetBit(0x09); -					break; -				case ReNWSpace: -					memset(Next->un.ChrClass->IArr, 0xFF, 32); -					Next->un.ChrClass->ClearBit(0x20); -					Next->un.ChrClass->ClearBit(0x09); -					Next->un.ChrClass->ClearBit(0x0a); -					Next->un.ChrClass->ClearBit(0x0d); -					break; -				default: -					if (!(Exprn[j]&0xFF00)) -						Next->un.ChrClass->SetBit(Exprn[j]&0xFF); -					break; -				}; -			}; -			Add = true; -			i=j; -		}; -		// ( ... ) -		if (Exprn[i] == ReLBrack) { -			EnterBr = 1; -			for (j = i+1;j < end;j++) { -				if (Exprn[j] == ReLBrack) EnterBr++; -				if (Exprn[j] == ReRBrack) EnterBr--; -				if (!EnterBr) break; -			}; -			if (EnterBr) return false; -			Next->Op = ReBrackets; -			Next->un.Param = new SRegInfo; -			Next->un.Param->Parent = Next; -			Next->s = CurMatch++; -			if (CurMatch > MatchesNum) CurMatch = MatchesNum; -			if (!SetStructs(Next->un.Param,i+1,j)) return false; -			Add = true; -			i=j; -		}; -		if ((Exprn[i]&0xFF00) == ReBkTrace) { -			Next->Op = ReBkTrace; -			Next->un.Symb = Exprn[i]&0xFF; -			Add = true; -		}; -		if ((Exprn[i]&0xFF00) == ReBkBrack) { -			Next->Op = ReBkBrack; -			Next->un.Symb = Exprn[i]&0xFF; -			Add = true; -		}; -		if ((Exprn[i]&0xFF00) == ReBehind) { -			Next->Op = ReBehind; -			Next->s = Exprn[i]&0xFF; -			Add = true; -		}; -		if ((Exprn[i]&0xFF00) == ReNBehind) { -			Next->Op = ReNBehind; -			Next->s = Exprn[i]&0xFF; -			Add = true; -		}; -		// Chars -		if (Exprn[i] >= ReAnyChr && Exprn[i] < ReTemp || Exprn[i] < 0x100) { -			Next->Op = ReSymb; -			Next->un.Symb = Exprn[i]; -			Add = true; -		}; -		// Next -		if (Add && i != end-1) { -			Next->Next = new SRegInfo; -			Next->Next->Parent = Next->Parent; -			Next = Next->Next; -		}; -	}; -	Next = re; -	Prev = Prev2 = 0; -	while(Next) { -		if (Next->Op > ReBlockOps && Next->Op < ReSymbolOps) { -			if (!Prev) return false; -			if (!Prev2) re = Next; -			else Prev2->Next = Next; -			//if (Prev->Op > ReBlockOps && Prev->Op < ReSymbolOps) return false; -			Prev->Parent = Next; -			Prev->Next = 0; -			Next->un.Param = Prev; -			Prev = Prev2; -		}; -		Prev2 = Prev; -		Prev = Next; -		Next = Next->Next; -	}; - -	return true; -}; - -///////////////////////////////////////////////////////////////// -/////////////////////////  Parsing  ///////////////////////////// -///////////////////////////////////////////////////////////////// - -bool PosRegExp::CheckSymb(int Symb,bool Inc) { -	bool Res; -	char ch; -	switch(Symb) { -	case ReAnyChr: -		if (posParse >= posEnd) return false; -		ch = CharAt(posParse,param); -		Res = ch != '\r' && ch != '\n'; -		if (Res && Inc) posParse++; -		return Res; -	case ReSoL: -		if (posStart == posParse) -			return true; -		ch = CharAt(posParse-1,param); -		return ch == '\n' || ch == '\r'; -	case ReEoL: -		if (posEnd == posParse) -			return true; -		ch = CharAt(posParse,param); -		return ch == '\n' || ch == '\r'; -	case ReDigit: -		if (posParse >= posEnd) return false; -		ch = CharAt(posParse,param); -		Res = (ch >= 0x30 && ch <= 0x39); -		if (Res && Inc) posParse++; -		return Res; -	case ReNDigit: -		if (posParse >= posEnd) return false; -		ch = CharAt(posParse,param); -		Res = !(ch >= 0x30 && ch <= 0x39) && ch != '\r' && ch != '\n'; -		if (Res && Inc) posParse++; -		return Res; -	case ReWordSymb: -		if (posParse >= posEnd) return false; -		Res = IsWord(CharAt(posParse,param)); -		if (Res && Inc) posParse++; -		return Res; -	case ReNWordSymb: -		if (posParse >= posEnd) return false; -		ch = CharAt(posParse,param); -		Res = !IsWord(ch) && ch != '\r' && ch != '\n'; -		if (Res && Inc) posParse++; -		return Res; -	case ReWSpace: -		if (posParse >= posEnd) return false; -		ch = CharAt(posParse,param); -		Res = (ch == 0x20 || ch == '\t'); -		if (Res && Inc) posParse++; -		return Res; -	case ReNWSpace: -		if (posParse >= posEnd) return false; -		ch = CharAt(posParse,param); -		Res = !(ch == 0x20 || ch == '\t') && ch != '\r' && ch != '\n'; -		if (Res && Inc) posParse++; -		return Res; -	case ReUCase: -		if (posParse >= posEnd) return false; -		Res = IsUpperCase(CharAt(posParse,param)); -		if (Res && Inc) posParse++; -		return Res; -	case ReNUCase: -		if (posParse >= posEnd) return false; -		Res = IsLowerCase(CharAt(posParse,param)); -		if (Res && Inc) posParse++; -		return Res; -	case ReWBound: -		if (posParse >= posEnd) return true; -		ch = CharAt(posParse,param); -		return IsWord(CharAt(posParse,param)) && (posParse == posStart || !IsWord(CharAt(posParse-1,param))); -	case ReNWBound: -		if (posParse >= posEnd) return true; -		return !IsWord(CharAt(posParse,param)) && IsWord(CharAt(posParse-1,param)); -	case RePreNW: -		if (posParse >= posEnd) return true; -		return (posParse == posStart || !IsWord(CharAt(posParse-1,param))); -	case ReStart: -		Matches->s[0] = (posParse-posStart); -		return true; -	case ReEnd: -		Matches->e[0] = (posParse-posStart); -		return true; -	default: -		if ((Symb & 0xFF00) || posParse >= posEnd) return false; -		if (NoCase) { -			if (LowCase(CharAt(posParse,param)) != LowCase((char)Symb&0xFF)) return false; -		} else -			if (CharAt(posParse,param) != (char)(Symb&0xFF)) return false; -		if (Inc) posParse++; -		return true; -	}; -} - -bool PosRegExp::LowParseRe(PRegInfo &Next) { -	PRegInfo OrNext; -	int i,match,sv; -	int posStr; - -	switch(Next->Op) { -	case ReSymb: -		if (!CheckSymb(Next->un.Symb,true)) return false; -		break; -	case ReEmpty: -		break; -	case ReBkTrace: -		if (!posBkStr | !BkTrace) return false; -		sv = Next->un.Symb; -		posStr = posParse; -		for (i = BkTrace->s[sv]; i < BkTrace->e[sv]; i++) { -			if (CharAt(posStr,param) != CharAt(posBkStr+i,param) || posEnd == posStr) return false; -			posStr++; -		}; -		posParse = posStr; -		break; -	case ReBkBrack: -		sv = Next->un.Symb; -		posStr = posParse; -		if (Matches->s[sv] == -1 || Matches->e[sv] == -1) return false; -		for (i = Matches->s[sv]; i < Matches->e[sv]; i++) { -			if (CharAt(posStr,param) != CharAt(posStart+i,param) || posEnd == posStr) return false; -			posStr++; -		}; -		posParse = posStr; -		break; -	case ReBehind: -		sv = Next->s; -		posStr = posParse; -		posParse -= sv; -		if (!LowParse(Next->un.Param)) return false; -		posParse = posStr; -		break; -	case ReNBehind: -		sv = Next->s; -		posStr = posParse; -		posParse -= sv; -		if (LowParse(Next->un.Param)) return false; -		posParse = posStr; -		break; -	case ReAhead: -		posStr = posParse; -		if (!LowParse(Next->un.Param)) return false; -		posParse = posStr; -		break; -	case ReNAhead: -		posStr = posParse; -		if (LowParse(Next->un.Param)) return false; -		posParse = posStr; -		break; -	case ReEnum: -		if (posParse >= posEnd) return false; -		if (!Next->un.ChrClass->GetBit(CharAt(posParse,param))) return false; -		posParse++; -		break; -	case ReNEnum: -		if (posParse >= posEnd) return false; -		if (Next->un.ChrClass->GetBit(CharAt(posParse,param))) return false; -		posParse++; -		break; -	case ReBrackets: -		match = Next->s; -		sv = posParse-posStart; -		posStr = posParse; -		if (LowParse(Next->un.Param)) { -			if (match || (Matches->s[match] == -1)) -				Matches->s[match] = sv; -			if (match || (Matches->e[match] == -1)) -				Matches->e[match] = posParse-posStart; -			return true; -		}; -		posParse = posStr; -		return false; -	case ReMul: -		posStr = posParse; -		while (LowParse(Next->un.Param)); -		while(!LowCheckNext(Next) && posStr < posParse) posParse--; -		break; -	case ReNGMul: -		do { -			if (LowCheckNext(Next)) break; -		} while (LowParse(Next->un.Param)); -		break; -	case RePlus: -		posStr = posParse; -		match = false; -		while (LowParse(Next->un.Param)) -			match = true; -		if (!match) return false; -		while(!LowCheckNext(Next) && posStr < posParse) posParse--; -		break; -	case ReNGPlus: -		if (!LowParse(Next->un.Param)) return false; -		do { -			if (LowCheckNext(Next)) break; -		} while (LowParse(Next->un.Param)); -		break; -	case ReQuest: -		LowParse(Next->un.Param); -		break; -	case ReNGQuest: -		if (LowCheckNext(Next)) break; -		if (!LowParse(Next->un.Param)) return false; -		break; -	case ReOr: -		OrNext = Next; -		// posStr = posParse; -		if (LowParse(Next->un.Param)) { -			while (OrNext && OrNext->Op == ReOr) -				OrNext = OrNext->Next; -			/*if (!LowCheckNext(OrNext)){ -			  posParse = posStr; -			  OrNext = Next; -		};*/ -		}; -		Next = OrNext; -		break; -	case ReRangeN: -		posStr = posParse; -		i = 0; -		while (LowParse(Next->un.Param)) i++; // ??? -		do { -			if (i < Next->s) { -				posParse = posStr; -				return false; -			}; -			i--; -		} while(!LowCheckNext(Next) && posStr < posParse--); -		break; -	case ReNGRangeN: -		posStr = posParse; -		i = 0; -		while (LowParse(Next->un.Param)) { -			i++; -			if (i >= Next->s && LowCheckNext(Next)) // ??? -				break; -		}; -		if (i < Next->s) { -			posParse = posStr; -			return false; -		}; -		break; -	case ReRangeNM: -		posStr = posParse; -		i = 0; -		while (i < Next->s && LowParse(Next->un.Param)) // ??? -			i++; -		if (i < Next->s) { -			posParse = posStr; -			return false; -		}; -		while (i < Next->e && LowParse(Next->un.Param)) // ??? -			i++; - -		while(!LowCheckNext(Next)) { -			i--; -			posParse--; -			if (i < Next->s) { -				posParse = posStr; -				return false; -			}; -		}; -		break; -	case ReNGRangeNM: -		posStr = posParse; -		i = 0; -		while (i < Next->s && LowParse(Next->un.Param)) // ??? -			i++; -		if (i < Next->s) { -			posParse = posStr; -			return false; -		}; -		while(!LowCheckNext(Next)) { -			i++; -			if (!LowParse(Next->un.Param) || i > Next->e) { // ??? -				posParse = posStr; -				return false; -			}; -		}; -		break; -	}; -	return true; -}; - -bool PosRegExp::LowCheckNext(PRegInfo Re) { -	PRegInfo Next; -	int tmp = posParse; -	Next = Re; -	do { -		if (Next && Next->Op == ReOr) -			while (Next && Next->Op == ReOr) -				Next = Next->Next; -		if (Next->Next && !LowParse(Next->Next)) { -			posParse = tmp; -			Ok = false; -			return false; -		}; -		Next = Next->Parent; -	} while(Next); -	posParse = tmp; -	if (Ok != false) Ok = true; -	return true; -}; - -bool PosRegExp::LowParse(PRegInfo Re) { -	while(Re && posParse <= posEnd) { -		if (!LowParseRe(Re)) return false; -		if (Re) Re = Re->Next; -	}; -	return true; -}; - -bool PosRegExp::QuickCheck() { -	if (!NoMoves || !FirstChar) -		return true; -	switch(FirstChar) { -	case ReSoL: -		if (posParse != posStart) return false; -		return true; -	case ReWBound: -		return IsWord(CharAt(posParse,param)) && (posParse == posStart || !IsWord(CharAt(posParse-1,param))); -	default: -		if (NoCase && LowCase(CharAt(posParse,param)) != LowCase(FirstChar)) return false; -		if (!NoCase && CharAt(posParse,param) != (char)FirstChar) return false; -		return true; -	}; -}; - -bool PosRegExp::ParseRe(int posStr) { -	if (Error) return false; - -	posParse = posStr; -	if (!QuickCheck()) return false; - -	for (int i = 0; i < MatchesNum; i++) -		Matches->s[i] = Matches->e[i] = -1; -	Matches->CurMatch = CurMatch; - -	Ok = -1; -	//try{ -	do { -		if (!LowParse(Info)) { -			if (NoMoves) return false; -		} else -			return true; -		posParse = ++posStr; -	} while(posParse != posEnd+1); -	return false; -	//}__except(){ -	//  return true; -	//}; -} -; - -bool PosRegExp::Parse(int posStr,int posSol, int posEol, PMatches Mtch, int Moves) { -	if (!this) return false; - -	bool s = NoMoves; -	if (Moves != -1) NoMoves = Moves!=0; -	posStart = posSol; -	posEnd   = posEol; -	Matches = Mtch; -	bool r = ParseRe(posStr); -	NoMoves = s; -	return r; -}; - -bool PosRegExp::Parse(int posStr, int posStop, PMatches Mtch) { -	if (!this) return false; -	posStart = posStr; -	posEnd = posStop; -	Matches = Mtch; -	return ParseRe(posStr); -}; - -bool PosRegExp::SetNoMoves(bool Moves) { -	NoMoves = Moves; -	return true; -}; - -bool PosRegExp::SetBkTrace(int posStr,PMatches Trace) { -	BkTrace = Trace; -	posBkStr = posStr; -	return true; -}; - -#define EVAL_MATCHES 16 -#define EVAL_CHUNKSIZE 256 - -#define EVAL_LOWERCASE 1 -#define EVAL_UPPERCASE 2 -#define EVAL_LOWERCASE_NEXT 4 -#define EVAL_UPPERCASE_NEXT 8 - -bool PosRegExp::Evaluate(char *Expr, int posStr, PMatches Mtch, char **Res) { -	int length, -	newlength, -	chunklength, -	value, -	size, -	src, -	end; -	unsigned flag; -	char ch, -	*dest, -	*pool; - -	size = EVAL_CHUNKSIZE; -	pool = (char*) malloc (size); -	dest = pool; -	length = 0; -	flag = 0; -	while (*Expr) { -		switch (ch = *Expr++) { -		case '\\': -			switch (ch = *Expr++) { -			case 'A': -			case 'B': -			case 'C': -			case 'D': -			case 'E': -			case 'F': -				ch -= ('A' - '0'); -			case '0': -			case '1': -			case '2': -			case '3': -			case '4': -			case '5': -			case '6': -			case '7': -			case '8': -			case '9': -				value = ch - '0'; -				if (Mtch->s[value] != -1 && value < EVAL_MATCHES) { -					chunklength = Mtch->e[value] - Mtch->s[value]; -					if (chunklength) { -						newlength = chunklength + length; -						if (newlength > size) { -							do -								size += EVAL_CHUNKSIZE; -							while (size < newlength); -							pool = (char*) realloc (pool, size); -							dest = pool + length; -						} -						length = newlength; -						src = posStr + Mtch->s[value]; -						end = posStr + Mtch->e[value]; -						if (flag & EVAL_UPPERCASE) { -							if (flag & EVAL_LOWERCASE_NEXT) { -								*dest++ = tolower (CharAt(src++,param)); -								flag &= ~EVAL_LOWERCASE_NEXT; -							} -							while (src < end) -								*dest++ = toupper (CharAt(src++,param)); -						} else if (flag & EVAL_LOWERCASE) { -							if (flag & EVAL_UPPERCASE_NEXT) { -								*dest++ = toupper (CharAt(src++,param)); -								flag &= ~EVAL_UPPERCASE_NEXT; -							} -							while (src < end) -								*dest++ = tolower (CharAt(src++,param)); -						} else { -							if (flag & EVAL_LOWERCASE_NEXT) { -								*dest++ = tolower (CharAt(src++,param)); -								flag &= ~EVAL_LOWERCASE_NEXT; -							} else if (flag & EVAL_UPPERCASE_NEXT) { -								*dest++ = toupper (CharAt(src++,param)); -								flag &= ~EVAL_UPPERCASE_NEXT; -							} -							while (src < end) -								*dest++ = CharAt(src++,param); -						} -					} -				} else -					goto error; -				continue; -			case '\0': -				goto error; -			case 'r': -				ch = '\r'; -				break; -			case 'n': -				ch = '\n'; -				break; -			case 'b': -				ch = '\b'; -				break; -			case 'a': -				ch = '\a'; -				break; -			case 't': -				ch = '\t'; -				break; -			case 'U': -				flag |= EVAL_UPPERCASE; -				continue; -			case 'u': -				flag |= EVAL_UPPERCASE_NEXT; -				continue; -			case 'L': -				flag |= EVAL_LOWERCASE; -				continue; -			case 'l': -				flag |= EVAL_LOWERCASE_NEXT; -				continue; -			case 'Q': -			case 'q': -				flag &= ~(EVAL_UPPERCASE | EVAL_LOWERCASE); -				continue; -			case 'x': -				{ -					if (!*Expr) -						goto error; -					value = toupper (*Expr) - '0'; -					if (value > 9) -						value = value + '0' - 'A' + 10; -					if (value > 15) -						goto error; -					ch = value << 4; -					Expr++; -					if (!*Expr) -						goto error; -					value = toupper (*Expr) - '0'; -					if (value > 9) -						value = value + '0' - 'A' + 10; -					if (value > 15) -						goto error; -					Expr++; -					ch |= value; -					break; -				} -			case 'd': -				{ -					if (!*Expr) -						goto error; -					value = toupper (*Expr) - '0'; -					if (value > 9) -						goto error; -					ch = value * 100; -					Expr++; -					if (!*Expr) -						goto error; -					value = toupper (*Expr) - '0'; -					if (value > 9) -						goto error; -					ch += value * 10; -					Expr++; -					if (!*Expr) -						goto error; -					value = toupper (*Expr) - '0'; -					if (value > 9) -						goto error; -					ch += value; -					Expr++; -					break; -				} -			case 'o': -				{ -					if (!*Expr) -						goto error; -					value = toupper (*Expr) - '0'; -					if (value > 9) -						goto error; -					ch = value << 6; -					Expr++; -					if (!*Expr) -						goto error; -					value = toupper (*Expr) - '0'; -					if (value > 9) -						goto error; -					ch += value << 3; -					Expr++; -					if (!*Expr) -						goto error; -					value = toupper (*Expr) - '0'; -					if (value > 9) -						goto error; -					ch |= value; -					Expr++; -					/* break; */ -				} -				/* default: -					break; */ -			} -		default: -			if (++length > size) { -				do -					size += EVAL_CHUNKSIZE; -				while (size < length); -				pool = (char*) realloc (pool, size); -				dest = pool + length - 1; -			} -			if (flag & EVAL_LOWERCASE_NEXT) { -				*dest++ = tolower (ch); -				flag &= ~EVAL_LOWERCASE_NEXT; -			} else if (flag & EVAL_UPPERCASE_NEXT) { -				*dest++ = toupper (ch); -				flag &= ~EVAL_UPPERCASE_NEXT; -			} else if (flag & EVAL_UPPERCASE) -				*dest++ = toupper (ch); -			else if (flag & EVAL_LOWERCASE) -				*dest++ = tolower (ch); -			else -				*dest++ = ch; -		} -	} -	if (++length > size) { -		do -			size += EVAL_CHUNKSIZE; -		while (size < length); -		pool = (char*) realloc (pool, size); -		dest = pool + length - 1; -	} -	*dest = '\0'; -	*Res = pool; -	return true; -error: -	free (pool); -	return false; -} diff --git a/src/RESearch.cxx b/src/RESearch.cxx new file mode 100644 index 000000000..a9b0d264f --- /dev/null +++ b/src/RESearch.cxx @@ -0,0 +1,844 @@ +// Scintilla source code edit control +/** @file RESearch.cxx + ** Regular expression search library. + **/ + +/* + * regex - Regular expression pattern matching  and replacement + * + * By:  Ozan S. Yigit (oz) + *      Dept. of Computer Science + *      York University + * + * Translation to C++ by Neil Hodgson neilh@scintilla.org + * Removed all use of register. + * Converted to modern function prototypes. + * Put all global/static variables into an object so this code can be  + * used from multiple threads etc. + * + * These routines are the PUBLIC DOMAIN equivalents of regex + * routines as found in 4.nBSD UN*X, with minor extensions. + * + * These routines are derived from various implementations found + * in software tools books, and Conroy's grep. They are NOT derived + * from licensed/restricted software. + * For more interesting/academic/complicated implementations, + * see Henry Spencer's regexp routines, or GNU Emacs pattern + * matching module. + * + * Modification history: + * + * $Log$ + * Revision 1.1  2001/04/04 12:52:44  nyamatongwe + * Moved to public domain regular expresion implementation. + * + * Revision 1.4  1991/10/17  03:56:42  oz + * miscellaneous changes, small cleanups etc. + * + * Revision 1.3  1989/04/01  14:18:09  oz + * Change all references to a dfa: this is actually an nfa. + * + * Revision 1.2  88/08/28  15:36:04  oz + * Use a complement bitmap to represent NCL. + * This removes the need to have seperate  + * code in the PMatch case block - it is  + * just CCL code now. + *  + * Use the actual CCL code in the CLO + * section of PMatch. No need for a recursive + * PMatch call. + *  + * Use a bitmap table to set char bits in an + * 8-bit chunk. + *  + * Interfaces: + *      RESearch::Compile:        compile a regular expression into a NFA. + * + *			char *RESearch::Compile(s) + *			char *s; + * + *      RESearch::Execute:        execute the NFA to match a pattern. + * + *			int RESearch::Execute(s) + *			char *s; + * + *	RESearch::ModifyWord		change RESearch::Execute's understanding of what a "word" + *			looks like (for \< and \>) by adding into the + *			hidden word-syntax table. + * + *			void RESearch::ModifyWord(s) + *			char *s; + * + *      RESearch::Substitute:	substitute the matched portions in a new string. + * + *			int RESearch::Substitute(src, dst) + *			char *src; + *			char *dst; + * + *	re_fail:	failure routine for RESearch::Execute. + * + *			void re_fail(msg, op) + *			char *msg; + *			char op; + *   + * Regular Expressions: + * + *      [1]     char    matches itself, unless it is a special + *                      character (metachar): . \ [ ] * + ^ $ + * + *      [2]     .       matches any character. + * + *      [3]     \       matches the character following it, except + *			when followed by a left or right round bracket, + *			a digit 1 to 9 or a left or right angle bracket.  + *			(see [7], [8] and [9]) + *			It is used as an escape character for all  + *			other meta-characters, and itself. When used + *			in a set ([4]), it is treated as an ordinary + *			character. + * + *      [4]     [set]   matches one of the characters in the set. + *                      If the first character in the set is "^", + *                      it matches a character NOT in the set, i.e.  + *			complements the set. A shorthand S-E is  + *			used to specify a set of characters S upto  + *			E, inclusive. The special characters "]" and  + *			"-" have no special meaning if they appear  + *			as the first chars in the set. + *                      examples:        match: + * + *                              [a-z]    any lowercase alpha + * + *                              [^]-]    any char except ] and - + * + *                              [^A-Z]   any char except uppercase + *                                       alpha + * + *                              [a-zA-Z] any alpha + * + *      [5]     *       any regular expression form [1] to [4], followed by + *                      closure char (*) matches zero or more matches of + *                      that form. + * + *      [6]     +       same as [5], except it matches one or more. + * + *      [7]             a regular expression in the form [1] to [10], enclosed + *                      as \(form\) matches what form matches. The enclosure + *                      creates a set of tags, used for [8] and for + *                      pattern substution. The tagged forms are numbered + *			starting from 1. + * + *      [8]             a \ followed by a digit 1 to 9 matches whatever a + *                      previously tagged regular expression ([7]) matched. + * + *	[9]	\<	a regular expression starting with a \< construct + *		\>	and/or ending with a \> construct, restricts the + *			pattern matching to the beginning of a word, and/or + *			the end of a word. A word is defined to be a character + *			string beginning and/or ending with the characters + *			A-Z a-z 0-9 and _. It must also be preceded and/or + *			followed by any character outside those mentioned. + * + *      [10]            a composite regular expression xy where x and y + *                      are in the form [1] to [10] matches the longest + *                      match of x followed by a match for y. + * + *      [11]	^	a regular expression starting with a ^ character + *		$	and/or ending with a $ character, restricts the + *                      pattern matching to the beginning of the line, + *                      or the end of line. [anchors] Elsewhere in the + *			pattern, ^ and $ are treated as ordinary characters. + * + * + * Acknowledgements: + * + *	HCR's Hugh Redelmeier has been most helpful in various + *	stages of development. He convinced me to include BOW + *	and EOW constructs, originally invented by Rob Pike at + *	the University of Toronto. + * + * References: + *              Software tools			Kernighan & Plauger + *              Software tools in Pascal        Kernighan & Plauger + *              Grep [rsx-11 C dist]            David Conroy + *		ed - text editor		Un*x Programmer's Manual + *		Advanced editing on Un*x	B. W. Kernighan + *		RegExp routines			Henry Spencer + * + * Notes: + * + *	This implementation uses a bit-set representation for character + *	classes for speed and compactness. Each character is represented  + *	by one bit in a 128-bit block. Thus, CCL always takes a  + *	constant 16 bytes in the internal nfa, and RESearch::Execute does a single + *	bit comparison to locate the character in the set. + * + * Examples: + * + *	pattern:	foo*.* + *	compile:	CHR f CHR o CLO CHR o END CLO ANY END END + *	matches:	fo foo fooo foobar fobar foxx ... + * + *	pattern:	fo[ob]a[rz]	 + *	compile:	CHR f CHR o CCL bitset CHR a CCL bitset END + *	matches:	fobar fooar fobaz fooaz + * + *	pattern:	foo\\+ + *	compile:	CHR f CHR o CHR o CHR \ CLO CHR \ END END + *	matches:	foo\ foo\\ foo\\\  ... + * + *	pattern:	\(foo\)[1-3]\1	(same as foo[1-3]foo) + *	compile:	BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END + *	matches:	foo1foo foo2foo foo3foo + * + *	pattern:	\(fo.*\)-\1 + *	compile:	BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END + *	matches:	foo-foo fo-fo fob-fob foobar-foobar ... + */ + +#include "RESearch.h" + +#define EXTEND + +#define OKP     1 +#define NOP     0 + +#define CHR     1 +#define ANY     2 +#define CCL     3 +#define BOL     4 +#define EOL     5 +#define BOT     6 +#define EOT     7 +#define BOW	8 +#define EOW	9 +#define REF     10 +#define CLO     11 + +#define END     0 + +/* + * The following defines are not meant to be changeable. + * They are for readability only. + */ +#define BLKIND	0170 +#define BITIND	07 + +#define ASCIIB	0177 + +const char bitarr[] = {1,2,4,8,16,32,64,'\200'}; + +#define badpat(x)	(*nfa = END, x) +  +RESearch::RESearch() { +	sta = NOP;               	/* status of lastpat */ +	bol = 0; +	for (int i=0; i<BITBLK; i++) +		bittab[i] = 0; +} + +void RESearch::ChSet(char c) { +	bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND]; +} + +const char *RESearch::Compile(char *pat) { +	char *p;               /* pattern pointer   */ +	char *mp=nfa;          /* nfa pointer       */ +	char *lp;              /* saved pointer..   */ +	char *sp=nfa;          /* another one..     */ + +	int tagi = 0;          /* tag stack index   */ +	int tagc = 1;          /* actual tag count  */ + +	int n; +	char mask;		/* xor mask -CCL/NCL */ +	int c1, c2; +		 +	if (!pat || !*pat) +		if (sta) +			return 0; +		else +			return badpat("No previous regular expression"); +	sta = NOP; + +	for (p = pat; *p; p++) { +		lp = mp; +		switch(*p) { + +		case '.':               /* match any char..  */ +			*mp++ = ANY; +			break; + +		case '^':               /* match beginning.. */ +			if (p == pat) +				*mp++ = BOL; +			else { +				*mp++ = CHR; +				*mp++ = *p; +			} +			break; + +		case '$':               /* match endofline.. */ +			if (!*(p+1)) +				*mp++ = EOL; +			else { +				*mp++ = CHR; +				*mp++ = *p; +			} +			break; + +		case '[':               /* match char class..*/ +			*mp++ = CCL; + +			if (*++p == '^') { +				mask = '\377';	 +				p++; +			} +			else +				mask = 0; + +			if (*p == '-')		/* real dash */ +				ChSet(*p++); +			if (*p == ']')		/* real brac */ +				ChSet(*p++); +			while (*p && *p != ']') { +				if (*p == '-' && *(p+1) && *(p+1) != ']') { +					p++; +					c1 = *(p-2) + 1; +					c2 = *p++; +					while (c1 <= c2) +						ChSet(static_cast<char>(c1++)); +				} +#ifdef EXTEND +				else if (*p == '\\' && *(p+1)) { +					p++; +					ChSet(*p++); +				} +#endif +				else +					ChSet(*p++); +			} +			if (!*p) +				return badpat("Missing ]"); + +			for (n = 0; n < BITBLK; bittab[n++] = (char) 0) +				*mp++ = static_cast<char>(mask ^ bittab[n]); +	 +			break; + +		case '*':               /* match 0 or more.. */ +		case '+':               /* match 1 or more.. */ +			if (p == pat) +				return badpat("Empty closure"); +			lp = sp;		/* previous opcode */ +			if (*lp == CLO)		/* equivalence..   */ +				break; +			switch(*lp) { + +			case BOL: +			case BOT: +			case EOT: +			case BOW: +			case EOW: +			case REF: +				return badpat("Illegal closure"); +			default: +				break; +			} + +			if (*p == '+') +				for (sp = mp; lp < sp; lp++) +					*mp++ = *lp; + +			*mp++ = END; +			*mp++ = END; +			sp = mp; +			while (--mp > lp) +				*mp = mp[-1]; +			*mp = CLO; +			mp = sp; +			break; + +		case '\\':              /* tags, backrefs .. */ +			switch(*++p) { + +			case '(': +				if (tagc < MAXTAG) { +					tagstk[++tagi] = tagc; +					*mp++ = BOT; +					*mp++ = static_cast<char>(tagc++); +				} +				else +					return badpat("Too many \\(\\) pairs"); +				break; +			case ')': +				if (*sp == BOT) +					return badpat("Null pattern inside \\(\\)"); +				if (tagi > 0) { +					*mp++ = static_cast<char>(EOT); +					*mp++ = static_cast<char>(tagstk[tagi--]); +				} +				else +					return badpat("Unmatched \\)"); +				break; +			case '<': +				*mp++ = BOW; +				break; +			case '>': +				if (*sp == BOW) +					return badpat("Null pattern inside \\<\\>"); +				*mp++ = EOW; +				break; +			case '1': +			case '2': +			case '3': +			case '4': +			case '5': +			case '6': +			case '7': +			case '8': +			case '9': +				n = *p-'0'; +				if (tagi > 0 && tagstk[tagi] == n) +					return badpat("Cyclical reference"); +				if (tagc > n) { +					*mp++ = static_cast<char>(REF); +					*mp++ = static_cast<char>(n); +				} +				else +					return badpat("Undetermined reference"); +				break; +#ifdef EXTEND +			case 'a': +				*mp++ = CHR; +				*mp++ = '\a'; +				break; +			case 'b': +				*mp++ = CHR; +				*mp++ = '\b'; +				break; +			case 'n': +				*mp++ = CHR; +				*mp++ = '\n'; +				break; +			case 'f': +				*mp++ = CHR; +				*mp++ = '\f'; +				break; +			case 'r': +				*mp++ = CHR; +				*mp++ = '\r'; +				break; +			case 't': +				*mp++ = CHR; +				*mp++ = '\t'; +				break; +			case 'v': +				*mp++ = CHR; +				*mp++ = '\v'; +				break; +#endif +			default: +				*mp++ = CHR; +				*mp++ = *p; +			} +			break; + +		default :               /* an ordinary char  */ +			*mp++ = CHR; +			*mp++ = *p; +			break; +		} +		sp = lp; +	} +	if (tagi > 0) +		return badpat("Unmatched \\("); +	*mp = END; +	sta = OKP; +	return 0; +} + +/* + * RESearch::Execute: + * 	execute nfa to find a match. + * + *	special cases: (nfa[0])	 + *		BOL + *			Match only once, starting from the + *			beginning. + *		CHR + *			First locate the character without + *			calling PMatch, and if found, call + *			PMatch for the remaining string. + *		END + *			RESearch::Compile failed, poor luser did not + *			check for it. Fail fast. + * + *	If a match is found, bopat[0] and eopat[0] are set + *	to the beginning and the end of the matched fragment, + *	respectively. + * + */ + +int RESearch::Execute(CharacterIndexer &ci, int lp) { +	char c; +	int ep = 0; +	char *ap = nfa; + +	bol = lp; +	failure = 0; + +	for (int i=0;i<MAXTAG;i++) { +		bopat[i] = NOTFOUND; +		eopat[i] = NOTFOUND; +	} + +	switch(*ap) { + +	case BOL:			/* anchored: match from BOL only */ +		ep = PMatch(ci, lp, ap); +		break; +	case CHR:			/* ordinary char: locate it fast */ +		c = *(ap+1); +		while (ci.CharAt(lp) && ci.CharAt(lp) != c) +			lp++; +		if (!ci.CharAt(lp))		/* if EOS, fail, else fall thru. */ +			return 0; +	default:			/* regular matching all the way. */ +		while (ci.CharAt(lp)) { +			ep = PMatch(ci, lp, ap); +			if (ep != NOTFOUND) +				break; +			lp++; +		} +		break; +	case END:			/* munged automaton. fail always */ +		return 0; +	} +	if (ep == NOTFOUND) +		return 0; + +	bopat[0] = lp; +	eopat[0] = ep; +	return 1; +} + +/*  + * PMatch: internal routine for the hard part + * + * 	This code is partly snarfed from an early grep written by + *	David Conroy. The backref and tag stuff, and various other + *	innovations are by oz. + * + *	special case optimizations: (nfa[n], nfa[n+1]) + *		CLO ANY + *			We KNOW .* will match everything upto the + *			end of line. Thus, directly go to the end of + *			line, without recursive PMatch calls. As in + *			the other closure cases, the remaining pattern + *			must be matched by moving backwards on the + *			string recursively, to find a match for xy + *			(x is ".*" and y is the remaining pattern) + *			where the match satisfies the LONGEST match for + *			x followed by a match for y. + *		CLO CHR + *			We can again scan the string forward for the + *			single char and at the point of failure, we + *			execute the remaining nfa recursively, same as + *			above. + * + *	At the end of a successful match, bopat[n] and eopat[n] + *	are set to the beginning and end of subpatterns matched + *	by tagged expressions (n = 1 to 9).	 + * + */ + +extern void re_fail(char *,char); + +/* + * character classification table for word boundary operators BOW + * and EOW. the reason for not using ctype macros is that we can + * let the user add into our own table. see RESearch::ModifyWord. This table + * is not in the bitset form, since we may wish to extend it in the + * future for other character classifications.  + * + *	TRUE for 0-9 A-Z a-z _ + */ +static char chrtyp[MAXCHR] = { +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  +	0, 0, 0, 0, 0, 0, 0, 0, 1, 1,  +	1, 1, 1, 1, 1, 1, 1, 1, 0, 0,  +	0, 0, 0, 0, 0, 1, 1, 1, 1, 1,  +	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  +	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  +	1, 0, 0, 0, 0, 1, 0, 1, 1, 1,  +	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  +	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  +	1, 1, 1, 0, 0, 0, 0, 0 +	}; + +#define inascii(x)	(0177&(x)) +#define iswordc(x) 	chrtyp[inascii(x)] +#define isinset(x,y) 	((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND]) + +/* + * skip values for CLO XXX to skip past the closure + */ + +#define ANYSKIP	2 	/* [CLO] ANY END ...	     */ +#define CHRSKIP	3	/* [CLO] CHR chr END ...     */ +#define CCLSKIP 18	/* [CLO] CCL 16bytes END ... */ + +int RESearch::PMatch(CharacterIndexer &ci, int lp, char *ap) { +	int op, c, n; +	int e;		/* extra pointer for CLO */ +	int bp;		/* beginning of subpat.. */ +	int ep;		/* ending of subpat..	 */ +	int are;			/* to save the line ptr. */ + +	while ((op = *ap++) != END) +		switch(op) { + +		case CHR: +			if (ci.CharAt(lp++) != *ap++) +				return NOTFOUND; +			break; +		case ANY: +			if (!ci.CharAt(lp++)) +				return NOTFOUND; +			break; +		case CCL: +			c = ci.CharAt(lp++); +			if (!isinset(ap,c)) +				return NOTFOUND; +			ap += BITBLK; +			break; +		case BOL: +			if (lp != bol) +				return NOTFOUND; +			break; +		case EOL: +			if (ci.CharAt(lp)) +				return NOTFOUND; +			break; +		case BOT: +			bopat[*ap++] = lp; +			break; +		case EOT: +			eopat[*ap++] = lp; +			break; + 		case BOW: +			if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp))) +				return NOTFOUND; +			break; +		case EOW: +			if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp))) +				return NOTFOUND; +			break; +		case REF: +			n = *ap++; +			bp = bopat[n]; +			ep = eopat[n]; +			while (bp < ep) +				if (ci.CharAt(bp++) != ci.CharAt(lp++)) +					return NOTFOUND; +			break; +		case CLO: +			are = lp; +			switch(*ap) { + +			case ANY: +				while (ci.CharAt(lp)) +					lp++; +				n = ANYSKIP; +				break; +			case CHR: +				c = *(ap+1); +				while (ci.CharAt(lp) && c == ci.CharAt(lp)) +					lp++; +				n = CHRSKIP; +				break; +			case CCL: +				while (((c = ci.CharAt(lp)) != 0) && isinset(ap+1,c)) +					lp++; +				n = CCLSKIP; +				break; +			default: +				failure = true; +				//re_fail("closure: bad nfa.", *ap); +				return NOTFOUND; +			} + +			ap += n; + +			while (lp >= are) { +				if ((e = PMatch(ci, lp, ap)) != NOTFOUND) +					return e; +				--lp; +			} +			return NOTFOUND; +		default: +			//re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op)); +			return NOTFOUND; +		} +	return lp; +} + +/* + * RESearch::ModifyWord: + *	add new characters into the word table to change RESearch::Execute's + *	understanding of what a word should look like. Note that we + *	only accept additions into the word definition. + * + *	If the string parameter is 0 or null string, the table is + *	reset back to the default containing A-Z a-z 0-9 _. [We use + *	the compact bitset representation for the default table] + */ + +static char deftab[16] = {	 +	0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',   +	'\376', '\377', '\377', 007  +};  + +void RESearch::ModifyWord(char *s) { +	int i; + +	if (!s || !*s) { +		for (i = 0; i < MAXCHR; i++) +			if (!isinset(deftab,i)) +				iswordc(i) = 0; +	} +	else +		while(*s) +			iswordc(*s++) = 1; +} + +/* + * RESearch::Substitute: + *	substitute the matched portions of the src in dst. + * + *	&	substitute the entire matched pattern. + * + *	\digit	substitute a subpattern, with the given	tag number. + *		Tags are numbered from 1 to 9. If the particular + *		tagged subpattern does not exist, null is substituted. + */ +int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) { +	char c; +	int  pin; +	int bp; +	int ep; + +	if (!*src || !bopat[0]) +		return 0; + +	while ((c = *src++) != 0) { +		switch(c) { + +		case '&': +			pin = 0; +			break; + +		case '\\': +			c = *src++; +			if (c >= '0' && c <= '9') { +				pin = c - '0'; +				break; +			} +			 +		default: +			*dst++ = c; +			continue; +		} + +		if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) { +			while (ci.CharAt(bp) && bp < ep) +				*dst++ = ci.CharAt(bp++); +			if (bp < ep) +				return 0; +		} +	} +	*dst = (char) 0; +	return 1; +} +			 +#ifdef DEBUG +/* + * symbolic - produce a symbolic dump of the nfa + */ +void symbolic(char *s) { +	printf("pattern: %s\n", s); +	printf("nfacode:\n"); +	nfadump(nfa); +} + +static void nfadump(char *ap) { +	int n; + +	while (*ap != END) +		switch(*ap++) { +		case CLO: +			printf("CLOSURE"); +			nfadump(ap); +			switch(*ap) { +			case CHR: +				n = CHRSKIP; +				break; +			case ANY: +				n = ANYSKIP; +				break; +			case CCL: +				n = CCLSKIP; +				break; +			} +			ap += n; +			break; +		case CHR: +			printf("\tCHR %c\n",*ap++); +			break; +		case ANY: +			printf("\tANY .\n"); +			break; +		case BOL: +			printf("\tBOL -\n"); +			break; +		case EOL: +			printf("\tEOL -\n"); +			break; +		case BOT: +			printf("BOT: %d\n",*ap++); +			break; +		case EOT: +			printf("EOT: %d\n",*ap++); +			break; +		case BOW: +			printf("BOW\n"); +			break; +		case EOW: +			printf("EOW\n"); +			break; +		case REF: +			printf("REF: %d\n",*ap++); +			break; +		case CCL: +			printf("\tCCL ["); +			for (n = 0; n < MAXCHR; n++) +				if (isinset(ap,(char)n)) { +					if (n < ' ') +						printf("^%c", n ^ 0x040); +					else +						printf("%c", n); +				} +			printf("]\n"); +			ap += BITBLK; +			break; +		default: +			printf("bad nfa. opcode %o\n", ap[-1]); +			exit(1); +			break; +		} +} +#endif diff --git a/src/RESearch.h b/src/RESearch.h new file mode 100644 index 000000000..f9cf7fdc5 --- /dev/null +++ b/src/RESearch.h @@ -0,0 +1,54 @@ +// Scintilla source code edit control +/** @file RESearch.h + ** Interface to the regular expression search library. + **/ +// Written by Neil Hodgson <neilh@scintilla.org> +// Based on the work of Ozan S. Yigit. +// This file is in the public domain. + +#ifndef RESEARCH_H +#define RESEARCH_H + +/* + * The following defines are not meant to be changeable. + * They are for readability only. + */ +#define MAXCHR	128 +#define CHRBIT	8 +#define BITBLK	MAXCHR/CHRBIT + +class CharacterIndexer { +public:  +	virtual char CharAt(int index)=0; +}; + +class RESearch { + +public: +	RESearch(); +	void ChSet(char c); +	const char *Compile(char *pat); +	int Execute(CharacterIndexer &ci, int lp); +	void ModifyWord(char *s); +	int Substitute(CharacterIndexer &ci, char *src, char *dst); + +	enum {MAXTAG=10}; +	enum {MAXNFA=1024}; +	enum {NOTFOUND=-1}; + +	int bopat[MAXTAG]; +	int eopat[MAXTAG]; + +private: +	int PMatch(CharacterIndexer &ci, int lp, char *ap); + +	int bol; +	int  tagstk[MAXTAG];             /* subpat tag stack..*/ +	char nfa[MAXNFA];		/* automaton..       */ +	int sta; +	char bittab[BITBLK];		/* bit table for CCL */ +						/* pre-set bits...   */ +	int failure; +}; + +#endif | 
