diff options
author | nyamatongwe <unknown> | 2001-04-04 12:52:44 +0000 |
---|---|---|
committer | nyamatongwe <unknown> | 2001-04-04 12:52:44 +0000 |
commit | 93b871d1d8fbb076510e2c410ba57a0980a22ec8 (patch) | |
tree | 56576fc17d8737f5fbb591a89fd1e9fab4bd1a59 /src | |
parent | b338ed2a95f184263c1e1c7782ba3706fa05858c (diff) | |
download | scintilla-mirror-93b871d1d8fbb076510e2c410ba57a0980a22ec8.tar.gz |
Moved to public domain regular expresion implementation.
Diffstat (limited to 'src')
-rw-r--r-- | src/Document.cxx | 56 | ||||
-rw-r--r-- | src/Document.h | 2 | ||||
-rw-r--r-- | src/PosRegExp.cxx | 1192 | ||||
-rw-r--r-- | src/RESearch.cxx | 844 | ||||
-rw-r--r-- | src/RESearch.h | 54 |
5 files changed, 934 insertions, 1214 deletions
diff --git a/src/Document.cxx b/src/Document.cxx index ac9c381a0..ba7fa4911 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -16,7 +16,10 @@ #include "SVector.h" #include "CellBuffer.h" #include "Document.h" -#include "PosRegExp.h" +#include "RESearch.h" + +void re_fail(char *,char) { +} // This is ASCII specific but is safe with chars >= 0x80 inline bool isspacechar(unsigned char ch) { @@ -743,10 +746,6 @@ bool Document::IsWordAt(int start, int end) { return IsWordStartAt(start) && IsWordEndAt(end); } -char Document::DocCharAt(int pos, void *param) { - return reinterpret_cast<Document*>(param)->CharAt(pos); -} - // The comparison and case changing functions here assume ASCII // or extended ASCII such as the normal Windows code page. @@ -764,6 +763,22 @@ static inline char MakeLowerCase(char ch) { return static_cast<char>(ch - 'A' + 'a'); } +// Define a way for the Regular Expression code to access the document +class DocumentIndexer : public CharacterIndexer { + Document *pdoc; + int end; +public: + DocumentIndexer(Document *pdoc_, int end_) : + pdoc(pdoc_), end(end_) { + } + virtual char CharAt(int index) { + if (index < 0 || index >= end) + return 0; + else + return pdoc->CharAt(index); + } +}; + // Find text in document, supporting both forward and backward // searches (just pass minPos > maxPos to do a backward search) // Has not been tested with backwards DBCS searches yet. @@ -771,11 +786,11 @@ long Document::FindText(int minPos, int maxPos, const char *s, bool caseSensitive, bool word, bool wordStart, bool regExp, int *length) { if (regExp) { - char *pat = new char[strlen(s) + 4]; + char *pat = new char[strlen(s) + 1]; if (!pat) return -1; - strcpy(pat, "/"); + pat[0] = '\0'; int startPos; int endPos; @@ -791,25 +806,26 @@ long Document::FindText(int minPos, int maxPos, const char *s, startPos = MovePositionOutsideChar(startPos, 1, false); endPos = MovePositionOutsideChar(endPos, 1, false); + DocumentIndexer di(this, endPos); + RESearch re; strcat(pat, s); - strcat(pat, "/"); - PosRegExp re; - if (!caseSensitive) - strcat(pat, "i"); - if (!re.SetExpr(pat)) { + const char *errmsg = re.Compile(pat); + if (errmsg) { delete []pat; return -1; } - re.param = this; - re.CharAt = DocCharAt; - SMatches matches; - if (!re.Parse(startPos, 0, endPos, &matches)) { - delete []pat; - return -1; + // Find a variable in a property file: \$([A-Za-z0-9_.]+) + int success = re.Execute(di, startPos); + int pos = -1; + int lenRet = 0; + if (success) { + pos = re.bopat[0]; + lenRet = re.eopat[0] - re.bopat[0]; } - *length = matches.e[0] - matches.s[0]; delete []pat; - return matches.s[0]; + *length = lenRet; + return pos; + } else { bool forward = minPos <= maxPos; diff --git a/src/Document.h b/src/Document.h index cfb131608..b52036513 100644 --- a/src/Document.h +++ b/src/Document.h @@ -196,8 +196,6 @@ public: const WatcherWithUserData *GetWatchers() const { return watchers; } int GetLenWatchers() const { return lenWatchers; } - static char DocCharAt(int pos, void *param); - bool IsWordPartSeparator(char ch); int WordPartLeft(int pos); int WordPartRight(int pos); diff --git a/src/PosRegExp.cxx b/src/PosRegExp.cxx deleted file mode 100644 index eb240e2fb..000000000 --- a/src/PosRegExp.cxx +++ /dev/null @@ -1,1192 +0,0 @@ -#include <string.h> -#include <stdio.h> -#include <ctype.h> -#include <malloc.h> - -#include "PosRegExp.h" - -#ifdef _MSC_VER -#pragma warning(disable: 4244) -#endif -#ifdef __BORLANDC__ -// Too much effort to to cean this code up so just ignore badly -// bracketed initialisers, conversions losing significant digits, -// and values assigned but not used. -#pragma warn -pin -#pragma warn -sig -#pragma warn -aus -#endif -//Up: /[A-Z \x80-\x9f \xf0 ]/x -//Lo: /[a-z \xa0-\xaf \xe0-\xef \xf1 ]/x -//Wd: /[\d _ A-Z a-z \xa0-\xaf \xe0-\xf1 \x80-\x9f]/x -//* // Dos866 -SCharData UCData = {0x0, 0x0, 0x7fffffe, 0x0, 0xffffffff, 0x0, 0x0, 0x10000}, - LCData = {0x0, 0x0, 0x0, 0x7fffffe, 0x0, 0xffff, 0x0, 0x2ffff}, - WdData = {0x0, 0x3ff0000, 0x87fffffe, 0x7fffffe, 0xffffffff, 0xffff, 0x0, 0x3ffff}, - DigData = {0x0, 0x3ff0000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; -/*/ // cp1251 -SCharData UCData = {0x0, 0x0, 0x7fffffe, 0x0, 0x0, 0x0, 0xffffffff, 0x0}, - LCData = {0x0, 0x0, 0x0, 0x7fffffe, 0x0, 0x0, 0x0, 0xffffffff}, - WdData = {0x0, 0x3ff0000, 0x87fffffe, 0x7fffffe, 0x0, 0x0, 0xffffffff, 0xffffffff}, - DigData = {0x0, 0x3ff0000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; -//*/ - -/////////////////////////////////////////////// - -int GetNumber(int *str,int s,int e) { - int r = 1, num = 0; - if (e < s) return -1; - for(int i = e-1; i >= s; i--) { - if (str[i] > '9' || str[i] < '0') return -1; - num += (str[i] - 0x30)*r; - r *= 10; - }; - return num; - /* - char tmp[20]; - double Res; - if (e == s) return -1; - for (int i = s;i < e;i++) - tmp[i-s] = (char)Str[i]; - tmp[e-s] = 0; - GetNumber(tmp,&Res); - return (int)Res; - */ -}; - -bool IsDigit(char Symb) { - return DigData.GetBit(Symb); -}; -bool IsWord(char Symb) { - return WdData.GetBit(Symb); -}; -bool IsUpperCase(char Symb) { - return UCData.GetBit(Symb); -}; -bool IsLowerCase(char Symb) { - return LCData.GetBit(Symb); -}; -char LowCase(char Chr) { - if (UCData.GetBit(Chr)) - return Chr+0x20; - return Chr; -}; - -/////////////////////////////////////////////// - -SRegInfo::SRegInfo() { - Next = Parent = 0; - un.Param = 0; - Op = ReEmpty; -}; -SRegInfo::~SRegInfo() { - if (Next) delete Next; - if (un.Param) - switch(Op) { - case ReEnum: - case ReNEnum: - delete un.ChrClass; - break; - default: - if (Op > ReBlockOps && Op < ReSymbolOps || Op == ReBrackets) - delete un.Param; - break; - }; -}; - -/////////////////////////////////////////////// - -void SCharData::SetBit(unsigned char Bit) { - int p = Bit/8; - CArr[p] |= (1 << Bit%8); -}; -void SCharData::ClearBit(unsigned char Bit) { - int p = Bit/8; - CArr[p] &= ~(1 << Bit%8); -}; -bool SCharData::GetBit(unsigned char Bit) { - int p = (unsigned char)Bit/8; - return (CArr[p] & (1 << Bit%8))!=0; -}; - -///////////////////////////////////////////////////////////////// -////////////////////// RegExp Class /////////////////////////// -///////////////////////////////////////////////////////////////// - -PosRegExp::PosRegExp() { - Info = 0; - Exprn = 0; - NoMoves = false; - Error = true; - FirstChar = 0; - CurMatch = 0; -}; -PosRegExp::~PosRegExp() { - if (Info) delete Info; -}; - -bool PosRegExp::SetExpr(const char *Expr) { - if (!this) return false; - Error = true; - CurMatch = 0; - if (SetExprLow(Expr)) Error = false; - return !Error; -}; -bool PosRegExp::isok() { - return !Error; -}; - - -bool PosRegExp::SetExprLow(const char *Expr) { - int Len = strlen(Expr); - bool Ok = false; - int i,j,s = 0,pos,tmp; - int EnterBr = 0,EnterGr = 0,EnterFg = 0; - - if (Info) delete Info; - Info = new SRegInfo; - Exprn = new int[Len]; - - NoCase = false; - Extend = false; - if (Expr[0] == '/') s++; - else return false; - - for (i = Len; i > 0 && !Ok;i--) - if (Expr[i] == '/') { - Len = i-s; - Ok = true; - for (int j = i+1; Expr[j]; j++) { - if (Expr[j] == 'i') NoCase = true; - if (Expr[j] == 'x') Extend = true; - }; - }; - if (!Ok) return false; - - //////////////////////////////// - for (j = 0,pos = 0; j < Len; j++,pos++) { - if (Extend && Expr[j+s] == ' ') { - pos--; - continue; - }; - - Exprn[pos] = (int)(unsigned char)Expr[j+s]; - - if (Expr[j+s] == BackSlash) { - switch (Expr[j+s+1]) { - case 'd': - Exprn[pos] = ReDigit; - break; - case 'D': - Exprn[pos] = ReNDigit; - break; - case 'w': - Exprn[pos] = ReWordSymb; - break; - case 'W': - Exprn[pos] = ReNWordSymb; - break; - case 's': - Exprn[pos] = ReWSpace; - break; - case 'S': - Exprn[pos] = ReNWSpace; - break; - case 'u': - Exprn[pos] = ReUCase; - break; - case 'l': - Exprn[pos] = ReNUCase; - break; - case 't': - Exprn[pos] = '\t'; - break; - case 'n': - Exprn[pos] = '\n'; - break; - case 'r': - Exprn[pos] = '\r'; - break; - case 'b': - Exprn[pos] = ReWBound; - break; - case 'B': - Exprn[pos] = ReNWBound; - break; - case 'c': - Exprn[pos] = RePreNW; - break; - case 'm': - Exprn[pos] = ReStart; - break; - case 'M': - Exprn[pos] = ReEnd; - break; - case 'x': - tmp = toupper(Expr[j+s+2])-0x30; - tmp = (tmp>9?tmp-7:tmp)<<4; - tmp += (toupper(Expr[j+s+3])-0x30)>9?toupper(Expr[j+s+3])-0x37:(toupper(Expr[j+s+3])-0x30); - Exprn[pos] = tmp; - j+=2; - break; - case 'y': - tmp = Expr[j+s+2] - 0x30; - if (tmp >= 0 && tmp <= 9) { - if (tmp == 1) { - tmp = 10 + Expr[j+s+3] - 0x30; - if (tmp >= 10 && tmp <= 19) j++; - else tmp = 1; - }; - Exprn[pos] = ReBkTrace + tmp; - j++; - break; - }; - default: - tmp = Expr[j+s+1] - 0x30; - if (tmp >= 0 && tmp <= 9) { - if (tmp == 1) { - tmp = 10 + Expr[j+s+2] - 0x30; - if (tmp >= 10 && tmp <= 19) j++; - else tmp = 1; - }; - Exprn[pos] = ReBkBrack + tmp; - break; - } else - Exprn[pos] = Expr[j+s+1]; - break; - }; - j++; - continue; - }; - if (Expr[j+s] == ']') { - Exprn[pos] = ReEnumE; - if (EnterFg || !EnterGr) return false; - EnterGr--; - }; - if (Expr[j+s] == '-' && EnterGr) Exprn[pos] = ReFrToEnum; - - if (EnterGr) continue; - - if (Expr[j+s] == '[' && Expr[j+s+1] == '^') { - Exprn[pos] = ReNEnumS; - if (EnterFg) return false; - EnterGr++; - j++; - continue; - }; - if (Expr[j+s] == '*' && Expr[j+s+1] == '?') { - Exprn[pos] = ReNGMul; - j++; - continue; - }; - if (Expr[j+s] == '+' && Expr[j+s+1] == '?') { - Exprn[pos] = ReNGPlus; - j++; - continue; - }; - if (Expr[j+s] == '?' && Expr[j+s+1] == '?') { - Exprn[pos] = ReNGQuest; - j++; - continue; - }; - if (Expr[j+s] == '?' && Expr[j+s+1] == '#' && - Expr[j+s+2]>='0' && Expr[j+s+2]<='9') { - Exprn[pos] = ReBehind+Expr[j+s+2]-0x30; - j+=2; - continue; - }; - if (Expr[j+s] == '?' && Expr[j+s+1] == '~' && - Expr[j+s+2]>='0' && Expr[j+s+2]<='9') { - Exprn[pos] = ReNBehind+Expr[j+s+2]-0x30; - j+=2; - continue; - }; - if (Expr[j+s] == '?' && Expr[j+s+1] == '=') { - Exprn[pos] = ReAhead; - j++; - continue; - }; - if (Expr[j+s] == '?' && Expr[j+s+1] == '!') { - Exprn[pos] = ReNAhead; - j++; - continue; - }; - - if (Expr[j+s] == '(') { - Exprn[pos] = ReLBrack; - if (EnterFg) return false; - EnterBr++; - }; - if (Expr[j+s] == ')') { - Exprn[pos] = ReRBrack; - if (!EnterBr || EnterFg) return false; - EnterBr--; - }; - if (Expr[j+s] == '[') { - Exprn[pos] = ReEnumS; - if (EnterFg) return false; - EnterGr++; - }; - if (Expr[j+s] == '{') { - Exprn[pos] = ReRangeS; - if (EnterFg) return false; - EnterFg++; - }; - if (Expr[j+s] == '}' && Expr[j+s+1] == '?') { - Exprn[pos] = ReNGRangeE; - if (!EnterFg) return false; - EnterFg--; - j++; - continue; - }; - if (Expr[j+s] == '}') { - Exprn[pos] = ReRangeE; - if (!EnterFg) return false; - EnterFg--; - }; - - if (Expr[j+s] == '^') Exprn[pos] = ReSoL; - if (Expr[j+s] == '$') Exprn[pos] = ReEoL; - if (Expr[j+s] == '.') Exprn[pos] = ReAnyChr; - if (Expr[j+s] == '*') Exprn[pos] = ReMul; - if (Expr[j+s] == '+') Exprn[pos] = RePlus; - if (Expr[j+s] == '?') Exprn[pos] = ReQuest; - if (Expr[j+s] == '|') Exprn[pos] = ReOr; - }; - if (EnterGr || EnterBr || EnterFg) return false; - - Info->Op = ReBrackets; - Info->un.Param = new SRegInfo; - Info->s = CurMatch++; - - if (!SetStructs(Info->un.Param,0,pos)) return false; - Optimize(); - delete Exprn; - return true; -}; - -void PosRegExp::Optimize() { - PRegInfo Next = Info; - FirstChar = 0; - while(Next) { - if (Next->Op == ReBrackets || Next->Op == RePlus || Next->Op == ReNGPlus) { - Next = Next->un.Param; - continue; - }; - if (Next->Op == ReSymb) { - if (Next->un.Symb & 0xFF00 && Next->un.Symb != ReSoL && Next->un.Symb != ReWBound) - break; - FirstChar = Next->un.Symb; - break; - }; - break; - }; -}; - -bool PosRegExp::SetStructs(PRegInfo &re,int start,int end) { - PRegInfo Next,Prev,Prev2; - int comma,st,en,ng,i, j,k; - int EnterBr; - bool Add; - - if (end - start < 0) return false; - Next = re; - for (i = start; i < end; i++) { - Add = false; - // Ops - if (Exprn[i] > ReBlockOps && Exprn[i] < ReSymbolOps) { - Next->un.Param = 0; - Next->Op = (EOps)Exprn[i]; - Add = true; - }; - // {n,m} - if (Exprn[i] == ReRangeS) { - st = i; - en = -1; - comma = -1; - ng = 0; - for (j = i;j < end;j++) { - if (Exprn[j] == ReNGRangeE) { - en = j; - ng = 1; - break; - }; - if (Exprn[j] == ReRangeE) { - en = j; - break; - }; - if ((char)Exprn[j] == ',') - comma = j; - }; - if (en == -1) return false; - if (comma == -1) comma = en; - Next->s = (char)GetNumber(Exprn,st+1,comma); - if (comma != en) - Next->e = (char)GetNumber(Exprn,comma+1,en); - else - Next->e = Next->s; - Next->un.Param = 0; - Next->Op = ng?ReNGRangeNM:ReRangeNM; - if (en-comma == 1) { - Next->e = -1; - Next->Op = ng?ReNGRangeN:ReRangeN; - }; - i=j; - Add = true; - }; - // [] [^] - if (Exprn[i] == ReEnumS || Exprn[i] == ReNEnumS) { - Next->Op = (Exprn[i] == ReEnumS)?ReEnum:ReNEnum; - for (j = i+1;j < end;j++) { - if (Exprn[j] == ReEnumE) - break; - }; - if (j == end) return false; - Next->un.ChrClass = new SCharData; - memset(Next->un.ChrClass, 0, 32); - for (j = i+1;Exprn[j] != ReEnumE;j++) { - if (Exprn[j+1] == ReFrToEnum) { - for (i = (Exprn[j]&0xFF); i < (Exprn[j+2]&0xFF);i++) - Next->un.ChrClass->SetBit(i&0xFF); - j++; - continue; - }; - switch(Exprn[j]) { - case ReDigit: - for (k = 0x30;k < 0x40;k++) - if (IsDigit((char)k)) - Next->un.ChrClass->SetBit(k); - break; - case ReNDigit: - for (k = 0x30;k < 0x40;k++) - if (!IsDigit((char)k)) - Next->un.ChrClass->SetBit(k); - Next->un.ChrClass->ClearBit(0x0a); - Next->un.ChrClass->ClearBit(0x0d); - break; - case ReWordSymb: - for (k = 0;k < 256;k++) - if (IsWord((char)k)) - Next->un.ChrClass->SetBit(k); - break; - case ReNWordSymb: - for (k = 0;k < 256;k++) - if (!IsWord((char)k)) - Next->un.ChrClass->SetBit(k); - Next->un.ChrClass->ClearBit(0x0a); - Next->un.ChrClass->ClearBit(0x0d); - break; - case ReWSpace: - Next->un.ChrClass->SetBit(0x20); - Next->un.ChrClass->SetBit(0x09); - break; - case ReNWSpace: - memset(Next->un.ChrClass->IArr, 0xFF, 32); - Next->un.ChrClass->ClearBit(0x20); - Next->un.ChrClass->ClearBit(0x09); - Next->un.ChrClass->ClearBit(0x0a); - Next->un.ChrClass->ClearBit(0x0d); - break; - default: - if (!(Exprn[j]&0xFF00)) - Next->un.ChrClass->SetBit(Exprn[j]&0xFF); - break; - }; - }; - Add = true; - i=j; - }; - // ( ... ) - if (Exprn[i] == ReLBrack) { - EnterBr = 1; - for (j = i+1;j < end;j++) { - if (Exprn[j] == ReLBrack) EnterBr++; - if (Exprn[j] == ReRBrack) EnterBr--; - if (!EnterBr) break; - }; - if (EnterBr) return false; - Next->Op = ReBrackets; - Next->un.Param = new SRegInfo; - Next->un.Param->Parent = Next; - Next->s = CurMatch++; - if (CurMatch > MatchesNum) CurMatch = MatchesNum; - if (!SetStructs(Next->un.Param,i+1,j)) return false; - Add = true; - i=j; - }; - if ((Exprn[i]&0xFF00) == ReBkTrace) { - Next->Op = ReBkTrace; - Next->un.Symb = Exprn[i]&0xFF; - Add = true; - }; - if ((Exprn[i]&0xFF00) == ReBkBrack) { - Next->Op = ReBkBrack; - Next->un.Symb = Exprn[i]&0xFF; - Add = true; - }; - if ((Exprn[i]&0xFF00) == ReBehind) { - Next->Op = ReBehind; - Next->s = Exprn[i]&0xFF; - Add = true; - }; - if ((Exprn[i]&0xFF00) == ReNBehind) { - Next->Op = ReNBehind; - Next->s = Exprn[i]&0xFF; - Add = true; - }; - // Chars - if (Exprn[i] >= ReAnyChr && Exprn[i] < ReTemp || Exprn[i] < 0x100) { - Next->Op = ReSymb; - Next->un.Symb = Exprn[i]; - Add = true; - }; - // Next - if (Add && i != end-1) { - Next->Next = new SRegInfo; - Next->Next->Parent = Next->Parent; - Next = Next->Next; - }; - }; - Next = re; - Prev = Prev2 = 0; - while(Next) { - if (Next->Op > ReBlockOps && Next->Op < ReSymbolOps) { - if (!Prev) return false; - if (!Prev2) re = Next; - else Prev2->Next = Next; - //if (Prev->Op > ReBlockOps && Prev->Op < ReSymbolOps) return false; - Prev->Parent = Next; - Prev->Next = 0; - Next->un.Param = Prev; - Prev = Prev2; - }; - Prev2 = Prev; - Prev = Next; - Next = Next->Next; - }; - - return true; -}; - -///////////////////////////////////////////////////////////////// -///////////////////////// Parsing ///////////////////////////// -///////////////////////////////////////////////////////////////// - -bool PosRegExp::CheckSymb(int Symb,bool Inc) { - bool Res; - char ch; - switch(Symb) { - case ReAnyChr: - if (posParse >= posEnd) return false; - ch = CharAt(posParse,param); - Res = ch != '\r' && ch != '\n'; - if (Res && Inc) posParse++; - return Res; - case ReSoL: - if (posStart == posParse) - return true; - ch = CharAt(posParse-1,param); - return ch == '\n' || ch == '\r'; - case ReEoL: - if (posEnd == posParse) - return true; - ch = CharAt(posParse,param); - return ch == '\n' || ch == '\r'; - case ReDigit: - if (posParse >= posEnd) return false; - ch = CharAt(posParse,param); - Res = (ch >= 0x30 && ch <= 0x39); - if (Res && Inc) posParse++; - return Res; - case ReNDigit: - if (posParse >= posEnd) return false; - ch = CharAt(posParse,param); - Res = !(ch >= 0x30 && ch <= 0x39) && ch != '\r' && ch != '\n'; - if (Res && Inc) posParse++; - return Res; - case ReWordSymb: - if (posParse >= posEnd) return false; - Res = IsWord(CharAt(posParse,param)); - if (Res && Inc) posParse++; - return Res; - case ReNWordSymb: - if (posParse >= posEnd) return false; - ch = CharAt(posParse,param); - Res = !IsWord(ch) && ch != '\r' && ch != '\n'; - if (Res && Inc) posParse++; - return Res; - case ReWSpace: - if (posParse >= posEnd) return false; - ch = CharAt(posParse,param); - Res = (ch == 0x20 || ch == '\t'); - if (Res && Inc) posParse++; - return Res; - case ReNWSpace: - if (posParse >= posEnd) return false; - ch = CharAt(posParse,param); - Res = !(ch == 0x20 || ch == '\t') && ch != '\r' && ch != '\n'; - if (Res && Inc) posParse++; - return Res; - case ReUCase: - if (posParse >= posEnd) return false; - Res = IsUpperCase(CharAt(posParse,param)); - if (Res && Inc) posParse++; - return Res; - case ReNUCase: - if (posParse >= posEnd) return false; - Res = IsLowerCase(CharAt(posParse,param)); - if (Res && Inc) posParse++; - return Res; - case ReWBound: - if (posParse >= posEnd) return true; - ch = CharAt(posParse,param); - return IsWord(CharAt(posParse,param)) && (posParse == posStart || !IsWord(CharAt(posParse-1,param))); - case ReNWBound: - if (posParse >= posEnd) return true; - return !IsWord(CharAt(posParse,param)) && IsWord(CharAt(posParse-1,param)); - case RePreNW: - if (posParse >= posEnd) return true; - return (posParse == posStart || !IsWord(CharAt(posParse-1,param))); - case ReStart: - Matches->s[0] = (posParse-posStart); - return true; - case ReEnd: - Matches->e[0] = (posParse-posStart); - return true; - default: - if ((Symb & 0xFF00) || posParse >= posEnd) return false; - if (NoCase) { - if (LowCase(CharAt(posParse,param)) != LowCase((char)Symb&0xFF)) return false; - } else - if (CharAt(posParse,param) != (char)(Symb&0xFF)) return false; - if (Inc) posParse++; - return true; - }; -} - -bool PosRegExp::LowParseRe(PRegInfo &Next) { - PRegInfo OrNext; - int i,match,sv; - int posStr; - - switch(Next->Op) { - case ReSymb: - if (!CheckSymb(Next->un.Symb,true)) return false; - break; - case ReEmpty: - break; - case ReBkTrace: - if (!posBkStr | !BkTrace) return false; - sv = Next->un.Symb; - posStr = posParse; - for (i = BkTrace->s[sv]; i < BkTrace->e[sv]; i++) { - if (CharAt(posStr,param) != CharAt(posBkStr+i,param) || posEnd == posStr) return false; - posStr++; - }; - posParse = posStr; - break; - case ReBkBrack: - sv = Next->un.Symb; - posStr = posParse; - if (Matches->s[sv] == -1 || Matches->e[sv] == -1) return false; - for (i = Matches->s[sv]; i < Matches->e[sv]; i++) { - if (CharAt(posStr,param) != CharAt(posStart+i,param) || posEnd == posStr) return false; - posStr++; - }; - posParse = posStr; - break; - case ReBehind: - sv = Next->s; - posStr = posParse; - posParse -= sv; - if (!LowParse(Next->un.Param)) return false; - posParse = posStr; - break; - case ReNBehind: - sv = Next->s; - posStr = posParse; - posParse -= sv; - if (LowParse(Next->un.Param)) return false; - posParse = posStr; - break; - case ReAhead: - posStr = posParse; - if (!LowParse(Next->un.Param)) return false; - posParse = posStr; - break; - case ReNAhead: - posStr = posParse; - if (LowParse(Next->un.Param)) return false; - posParse = posStr; - break; - case ReEnum: - if (posParse >= posEnd) return false; - if (!Next->un.ChrClass->GetBit(CharAt(posParse,param))) return false; - posParse++; - break; - case ReNEnum: - if (posParse >= posEnd) return false; - if (Next->un.ChrClass->GetBit(CharAt(posParse,param))) return false; - posParse++; - break; - case ReBrackets: - match = Next->s; - sv = posParse-posStart; - posStr = posParse; - if (LowParse(Next->un.Param)) { - if (match || (Matches->s[match] == -1)) - Matches->s[match] = sv; - if (match || (Matches->e[match] == -1)) - Matches->e[match] = posParse-posStart; - return true; - }; - posParse = posStr; - return false; - case ReMul: - posStr = posParse; - while (LowParse(Next->un.Param)); - while(!LowCheckNext(Next) && posStr < posParse) posParse--; - break; - case ReNGMul: - do { - if (LowCheckNext(Next)) break; - } while (LowParse(Next->un.Param)); - break; - case RePlus: - posStr = posParse; - match = false; - while (LowParse(Next->un.Param)) - match = true; - if (!match) return false; - while(!LowCheckNext(Next) && posStr < posParse) posParse--; - break; - case ReNGPlus: - if (!LowParse(Next->un.Param)) return false; - do { - if (LowCheckNext(Next)) break; - } while (LowParse(Next->un.Param)); - break; - case ReQuest: - LowParse(Next->un.Param); - break; - case ReNGQuest: - if (LowCheckNext(Next)) break; - if (!LowParse(Next->un.Param)) return false; - break; - case ReOr: - OrNext = Next; - // posStr = posParse; - if (LowParse(Next->un.Param)) { - while (OrNext && OrNext->Op == ReOr) - OrNext = OrNext->Next; - /*if (!LowCheckNext(OrNext)){ - posParse = posStr; - OrNext = Next; - };*/ - }; - Next = OrNext; - break; - case ReRangeN: - posStr = posParse; - i = 0; - while (LowParse(Next->un.Param)) i++; // ??? - do { - if (i < Next->s) { - posParse = posStr; - return false; - }; - i--; - } while(!LowCheckNext(Next) && posStr < posParse--); - break; - case ReNGRangeN: - posStr = posParse; - i = 0; - while (LowParse(Next->un.Param)) { - i++; - if (i >= Next->s && LowCheckNext(Next)) // ??? - break; - }; - if (i < Next->s) { - posParse = posStr; - return false; - }; - break; - case ReRangeNM: - posStr = posParse; - i = 0; - while (i < Next->s && LowParse(Next->un.Param)) // ??? - i++; - if (i < Next->s) { - posParse = posStr; - return false; - }; - while (i < Next->e && LowParse(Next->un.Param)) // ??? - i++; - - while(!LowCheckNext(Next)) { - i--; - posParse--; - if (i < Next->s) { - posParse = posStr; - return false; - }; - }; - break; - case ReNGRangeNM: - posStr = posParse; - i = 0; - while (i < Next->s && LowParse(Next->un.Param)) // ??? - i++; - if (i < Next->s) { - posParse = posStr; - return false; - }; - while(!LowCheckNext(Next)) { - i++; - if (!LowParse(Next->un.Param) || i > Next->e) { // ??? - posParse = posStr; - return false; - }; - }; - break; - }; - return true; -}; - -bool PosRegExp::LowCheckNext(PRegInfo Re) { - PRegInfo Next; - int tmp = posParse; - Next = Re; - do { - if (Next && Next->Op == ReOr) - while (Next && Next->Op == ReOr) - Next = Next->Next; - if (Next->Next && !LowParse(Next->Next)) { - posParse = tmp; - Ok = false; - return false; - }; - Next = Next->Parent; - } while(Next); - posParse = tmp; - if (Ok != false) Ok = true; - return true; -}; - -bool PosRegExp::LowParse(PRegInfo Re) { - while(Re && posParse <= posEnd) { - if (!LowParseRe(Re)) return false; - if (Re) Re = Re->Next; - }; - return true; -}; - -bool PosRegExp::QuickCheck() { - if (!NoMoves || !FirstChar) - return true; - switch(FirstChar) { - case ReSoL: - if (posParse != posStart) return false; - return true; - case ReWBound: - return IsWord(CharAt(posParse,param)) && (posParse == posStart || !IsWord(CharAt(posParse-1,param))); - default: - if (NoCase && LowCase(CharAt(posParse,param)) != LowCase(FirstChar)) return false; - if (!NoCase && CharAt(posParse,param) != (char)FirstChar) return false; - return true; - }; -}; - -bool PosRegExp::ParseRe(int posStr) { - if (Error) return false; - - posParse = posStr; - if (!QuickCheck()) return false; - - for (int i = 0; i < MatchesNum; i++) - Matches->s[i] = Matches->e[i] = -1; - Matches->CurMatch = CurMatch; - - Ok = -1; - //try{ - do { - if (!LowParse(Info)) { - if (NoMoves) return false; - } else - return true; - posParse = ++posStr; - } while(posParse != posEnd+1); - return false; - //}__except(){ - // return true; - //}; -} -; - -bool PosRegExp::Parse(int posStr,int posSol, int posEol, PMatches Mtch, int Moves) { - if (!this) return false; - - bool s = NoMoves; - if (Moves != -1) NoMoves = Moves!=0; - posStart = posSol; - posEnd = posEol; - Matches = Mtch; - bool r = ParseRe(posStr); - NoMoves = s; - return r; -}; - -bool PosRegExp::Parse(int posStr, int posStop, PMatches Mtch) { - if (!this) return false; - posStart = posStr; - posEnd = posStop; - Matches = Mtch; - return ParseRe(posStr); -}; - -bool PosRegExp::SetNoMoves(bool Moves) { - NoMoves = Moves; - return true; -}; - -bool PosRegExp::SetBkTrace(int posStr,PMatches Trace) { - BkTrace = Trace; - posBkStr = posStr; - return true; -}; - -#define EVAL_MATCHES 16 -#define EVAL_CHUNKSIZE 256 - -#define EVAL_LOWERCASE 1 -#define EVAL_UPPERCASE 2 -#define EVAL_LOWERCASE_NEXT 4 -#define EVAL_UPPERCASE_NEXT 8 - -bool PosRegExp::Evaluate(char *Expr, int posStr, PMatches Mtch, char **Res) { - int length, - newlength, - chunklength, - value, - size, - src, - end; - unsigned flag; - char ch, - *dest, - *pool; - - size = EVAL_CHUNKSIZE; - pool = (char*) malloc (size); - dest = pool; - length = 0; - flag = 0; - while (*Expr) { - switch (ch = *Expr++) { - case '\\': - switch (ch = *Expr++) { - case 'A': - case 'B': - case 'C': - case 'D': - case 'E': - case 'F': - ch -= ('A' - '0'); - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - value = ch - '0'; - if (Mtch->s[value] != -1 && value < EVAL_MATCHES) { - chunklength = Mtch->e[value] - Mtch->s[value]; - if (chunklength) { - newlength = chunklength + length; - if (newlength > size) { - do - size += EVAL_CHUNKSIZE; - while (size < newlength); - pool = (char*) realloc (pool, size); - dest = pool + length; - } - length = newlength; - src = posStr + Mtch->s[value]; - end = posStr + Mtch->e[value]; - if (flag & EVAL_UPPERCASE) { - if (flag & EVAL_LOWERCASE_NEXT) { - *dest++ = tolower (CharAt(src++,param)); - flag &= ~EVAL_LOWERCASE_NEXT; - } - while (src < end) - *dest++ = toupper (CharAt(src++,param)); - } else if (flag & EVAL_LOWERCASE) { - if (flag & EVAL_UPPERCASE_NEXT) { - *dest++ = toupper (CharAt(src++,param)); - flag &= ~EVAL_UPPERCASE_NEXT; - } - while (src < end) - *dest++ = tolower (CharAt(src++,param)); - } else { - if (flag & EVAL_LOWERCASE_NEXT) { - *dest++ = tolower (CharAt(src++,param)); - flag &= ~EVAL_LOWERCASE_NEXT; - } else if (flag & EVAL_UPPERCASE_NEXT) { - *dest++ = toupper (CharAt(src++,param)); - flag &= ~EVAL_UPPERCASE_NEXT; - } - while (src < end) - *dest++ = CharAt(src++,param); - } - } - } else - goto error; - continue; - case '\0': - goto error; - case 'r': - ch = '\r'; - break; - case 'n': - ch = '\n'; - break; - case 'b': - ch = '\b'; - break; - case 'a': - ch = '\a'; - break; - case 't': - ch = '\t'; - break; - case 'U': - flag |= EVAL_UPPERCASE; - continue; - case 'u': - flag |= EVAL_UPPERCASE_NEXT; - continue; - case 'L': - flag |= EVAL_LOWERCASE; - continue; - case 'l': - flag |= EVAL_LOWERCASE_NEXT; - continue; - case 'Q': - case 'q': - flag &= ~(EVAL_UPPERCASE | EVAL_LOWERCASE); - continue; - case 'x': - { - if (!*Expr) - goto error; - value = toupper (*Expr) - '0'; - if (value > 9) - value = value + '0' - 'A' + 10; - if (value > 15) - goto error; - ch = value << 4; - Expr++; - if (!*Expr) - goto error; - value = toupper (*Expr) - '0'; - if (value > 9) - value = value + '0' - 'A' + 10; - if (value > 15) - goto error; - Expr++; - ch |= value; - break; - } - case 'd': - { - if (!*Expr) - goto error; - value = toupper (*Expr) - '0'; - if (value > 9) - goto error; - ch = value * 100; - Expr++; - if (!*Expr) - goto error; - value = toupper (*Expr) - '0'; - if (value > 9) - goto error; - ch += value * 10; - Expr++; - if (!*Expr) - goto error; - value = toupper (*Expr) - '0'; - if (value > 9) - goto error; - ch += value; - Expr++; - break; - } - case 'o': - { - if (!*Expr) - goto error; - value = toupper (*Expr) - '0'; - if (value > 9) - goto error; - ch = value << 6; - Expr++; - if (!*Expr) - goto error; - value = toupper (*Expr) - '0'; - if (value > 9) - goto error; - ch += value << 3; - Expr++; - if (!*Expr) - goto error; - value = toupper (*Expr) - '0'; - if (value > 9) - goto error; - ch |= value; - Expr++; - /* break; */ - } - /* default: - break; */ - } - default: - if (++length > size) { - do - size += EVAL_CHUNKSIZE; - while (size < length); - pool = (char*) realloc (pool, size); - dest = pool + length - 1; - } - if (flag & EVAL_LOWERCASE_NEXT) { - *dest++ = tolower (ch); - flag &= ~EVAL_LOWERCASE_NEXT; - } else if (flag & EVAL_UPPERCASE_NEXT) { - *dest++ = toupper (ch); - flag &= ~EVAL_UPPERCASE_NEXT; - } else if (flag & EVAL_UPPERCASE) - *dest++ = toupper (ch); - else if (flag & EVAL_LOWERCASE) - *dest++ = tolower (ch); - else - *dest++ = ch; - } - } - if (++length > size) { - do - size += EVAL_CHUNKSIZE; - while (size < length); - pool = (char*) realloc (pool, size); - dest = pool + length - 1; - } - *dest = '\0'; - *Res = pool; - return true; -error: - free (pool); - return false; -} diff --git a/src/RESearch.cxx b/src/RESearch.cxx new file mode 100644 index 000000000..a9b0d264f --- /dev/null +++ b/src/RESearch.cxx @@ -0,0 +1,844 @@ +// Scintilla source code edit control +/** @file RESearch.cxx + ** Regular expression search library. + **/ + +/* + * regex - Regular expression pattern matching and replacement + * + * By: Ozan S. Yigit (oz) + * Dept. of Computer Science + * York University + * + * Translation to C++ by Neil Hodgson neilh@scintilla.org + * Removed all use of register. + * Converted to modern function prototypes. + * Put all global/static variables into an object so this code can be + * used from multiple threads etc. + * + * These routines are the PUBLIC DOMAIN equivalents of regex + * routines as found in 4.nBSD UN*X, with minor extensions. + * + * These routines are derived from various implementations found + * in software tools books, and Conroy's grep. They are NOT derived + * from licensed/restricted software. + * For more interesting/academic/complicated implementations, + * see Henry Spencer's regexp routines, or GNU Emacs pattern + * matching module. + * + * Modification history: + * + * $Log$ + * Revision 1.1 2001/04/04 12:52:44 nyamatongwe + * Moved to public domain regular expresion implementation. + * + * Revision 1.4 1991/10/17 03:56:42 oz + * miscellaneous changes, small cleanups etc. + * + * Revision 1.3 1989/04/01 14:18:09 oz + * Change all references to a dfa: this is actually an nfa. + * + * Revision 1.2 88/08/28 15:36:04 oz + * Use a complement bitmap to represent NCL. + * This removes the need to have seperate + * code in the PMatch case block - it is + * just CCL code now. + * + * Use the actual CCL code in the CLO + * section of PMatch. No need for a recursive + * PMatch call. + * + * Use a bitmap table to set char bits in an + * 8-bit chunk. + * + * Interfaces: + * RESearch::Compile: compile a regular expression into a NFA. + * + * char *RESearch::Compile(s) + * char *s; + * + * RESearch::Execute: execute the NFA to match a pattern. + * + * int RESearch::Execute(s) + * char *s; + * + * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word" + * looks like (for \< and \>) by adding into the + * hidden word-syntax table. + * + * void RESearch::ModifyWord(s) + * char *s; + * + * RESearch::Substitute: substitute the matched portions in a new string. + * + * int RESearch::Substitute(src, dst) + * char *src; + * char *dst; + * + * re_fail: failure routine for RESearch::Execute. + * + * void re_fail(msg, op) + * char *msg; + * char op; + * + * Regular Expressions: + * + * [1] char matches itself, unless it is a special + * character (metachar): . \ [ ] * + ^ $ + * + * [2] . matches any character. + * + * [3] \ matches the character following it, except + * when followed by a left or right round bracket, + * a digit 1 to 9 or a left or right angle bracket. + * (see [7], [8] and [9]) + * It is used as an escape character for all + * other meta-characters, and itself. When used + * in a set ([4]), it is treated as an ordinary + * character. + * + * [4] [set] matches one of the characters in the set. + * If the first character in the set is "^", + * it matches a character NOT in the set, i.e. + * complements the set. A shorthand S-E is + * used to specify a set of characters S upto + * E, inclusive. The special characters "]" and + * "-" have no special meaning if they appear + * as the first chars in the set. + * examples: match: + * + * [a-z] any lowercase alpha + * + * [^]-] any char except ] and - + * + * [^A-Z] any char except uppercase + * alpha + * + * [a-zA-Z] any alpha + * + * [5] * any regular expression form [1] to [4], followed by + * closure char (*) matches zero or more matches of + * that form. + * + * [6] + same as [5], except it matches one or more. + * + * [7] a regular expression in the form [1] to [10], enclosed + * as \(form\) matches what form matches. The enclosure + * creates a set of tags, used for [8] and for + * pattern substution. The tagged forms are numbered + * starting from 1. + * + * [8] a \ followed by a digit 1 to 9 matches whatever a + * previously tagged regular expression ([7]) matched. + * + * [9] \< a regular expression starting with a \< construct + * \> and/or ending with a \> construct, restricts the + * pattern matching to the beginning of a word, and/or + * the end of a word. A word is defined to be a character + * string beginning and/or ending with the characters + * A-Z a-z 0-9 and _. It must also be preceded and/or + * followed by any character outside those mentioned. + * + * [10] a composite regular expression xy where x and y + * are in the form [1] to [10] matches the longest + * match of x followed by a match for y. + * + * [11] ^ a regular expression starting with a ^ character + * $ and/or ending with a $ character, restricts the + * pattern matching to the beginning of the line, + * or the end of line. [anchors] Elsewhere in the + * pattern, ^ and $ are treated as ordinary characters. + * + * + * Acknowledgements: + * + * HCR's Hugh Redelmeier has been most helpful in various + * stages of development. He convinced me to include BOW + * and EOW constructs, originally invented by Rob Pike at + * the University of Toronto. + * + * References: + * Software tools Kernighan & Plauger + * Software tools in Pascal Kernighan & Plauger + * Grep [rsx-11 C dist] David Conroy + * ed - text editor Un*x Programmer's Manual + * Advanced editing on Un*x B. W. Kernighan + * RegExp routines Henry Spencer + * + * Notes: + * + * This implementation uses a bit-set representation for character + * classes for speed and compactness. Each character is represented + * by one bit in a 128-bit block. Thus, CCL always takes a + * constant 16 bytes in the internal nfa, and RESearch::Execute does a single + * bit comparison to locate the character in the set. + * + * Examples: + * + * pattern: foo*.* + * compile: CHR f CHR o CLO CHR o END CLO ANY END END + * matches: fo foo fooo foobar fobar foxx ... + * + * pattern: fo[ob]a[rz] + * compile: CHR f CHR o CCL bitset CHR a CCL bitset END + * matches: fobar fooar fobaz fooaz + * + * pattern: foo\\+ + * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END + * matches: foo\ foo\\ foo\\\ ... + * + * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo) + * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END + * matches: foo1foo foo2foo foo3foo + * + * pattern: \(fo.*\)-\1 + * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END + * matches: foo-foo fo-fo fob-fob foobar-foobar ... + */ + +#include "RESearch.h" + +#define EXTEND + +#define OKP 1 +#define NOP 0 + +#define CHR 1 +#define ANY 2 +#define CCL 3 +#define BOL 4 +#define EOL 5 +#define BOT 6 +#define EOT 7 +#define BOW 8 +#define EOW 9 +#define REF 10 +#define CLO 11 + +#define END 0 + +/* + * The following defines are not meant to be changeable. + * They are for readability only. + */ +#define BLKIND 0170 +#define BITIND 07 + +#define ASCIIB 0177 + +const char bitarr[] = {1,2,4,8,16,32,64,'\200'}; + +#define badpat(x) (*nfa = END, x) + +RESearch::RESearch() { + sta = NOP; /* status of lastpat */ + bol = 0; + for (int i=0; i<BITBLK; i++) + bittab[i] = 0; +} + +void RESearch::ChSet(char c) { + bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND]; +} + +const char *RESearch::Compile(char *pat) { + char *p; /* pattern pointer */ + char *mp=nfa; /* nfa pointer */ + char *lp; /* saved pointer.. */ + char *sp=nfa; /* another one.. */ + + int tagi = 0; /* tag stack index */ + int tagc = 1; /* actual tag count */ + + int n; + char mask; /* xor mask -CCL/NCL */ + int c1, c2; + + if (!pat || !*pat) + if (sta) + return 0; + else + return badpat("No previous regular expression"); + sta = NOP; + + for (p = pat; *p; p++) { + lp = mp; + switch(*p) { + + case '.': /* match any char.. */ + *mp++ = ANY; + break; + + case '^': /* match beginning.. */ + if (p == pat) + *mp++ = BOL; + else { + *mp++ = CHR; + *mp++ = *p; + } + break; + + case '$': /* match endofline.. */ + if (!*(p+1)) + *mp++ = EOL; + else { + *mp++ = CHR; + *mp++ = *p; + } + break; + + case '[': /* match char class..*/ + *mp++ = CCL; + + if (*++p == '^') { + mask = '\377'; + p++; + } + else + mask = 0; + + if (*p == '-') /* real dash */ + ChSet(*p++); + if (*p == ']') /* real brac */ + ChSet(*p++); + while (*p && *p != ']') { + if (*p == '-' && *(p+1) && *(p+1) != ']') { + p++; + c1 = *(p-2) + 1; + c2 = *p++; + while (c1 <= c2) + ChSet(static_cast<char>(c1++)); + } +#ifdef EXTEND + else if (*p == '\\' && *(p+1)) { + p++; + ChSet(*p++); + } +#endif + else + ChSet(*p++); + } + if (!*p) + return badpat("Missing ]"); + + for (n = 0; n < BITBLK; bittab[n++] = (char) 0) + *mp++ = static_cast<char>(mask ^ bittab[n]); + + break; + + case '*': /* match 0 or more.. */ + case '+': /* match 1 or more.. */ + if (p == pat) + return badpat("Empty closure"); + lp = sp; /* previous opcode */ + if (*lp == CLO) /* equivalence.. */ + break; + switch(*lp) { + + case BOL: + case BOT: + case EOT: + case BOW: + case EOW: + case REF: + return badpat("Illegal closure"); + default: + break; + } + + if (*p == '+') + for (sp = mp; lp < sp; lp++) + *mp++ = *lp; + + *mp++ = END; + *mp++ = END; + sp = mp; + while (--mp > lp) + *mp = mp[-1]; + *mp = CLO; + mp = sp; + break; + + case '\\': /* tags, backrefs .. */ + switch(*++p) { + + case '(': + if (tagc < MAXTAG) { + tagstk[++tagi] = tagc; + *mp++ = BOT; + *mp++ = static_cast<char>(tagc++); + } + else + return badpat("Too many \\(\\) pairs"); + break; + case ')': + if (*sp == BOT) + return badpat("Null pattern inside \\(\\)"); + if (tagi > 0) { + *mp++ = static_cast<char>(EOT); + *mp++ = static_cast<char>(tagstk[tagi--]); + } + else + return badpat("Unmatched \\)"); + break; + case '<': + *mp++ = BOW; + break; + case '>': + if (*sp == BOW) + return badpat("Null pattern inside \\<\\>"); + *mp++ = EOW; + break; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + n = *p-'0'; + if (tagi > 0 && tagstk[tagi] == n) + return badpat("Cyclical reference"); + if (tagc > n) { + *mp++ = static_cast<char>(REF); + *mp++ = static_cast<char>(n); + } + else + return badpat("Undetermined reference"); + break; +#ifdef EXTEND + case 'a': + *mp++ = CHR; + *mp++ = '\a'; + break; + case 'b': + *mp++ = CHR; + *mp++ = '\b'; + break; + case 'n': + *mp++ = CHR; + *mp++ = '\n'; + break; + case 'f': + *mp++ = CHR; + *mp++ = '\f'; + break; + case 'r': + *mp++ = CHR; + *mp++ = '\r'; + break; + case 't': + *mp++ = CHR; + *mp++ = '\t'; + break; + case 'v': + *mp++ = CHR; + *mp++ = '\v'; + break; +#endif + default: + *mp++ = CHR; + *mp++ = *p; + } + break; + + default : /* an ordinary char */ + *mp++ = CHR; + *mp++ = *p; + break; + } + sp = lp; + } + if (tagi > 0) + return badpat("Unmatched \\("); + *mp = END; + sta = OKP; + return 0; +} + +/* + * RESearch::Execute: + * execute nfa to find a match. + * + * special cases: (nfa[0]) + * BOL + * Match only once, starting from the + * beginning. + * CHR + * First locate the character without + * calling PMatch, and if found, call + * PMatch for the remaining string. + * END + * RESearch::Compile failed, poor luser did not + * check for it. Fail fast. + * + * If a match is found, bopat[0] and eopat[0] are set + * to the beginning and the end of the matched fragment, + * respectively. + * + */ + +int RESearch::Execute(CharacterIndexer &ci, int lp) { + char c; + int ep = 0; + char *ap = nfa; + + bol = lp; + failure = 0; + + for (int i=0;i<MAXTAG;i++) { + bopat[i] = NOTFOUND; + eopat[i] = NOTFOUND; + } + + switch(*ap) { + + case BOL: /* anchored: match from BOL only */ + ep = PMatch(ci, lp, ap); + break; + case CHR: /* ordinary char: locate it fast */ + c = *(ap+1); + while (ci.CharAt(lp) && ci.CharAt(lp) != c) + lp++; + if (!ci.CharAt(lp)) /* if EOS, fail, else fall thru. */ + return 0; + default: /* regular matching all the way. */ + while (ci.CharAt(lp)) { + ep = PMatch(ci, lp, ap); + if (ep != NOTFOUND) + break; + lp++; + } + break; + case END: /* munged automaton. fail always */ + return 0; + } + if (ep == NOTFOUND) + return 0; + + bopat[0] = lp; + eopat[0] = ep; + return 1; +} + +/* + * PMatch: internal routine for the hard part + * + * This code is partly snarfed from an early grep written by + * David Conroy. The backref and tag stuff, and various other + * innovations are by oz. + * + * special case optimizations: (nfa[n], nfa[n+1]) + * CLO ANY + * We KNOW .* will match everything upto the + * end of line. Thus, directly go to the end of + * line, without recursive PMatch calls. As in + * the other closure cases, the remaining pattern + * must be matched by moving backwards on the + * string recursively, to find a match for xy + * (x is ".*" and y is the remaining pattern) + * where the match satisfies the LONGEST match for + * x followed by a match for y. + * CLO CHR + * We can again scan the string forward for the + * single char and at the point of failure, we + * execute the remaining nfa recursively, same as + * above. + * + * At the end of a successful match, bopat[n] and eopat[n] + * are set to the beginning and end of subpatterns matched + * by tagged expressions (n = 1 to 9). + * + */ + +extern void re_fail(char *,char); + +/* + * character classification table for word boundary operators BOW + * and EOW. the reason for not using ctype macros is that we can + * let the user add into our own table. see RESearch::ModifyWord. This table + * is not in the bitset form, since we may wish to extend it in the + * future for other character classifications. + * + * TRUE for 0-9 A-Z a-z _ + */ +static char chrtyp[MAXCHR] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 0, 0, 0, 0 + }; + +#define inascii(x) (0177&(x)) +#define iswordc(x) chrtyp[inascii(x)] +#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND]) + +/* + * skip values for CLO XXX to skip past the closure + */ + +#define ANYSKIP 2 /* [CLO] ANY END ... */ +#define CHRSKIP 3 /* [CLO] CHR chr END ... */ +#define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */ + +int RESearch::PMatch(CharacterIndexer &ci, int lp, char *ap) { + int op, c, n; + int e; /* extra pointer for CLO */ + int bp; /* beginning of subpat.. */ + int ep; /* ending of subpat.. */ + int are; /* to save the line ptr. */ + + while ((op = *ap++) != END) + switch(op) { + + case CHR: + if (ci.CharAt(lp++) != *ap++) + return NOTFOUND; + break; + case ANY: + if (!ci.CharAt(lp++)) + return NOTFOUND; + break; + case CCL: + c = ci.CharAt(lp++); + if (!isinset(ap,c)) + return NOTFOUND; + ap += BITBLK; + break; + case BOL: + if (lp != bol) + return NOTFOUND; + break; + case EOL: + if (ci.CharAt(lp)) + return NOTFOUND; + break; + case BOT: + bopat[*ap++] = lp; + break; + case EOT: + eopat[*ap++] = lp; + break; + case BOW: + if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp))) + return NOTFOUND; + break; + case EOW: + if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp))) + return NOTFOUND; + break; + case REF: + n = *ap++; + bp = bopat[n]; + ep = eopat[n]; + while (bp < ep) + if (ci.CharAt(bp++) != ci.CharAt(lp++)) + return NOTFOUND; + break; + case CLO: + are = lp; + switch(*ap) { + + case ANY: + while (ci.CharAt(lp)) + lp++; + n = ANYSKIP; + break; + case CHR: + c = *(ap+1); + while (ci.CharAt(lp) && c == ci.CharAt(lp)) + lp++; + n = CHRSKIP; + break; + case CCL: + while (((c = ci.CharAt(lp)) != 0) && isinset(ap+1,c)) + lp++; + n = CCLSKIP; + break; + default: + failure = true; + //re_fail("closure: bad nfa.", *ap); + return NOTFOUND; + } + + ap += n; + + while (lp >= are) { + if ((e = PMatch(ci, lp, ap)) != NOTFOUND) + return e; + --lp; + } + return NOTFOUND; + default: + //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op)); + return NOTFOUND; + } + return lp; +} + +/* + * RESearch::ModifyWord: + * add new characters into the word table to change RESearch::Execute's + * understanding of what a word should look like. Note that we + * only accept additions into the word definition. + * + * If the string parameter is 0 or null string, the table is + * reset back to the default containing A-Z a-z 0-9 _. [We use + * the compact bitset representation for the default table] + */ + +static char deftab[16] = { + 0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207', + '\376', '\377', '\377', 007 +}; + +void RESearch::ModifyWord(char *s) { + int i; + + if (!s || !*s) { + for (i = 0; i < MAXCHR; i++) + if (!isinset(deftab,i)) + iswordc(i) = 0; + } + else + while(*s) + iswordc(*s++) = 1; +} + +/* + * RESearch::Substitute: + * substitute the matched portions of the src in dst. + * + * & substitute the entire matched pattern. + * + * \digit substitute a subpattern, with the given tag number. + * Tags are numbered from 1 to 9. If the particular + * tagged subpattern does not exist, null is substituted. + */ +int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) { + char c; + int pin; + int bp; + int ep; + + if (!*src || !bopat[0]) + return 0; + + while ((c = *src++) != 0) { + switch(c) { + + case '&': + pin = 0; + break; + + case '\\': + c = *src++; + if (c >= '0' && c <= '9') { + pin = c - '0'; + break; + } + + default: + *dst++ = c; + continue; + } + + if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) { + while (ci.CharAt(bp) && bp < ep) + *dst++ = ci.CharAt(bp++); + if (bp < ep) + return 0; + } + } + *dst = (char) 0; + return 1; +} + +#ifdef DEBUG +/* + * symbolic - produce a symbolic dump of the nfa + */ +void symbolic(char *s) { + printf("pattern: %s\n", s); + printf("nfacode:\n"); + nfadump(nfa); +} + +static void nfadump(char *ap) { + int n; + + while (*ap != END) + switch(*ap++) { + case CLO: + printf("CLOSURE"); + nfadump(ap); + switch(*ap) { + case CHR: + n = CHRSKIP; + break; + case ANY: + n = ANYSKIP; + break; + case CCL: + n = CCLSKIP; + break; + } + ap += n; + break; + case CHR: + printf("\tCHR %c\n",*ap++); + break; + case ANY: + printf("\tANY .\n"); + break; + case BOL: + printf("\tBOL -\n"); + break; + case EOL: + printf("\tEOL -\n"); + break; + case BOT: + printf("BOT: %d\n",*ap++); + break; + case EOT: + printf("EOT: %d\n",*ap++); + break; + case BOW: + printf("BOW\n"); + break; + case EOW: + printf("EOW\n"); + break; + case REF: + printf("REF: %d\n",*ap++); + break; + case CCL: + printf("\tCCL ["); + for (n = 0; n < MAXCHR; n++) + if (isinset(ap,(char)n)) { + if (n < ' ') + printf("^%c", n ^ 0x040); + else + printf("%c", n); + } + printf("]\n"); + ap += BITBLK; + break; + default: + printf("bad nfa. opcode %o\n", ap[-1]); + exit(1); + break; + } +} +#endif diff --git a/src/RESearch.h b/src/RESearch.h new file mode 100644 index 000000000..f9cf7fdc5 --- /dev/null +++ b/src/RESearch.h @@ -0,0 +1,54 @@ +// Scintilla source code edit control +/** @file RESearch.h + ** Interface to the regular expression search library. + **/ +// Written by Neil Hodgson <neilh@scintilla.org> +// Based on the work of Ozan S. Yigit. +// This file is in the public domain. + +#ifndef RESEARCH_H +#define RESEARCH_H + +/* + * The following defines are not meant to be changeable. + * They are for readability only. + */ +#define MAXCHR 128 +#define CHRBIT 8 +#define BITBLK MAXCHR/CHRBIT + +class CharacterIndexer { +public: + virtual char CharAt(int index)=0; +}; + +class RESearch { + +public: + RESearch(); + void ChSet(char c); + const char *Compile(char *pat); + int Execute(CharacterIndexer &ci, int lp); + void ModifyWord(char *s); + int Substitute(CharacterIndexer &ci, char *src, char *dst); + + enum {MAXTAG=10}; + enum {MAXNFA=1024}; + enum {NOTFOUND=-1}; + + int bopat[MAXTAG]; + int eopat[MAXTAG]; + +private: + int PMatch(CharacterIndexer &ci, int lp, char *ap); + + int bol; + int tagstk[MAXTAG]; /* subpat tag stack..*/ + char nfa[MAXNFA]; /* automaton.. */ + int sta; + char bittab[BITBLK]; /* bit table for CCL */ + /* pre-set bits... */ + int failure; +}; + +#endif |