aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authornyamatongwe <unknown>2001-04-04 12:52:44 +0000
committernyamatongwe <unknown>2001-04-04 12:52:44 +0000
commit93b871d1d8fbb076510e2c410ba57a0980a22ec8 (patch)
tree56576fc17d8737f5fbb591a89fd1e9fab4bd1a59 /src
parentb338ed2a95f184263c1e1c7782ba3706fa05858c (diff)
downloadscintilla-mirror-93b871d1d8fbb076510e2c410ba57a0980a22ec8.tar.gz
Moved to public domain regular expresion implementation.
Diffstat (limited to 'src')
-rw-r--r--src/Document.cxx56
-rw-r--r--src/Document.h2
-rw-r--r--src/PosRegExp.cxx1192
-rw-r--r--src/RESearch.cxx844
-rw-r--r--src/RESearch.h54
5 files changed, 934 insertions, 1214 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index ac9c381a0..ba7fa4911 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -16,7 +16,10 @@
#include "SVector.h"
#include "CellBuffer.h"
#include "Document.h"
-#include "PosRegExp.h"
+#include "RESearch.h"
+
+void re_fail(char *,char) {
+}
// This is ASCII specific but is safe with chars >= 0x80
inline bool isspacechar(unsigned char ch) {
@@ -743,10 +746,6 @@ bool Document::IsWordAt(int start, int end) {
return IsWordStartAt(start) && IsWordEndAt(end);
}
-char Document::DocCharAt(int pos, void *param) {
- return reinterpret_cast<Document*>(param)->CharAt(pos);
-}
-
// The comparison and case changing functions here assume ASCII
// or extended ASCII such as the normal Windows code page.
@@ -764,6 +763,22 @@ static inline char MakeLowerCase(char ch) {
return static_cast<char>(ch - 'A' + 'a');
}
+// Define a way for the Regular Expression code to access the document
+class DocumentIndexer : public CharacterIndexer {
+ Document *pdoc;
+ int end;
+public:
+ DocumentIndexer(Document *pdoc_, int end_) :
+ pdoc(pdoc_), end(end_) {
+ }
+ virtual char CharAt(int index) {
+ if (index < 0 || index >= end)
+ return 0;
+ else
+ return pdoc->CharAt(index);
+ }
+};
+
// Find text in document, supporting both forward and backward
// searches (just pass minPos > maxPos to do a backward search)
// Has not been tested with backwards DBCS searches yet.
@@ -771,11 +786,11 @@ long Document::FindText(int minPos, int maxPos, const char *s,
bool caseSensitive, bool word, bool wordStart, bool regExp,
int *length) {
if (regExp) {
- char *pat = new char[strlen(s) + 4];
+ char *pat = new char[strlen(s) + 1];
if (!pat)
return -1;
- strcpy(pat, "/");
+ pat[0] = '\0';
int startPos;
int endPos;
@@ -791,25 +806,26 @@ long Document::FindText(int minPos, int maxPos, const char *s,
startPos = MovePositionOutsideChar(startPos, 1, false);
endPos = MovePositionOutsideChar(endPos, 1, false);
+ DocumentIndexer di(this, endPos);
+ RESearch re;
strcat(pat, s);
- strcat(pat, "/");
- PosRegExp re;
- if (!caseSensitive)
- strcat(pat, "i");
- if (!re.SetExpr(pat)) {
+ const char *errmsg = re.Compile(pat);
+ if (errmsg) {
delete []pat;
return -1;
}
- re.param = this;
- re.CharAt = DocCharAt;
- SMatches matches;
- if (!re.Parse(startPos, 0, endPos, &matches)) {
- delete []pat;
- return -1;
+ // Find a variable in a property file: \$([A-Za-z0-9_.]+)
+ int success = re.Execute(di, startPos);
+ int pos = -1;
+ int lenRet = 0;
+ if (success) {
+ pos = re.bopat[0];
+ lenRet = re.eopat[0] - re.bopat[0];
}
- *length = matches.e[0] - matches.s[0];
delete []pat;
- return matches.s[0];
+ *length = lenRet;
+ return pos;
+
} else {
bool forward = minPos <= maxPos;
diff --git a/src/Document.h b/src/Document.h
index cfb131608..b52036513 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -196,8 +196,6 @@ public:
const WatcherWithUserData *GetWatchers() const { return watchers; }
int GetLenWatchers() const { return lenWatchers; }
- static char DocCharAt(int pos, void *param);
-
bool IsWordPartSeparator(char ch);
int WordPartLeft(int pos);
int WordPartRight(int pos);
diff --git a/src/PosRegExp.cxx b/src/PosRegExp.cxx
deleted file mode 100644
index eb240e2fb..000000000
--- a/src/PosRegExp.cxx
+++ /dev/null
@@ -1,1192 +0,0 @@
-#include <string.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <malloc.h>
-
-#include "PosRegExp.h"
-
-#ifdef _MSC_VER
-#pragma warning(disable: 4244)
-#endif
-#ifdef __BORLANDC__
-// Too much effort to to cean this code up so just ignore badly
-// bracketed initialisers, conversions losing significant digits,
-// and values assigned but not used.
-#pragma warn -pin
-#pragma warn -sig
-#pragma warn -aus
-#endif
-//Up: /[A-Z \x80-\x9f \xf0 ]/x
-//Lo: /[a-z \xa0-\xaf \xe0-\xef \xf1 ]/x
-//Wd: /[\d _ A-Z a-z \xa0-\xaf \xe0-\xf1 \x80-\x9f]/x
-//* // Dos866
-SCharData UCData = {0x0, 0x0, 0x7fffffe, 0x0, 0xffffffff, 0x0, 0x0, 0x10000},
- LCData = {0x0, 0x0, 0x0, 0x7fffffe, 0x0, 0xffff, 0x0, 0x2ffff},
- WdData = {0x0, 0x3ff0000, 0x87fffffe, 0x7fffffe, 0xffffffff, 0xffff, 0x0, 0x3ffff},
- DigData = {0x0, 0x3ff0000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
-/*/ // cp1251
-SCharData UCData = {0x0, 0x0, 0x7fffffe, 0x0, 0x0, 0x0, 0xffffffff, 0x0},
- LCData = {0x0, 0x0, 0x0, 0x7fffffe, 0x0, 0x0, 0x0, 0xffffffff},
- WdData = {0x0, 0x3ff0000, 0x87fffffe, 0x7fffffe, 0x0, 0x0, 0xffffffff, 0xffffffff},
- DigData = {0x0, 0x3ff0000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
-//*/
-
-///////////////////////////////////////////////
-
-int GetNumber(int *str,int s,int e) {
- int r = 1, num = 0;
- if (e < s) return -1;
- for(int i = e-1; i >= s; i--) {
- if (str[i] > '9' || str[i] < '0') return -1;
- num += (str[i] - 0x30)*r;
- r *= 10;
- };
- return num;
- /*
- char tmp[20];
- double Res;
- if (e == s) return -1;
- for (int i = s;i < e;i++)
- tmp[i-s] = (char)Str[i];
- tmp[e-s] = 0;
- GetNumber(tmp,&Res);
- return (int)Res;
- */
-};
-
-bool IsDigit(char Symb) {
- return DigData.GetBit(Symb);
-};
-bool IsWord(char Symb) {
- return WdData.GetBit(Symb);
-};
-bool IsUpperCase(char Symb) {
- return UCData.GetBit(Symb);
-};
-bool IsLowerCase(char Symb) {
- return LCData.GetBit(Symb);
-};
-char LowCase(char Chr) {
- if (UCData.GetBit(Chr))
- return Chr+0x20;
- return Chr;
-};
-
-///////////////////////////////////////////////
-
-SRegInfo::SRegInfo() {
- Next = Parent = 0;
- un.Param = 0;
- Op = ReEmpty;
-};
-SRegInfo::~SRegInfo() {
- if (Next) delete Next;
- if (un.Param)
- switch(Op) {
- case ReEnum:
- case ReNEnum:
- delete un.ChrClass;
- break;
- default:
- if (Op > ReBlockOps && Op < ReSymbolOps || Op == ReBrackets)
- delete un.Param;
- break;
- };
-};
-
-///////////////////////////////////////////////
-
-void SCharData::SetBit(unsigned char Bit) {
- int p = Bit/8;
- CArr[p] |= (1 << Bit%8);
-};
-void SCharData::ClearBit(unsigned char Bit) {
- int p = Bit/8;
- CArr[p] &= ~(1 << Bit%8);
-};
-bool SCharData::GetBit(unsigned char Bit) {
- int p = (unsigned char)Bit/8;
- return (CArr[p] & (1 << Bit%8))!=0;
-};
-
-/////////////////////////////////////////////////////////////////
-////////////////////// RegExp Class ///////////////////////////
-/////////////////////////////////////////////////////////////////
-
-PosRegExp::PosRegExp() {
- Info = 0;
- Exprn = 0;
- NoMoves = false;
- Error = true;
- FirstChar = 0;
- CurMatch = 0;
-};
-PosRegExp::~PosRegExp() {
- if (Info) delete Info;
-};
-
-bool PosRegExp::SetExpr(const char *Expr) {
- if (!this) return false;
- Error = true;
- CurMatch = 0;
- if (SetExprLow(Expr)) Error = false;
- return !Error;
-};
-bool PosRegExp::isok() {
- return !Error;
-};
-
-
-bool PosRegExp::SetExprLow(const char *Expr) {
- int Len = strlen(Expr);
- bool Ok = false;
- int i,j,s = 0,pos,tmp;
- int EnterBr = 0,EnterGr = 0,EnterFg = 0;
-
- if (Info) delete Info;
- Info = new SRegInfo;
- Exprn = new int[Len];
-
- NoCase = false;
- Extend = false;
- if (Expr[0] == '/') s++;
- else return false;
-
- for (i = Len; i > 0 && !Ok;i--)
- if (Expr[i] == '/') {
- Len = i-s;
- Ok = true;
- for (int j = i+1; Expr[j]; j++) {
- if (Expr[j] == 'i') NoCase = true;
- if (Expr[j] == 'x') Extend = true;
- };
- };
- if (!Ok) return false;
-
- ////////////////////////////////
- for (j = 0,pos = 0; j < Len; j++,pos++) {
- if (Extend && Expr[j+s] == ' ') {
- pos--;
- continue;
- };
-
- Exprn[pos] = (int)(unsigned char)Expr[j+s];
-
- if (Expr[j+s] == BackSlash) {
- switch (Expr[j+s+1]) {
- case 'd':
- Exprn[pos] = ReDigit;
- break;
- case 'D':
- Exprn[pos] = ReNDigit;
- break;
- case 'w':
- Exprn[pos] = ReWordSymb;
- break;
- case 'W':
- Exprn[pos] = ReNWordSymb;
- break;
- case 's':
- Exprn[pos] = ReWSpace;
- break;
- case 'S':
- Exprn[pos] = ReNWSpace;
- break;
- case 'u':
- Exprn[pos] = ReUCase;
- break;
- case 'l':
- Exprn[pos] = ReNUCase;
- break;
- case 't':
- Exprn[pos] = '\t';
- break;
- case 'n':
- Exprn[pos] = '\n';
- break;
- case 'r':
- Exprn[pos] = '\r';
- break;
- case 'b':
- Exprn[pos] = ReWBound;
- break;
- case 'B':
- Exprn[pos] = ReNWBound;
- break;
- case 'c':
- Exprn[pos] = RePreNW;
- break;
- case 'm':
- Exprn[pos] = ReStart;
- break;
- case 'M':
- Exprn[pos] = ReEnd;
- break;
- case 'x':
- tmp = toupper(Expr[j+s+2])-0x30;
- tmp = (tmp>9?tmp-7:tmp)<<4;
- tmp += (toupper(Expr[j+s+3])-0x30)>9?toupper(Expr[j+s+3])-0x37:(toupper(Expr[j+s+3])-0x30);
- Exprn[pos] = tmp;
- j+=2;
- break;
- case 'y':
- tmp = Expr[j+s+2] - 0x30;
- if (tmp >= 0 && tmp <= 9) {
- if (tmp == 1) {
- tmp = 10 + Expr[j+s+3] - 0x30;
- if (tmp >= 10 && tmp <= 19) j++;
- else tmp = 1;
- };
- Exprn[pos] = ReBkTrace + tmp;
- j++;
- break;
- };
- default:
- tmp = Expr[j+s+1] - 0x30;
- if (tmp >= 0 && tmp <= 9) {
- if (tmp == 1) {
- tmp = 10 + Expr[j+s+2] - 0x30;
- if (tmp >= 10 && tmp <= 19) j++;
- else tmp = 1;
- };
- Exprn[pos] = ReBkBrack + tmp;
- break;
- } else
- Exprn[pos] = Expr[j+s+1];
- break;
- };
- j++;
- continue;
- };
- if (Expr[j+s] == ']') {
- Exprn[pos] = ReEnumE;
- if (EnterFg || !EnterGr) return false;
- EnterGr--;
- };
- if (Expr[j+s] == '-' && EnterGr) Exprn[pos] = ReFrToEnum;
-
- if (EnterGr) continue;
-
- if (Expr[j+s] == '[' && Expr[j+s+1] == '^') {
- Exprn[pos] = ReNEnumS;
- if (EnterFg) return false;
- EnterGr++;
- j++;
- continue;
- };
- if (Expr[j+s] == '*' && Expr[j+s+1] == '?') {
- Exprn[pos] = ReNGMul;
- j++;
- continue;
- };
- if (Expr[j+s] == '+' && Expr[j+s+1] == '?') {
- Exprn[pos] = ReNGPlus;
- j++;
- continue;
- };
- if (Expr[j+s] == '?' && Expr[j+s+1] == '?') {
- Exprn[pos] = ReNGQuest;
- j++;
- continue;
- };
- if (Expr[j+s] == '?' && Expr[j+s+1] == '#' &&
- Expr[j+s+2]>='0' && Expr[j+s+2]<='9') {
- Exprn[pos] = ReBehind+Expr[j+s+2]-0x30;
- j+=2;
- continue;
- };
- if (Expr[j+s] == '?' && Expr[j+s+1] == '~' &&
- Expr[j+s+2]>='0' && Expr[j+s+2]<='9') {
- Exprn[pos] = ReNBehind+Expr[j+s+2]-0x30;
- j+=2;
- continue;
- };
- if (Expr[j+s] == '?' && Expr[j+s+1] == '=') {
- Exprn[pos] = ReAhead;
- j++;
- continue;
- };
- if (Expr[j+s] == '?' && Expr[j+s+1] == '!') {
- Exprn[pos] = ReNAhead;
- j++;
- continue;
- };
-
- if (Expr[j+s] == '(') {
- Exprn[pos] = ReLBrack;
- if (EnterFg) return false;
- EnterBr++;
- };
- if (Expr[j+s] == ')') {
- Exprn[pos] = ReRBrack;
- if (!EnterBr || EnterFg) return false;
- EnterBr--;
- };
- if (Expr[j+s] == '[') {
- Exprn[pos] = ReEnumS;
- if (EnterFg) return false;
- EnterGr++;
- };
- if (Expr[j+s] == '{') {
- Exprn[pos] = ReRangeS;
- if (EnterFg) return false;
- EnterFg++;
- };
- if (Expr[j+s] == '}' && Expr[j+s+1] == '?') {
- Exprn[pos] = ReNGRangeE;
- if (!EnterFg) return false;
- EnterFg--;
- j++;
- continue;
- };
- if (Expr[j+s] == '}') {
- Exprn[pos] = ReRangeE;
- if (!EnterFg) return false;
- EnterFg--;
- };
-
- if (Expr[j+s] == '^') Exprn[pos] = ReSoL;
- if (Expr[j+s] == '$') Exprn[pos] = ReEoL;
- if (Expr[j+s] == '.') Exprn[pos] = ReAnyChr;
- if (Expr[j+s] == '*') Exprn[pos] = ReMul;
- if (Expr[j+s] == '+') Exprn[pos] = RePlus;
- if (Expr[j+s] == '?') Exprn[pos] = ReQuest;
- if (Expr[j+s] == '|') Exprn[pos] = ReOr;
- };
- if (EnterGr || EnterBr || EnterFg) return false;
-
- Info->Op = ReBrackets;
- Info->un.Param = new SRegInfo;
- Info->s = CurMatch++;
-
- if (!SetStructs(Info->un.Param,0,pos)) return false;
- Optimize();
- delete Exprn;
- return true;
-};
-
-void PosRegExp::Optimize() {
- PRegInfo Next = Info;
- FirstChar = 0;
- while(Next) {
- if (Next->Op == ReBrackets || Next->Op == RePlus || Next->Op == ReNGPlus) {
- Next = Next->un.Param;
- continue;
- };
- if (Next->Op == ReSymb) {
- if (Next->un.Symb & 0xFF00 && Next->un.Symb != ReSoL && Next->un.Symb != ReWBound)
- break;
- FirstChar = Next->un.Symb;
- break;
- };
- break;
- };
-};
-
-bool PosRegExp::SetStructs(PRegInfo &re,int start,int end) {
- PRegInfo Next,Prev,Prev2;
- int comma,st,en,ng,i, j,k;
- int EnterBr;
- bool Add;
-
- if (end - start < 0) return false;
- Next = re;
- for (i = start; i < end; i++) {
- Add = false;
- // Ops
- if (Exprn[i] > ReBlockOps && Exprn[i] < ReSymbolOps) {
- Next->un.Param = 0;
- Next->Op = (EOps)Exprn[i];
- Add = true;
- };
- // {n,m}
- if (Exprn[i] == ReRangeS) {
- st = i;
- en = -1;
- comma = -1;
- ng = 0;
- for (j = i;j < end;j++) {
- if (Exprn[j] == ReNGRangeE) {
- en = j;
- ng = 1;
- break;
- };
- if (Exprn[j] == ReRangeE) {
- en = j;
- break;
- };
- if ((char)Exprn[j] == ',')
- comma = j;
- };
- if (en == -1) return false;
- if (comma == -1) comma = en;
- Next->s = (char)GetNumber(Exprn,st+1,comma);
- if (comma != en)
- Next->e = (char)GetNumber(Exprn,comma+1,en);
- else
- Next->e = Next->s;
- Next->un.Param = 0;
- Next->Op = ng?ReNGRangeNM:ReRangeNM;
- if (en-comma == 1) {
- Next->e = -1;
- Next->Op = ng?ReNGRangeN:ReRangeN;
- };
- i=j;
- Add = true;
- };
- // [] [^]
- if (Exprn[i] == ReEnumS || Exprn[i] == ReNEnumS) {
- Next->Op = (Exprn[i] == ReEnumS)?ReEnum:ReNEnum;
- for (j = i+1;j < end;j++) {
- if (Exprn[j] == ReEnumE)
- break;
- };
- if (j == end) return false;
- Next->un.ChrClass = new SCharData;
- memset(Next->un.ChrClass, 0, 32);
- for (j = i+1;Exprn[j] != ReEnumE;j++) {
- if (Exprn[j+1] == ReFrToEnum) {
- for (i = (Exprn[j]&0xFF); i < (Exprn[j+2]&0xFF);i++)
- Next->un.ChrClass->SetBit(i&0xFF);
- j++;
- continue;
- };
- switch(Exprn[j]) {
- case ReDigit:
- for (k = 0x30;k < 0x40;k++)
- if (IsDigit((char)k))
- Next->un.ChrClass->SetBit(k);
- break;
- case ReNDigit:
- for (k = 0x30;k < 0x40;k++)
- if (!IsDigit((char)k))
- Next->un.ChrClass->SetBit(k);
- Next->un.ChrClass->ClearBit(0x0a);
- Next->un.ChrClass->ClearBit(0x0d);
- break;
- case ReWordSymb:
- for (k = 0;k < 256;k++)
- if (IsWord((char)k))
- Next->un.ChrClass->SetBit(k);
- break;
- case ReNWordSymb:
- for (k = 0;k < 256;k++)
- if (!IsWord((char)k))
- Next->un.ChrClass->SetBit(k);
- Next->un.ChrClass->ClearBit(0x0a);
- Next->un.ChrClass->ClearBit(0x0d);
- break;
- case ReWSpace:
- Next->un.ChrClass->SetBit(0x20);
- Next->un.ChrClass->SetBit(0x09);
- break;
- case ReNWSpace:
- memset(Next->un.ChrClass->IArr, 0xFF, 32);
- Next->un.ChrClass->ClearBit(0x20);
- Next->un.ChrClass->ClearBit(0x09);
- Next->un.ChrClass->ClearBit(0x0a);
- Next->un.ChrClass->ClearBit(0x0d);
- break;
- default:
- if (!(Exprn[j]&0xFF00))
- Next->un.ChrClass->SetBit(Exprn[j]&0xFF);
- break;
- };
- };
- Add = true;
- i=j;
- };
- // ( ... )
- if (Exprn[i] == ReLBrack) {
- EnterBr = 1;
- for (j = i+1;j < end;j++) {
- if (Exprn[j] == ReLBrack) EnterBr++;
- if (Exprn[j] == ReRBrack) EnterBr--;
- if (!EnterBr) break;
- };
- if (EnterBr) return false;
- Next->Op = ReBrackets;
- Next->un.Param = new SRegInfo;
- Next->un.Param->Parent = Next;
- Next->s = CurMatch++;
- if (CurMatch > MatchesNum) CurMatch = MatchesNum;
- if (!SetStructs(Next->un.Param,i+1,j)) return false;
- Add = true;
- i=j;
- };
- if ((Exprn[i]&0xFF00) == ReBkTrace) {
- Next->Op = ReBkTrace;
- Next->un.Symb = Exprn[i]&0xFF;
- Add = true;
- };
- if ((Exprn[i]&0xFF00) == ReBkBrack) {
- Next->Op = ReBkBrack;
- Next->un.Symb = Exprn[i]&0xFF;
- Add = true;
- };
- if ((Exprn[i]&0xFF00) == ReBehind) {
- Next->Op = ReBehind;
- Next->s = Exprn[i]&0xFF;
- Add = true;
- };
- if ((Exprn[i]&0xFF00) == ReNBehind) {
- Next->Op = ReNBehind;
- Next->s = Exprn[i]&0xFF;
- Add = true;
- };
- // Chars
- if (Exprn[i] >= ReAnyChr && Exprn[i] < ReTemp || Exprn[i] < 0x100) {
- Next->Op = ReSymb;
- Next->un.Symb = Exprn[i];
- Add = true;
- };
- // Next
- if (Add && i != end-1) {
- Next->Next = new SRegInfo;
- Next->Next->Parent = Next->Parent;
- Next = Next->Next;
- };
- };
- Next = re;
- Prev = Prev2 = 0;
- while(Next) {
- if (Next->Op > ReBlockOps && Next->Op < ReSymbolOps) {
- if (!Prev) return false;
- if (!Prev2) re = Next;
- else Prev2->Next = Next;
- //if (Prev->Op > ReBlockOps && Prev->Op < ReSymbolOps) return false;
- Prev->Parent = Next;
- Prev->Next = 0;
- Next->un.Param = Prev;
- Prev = Prev2;
- };
- Prev2 = Prev;
- Prev = Next;
- Next = Next->Next;
- };
-
- return true;
-};
-
-/////////////////////////////////////////////////////////////////
-///////////////////////// Parsing /////////////////////////////
-/////////////////////////////////////////////////////////////////
-
-bool PosRegExp::CheckSymb(int Symb,bool Inc) {
- bool Res;
- char ch;
- switch(Symb) {
- case ReAnyChr:
- if (posParse >= posEnd) return false;
- ch = CharAt(posParse,param);
- Res = ch != '\r' && ch != '\n';
- if (Res && Inc) posParse++;
- return Res;
- case ReSoL:
- if (posStart == posParse)
- return true;
- ch = CharAt(posParse-1,param);
- return ch == '\n' || ch == '\r';
- case ReEoL:
- if (posEnd == posParse)
- return true;
- ch = CharAt(posParse,param);
- return ch == '\n' || ch == '\r';
- case ReDigit:
- if (posParse >= posEnd) return false;
- ch = CharAt(posParse,param);
- Res = (ch >= 0x30 && ch <= 0x39);
- if (Res && Inc) posParse++;
- return Res;
- case ReNDigit:
- if (posParse >= posEnd) return false;
- ch = CharAt(posParse,param);
- Res = !(ch >= 0x30 && ch <= 0x39) && ch != '\r' && ch != '\n';
- if (Res && Inc) posParse++;
- return Res;
- case ReWordSymb:
- if (posParse >= posEnd) return false;
- Res = IsWord(CharAt(posParse,param));
- if (Res && Inc) posParse++;
- return Res;
- case ReNWordSymb:
- if (posParse >= posEnd) return false;
- ch = CharAt(posParse,param);
- Res = !IsWord(ch) && ch != '\r' && ch != '\n';
- if (Res && Inc) posParse++;
- return Res;
- case ReWSpace:
- if (posParse >= posEnd) return false;
- ch = CharAt(posParse,param);
- Res = (ch == 0x20 || ch == '\t');
- if (Res && Inc) posParse++;
- return Res;
- case ReNWSpace:
- if (posParse >= posEnd) return false;
- ch = CharAt(posParse,param);
- Res = !(ch == 0x20 || ch == '\t') && ch != '\r' && ch != '\n';
- if (Res && Inc) posParse++;
- return Res;
- case ReUCase:
- if (posParse >= posEnd) return false;
- Res = IsUpperCase(CharAt(posParse,param));
- if (Res && Inc) posParse++;
- return Res;
- case ReNUCase:
- if (posParse >= posEnd) return false;
- Res = IsLowerCase(CharAt(posParse,param));
- if (Res && Inc) posParse++;
- return Res;
- case ReWBound:
- if (posParse >= posEnd) return true;
- ch = CharAt(posParse,param);
- return IsWord(CharAt(posParse,param)) && (posParse == posStart || !IsWord(CharAt(posParse-1,param)));
- case ReNWBound:
- if (posParse >= posEnd) return true;
- return !IsWord(CharAt(posParse,param)) && IsWord(CharAt(posParse-1,param));
- case RePreNW:
- if (posParse >= posEnd) return true;
- return (posParse == posStart || !IsWord(CharAt(posParse-1,param)));
- case ReStart:
- Matches->s[0] = (posParse-posStart);
- return true;
- case ReEnd:
- Matches->e[0] = (posParse-posStart);
- return true;
- default:
- if ((Symb & 0xFF00) || posParse >= posEnd) return false;
- if (NoCase) {
- if (LowCase(CharAt(posParse,param)) != LowCase((char)Symb&0xFF)) return false;
- } else
- if (CharAt(posParse,param) != (char)(Symb&0xFF)) return false;
- if (Inc) posParse++;
- return true;
- };
-}
-
-bool PosRegExp::LowParseRe(PRegInfo &Next) {
- PRegInfo OrNext;
- int i,match,sv;
- int posStr;
-
- switch(Next->Op) {
- case ReSymb:
- if (!CheckSymb(Next->un.Symb,true)) return false;
- break;
- case ReEmpty:
- break;
- case ReBkTrace:
- if (!posBkStr | !BkTrace) return false;
- sv = Next->un.Symb;
- posStr = posParse;
- for (i = BkTrace->s[sv]; i < BkTrace->e[sv]; i++) {
- if (CharAt(posStr,param) != CharAt(posBkStr+i,param) || posEnd == posStr) return false;
- posStr++;
- };
- posParse = posStr;
- break;
- case ReBkBrack:
- sv = Next->un.Symb;
- posStr = posParse;
- if (Matches->s[sv] == -1 || Matches->e[sv] == -1) return false;
- for (i = Matches->s[sv]; i < Matches->e[sv]; i++) {
- if (CharAt(posStr,param) != CharAt(posStart+i,param) || posEnd == posStr) return false;
- posStr++;
- };
- posParse = posStr;
- break;
- case ReBehind:
- sv = Next->s;
- posStr = posParse;
- posParse -= sv;
- if (!LowParse(Next->un.Param)) return false;
- posParse = posStr;
- break;
- case ReNBehind:
- sv = Next->s;
- posStr = posParse;
- posParse -= sv;
- if (LowParse(Next->un.Param)) return false;
- posParse = posStr;
- break;
- case ReAhead:
- posStr = posParse;
- if (!LowParse(Next->un.Param)) return false;
- posParse = posStr;
- break;
- case ReNAhead:
- posStr = posParse;
- if (LowParse(Next->un.Param)) return false;
- posParse = posStr;
- break;
- case ReEnum:
- if (posParse >= posEnd) return false;
- if (!Next->un.ChrClass->GetBit(CharAt(posParse,param))) return false;
- posParse++;
- break;
- case ReNEnum:
- if (posParse >= posEnd) return false;
- if (Next->un.ChrClass->GetBit(CharAt(posParse,param))) return false;
- posParse++;
- break;
- case ReBrackets:
- match = Next->s;
- sv = posParse-posStart;
- posStr = posParse;
- if (LowParse(Next->un.Param)) {
- if (match || (Matches->s[match] == -1))
- Matches->s[match] = sv;
- if (match || (Matches->e[match] == -1))
- Matches->e[match] = posParse-posStart;
- return true;
- };
- posParse = posStr;
- return false;
- case ReMul:
- posStr = posParse;
- while (LowParse(Next->un.Param));
- while(!LowCheckNext(Next) && posStr < posParse) posParse--;
- break;
- case ReNGMul:
- do {
- if (LowCheckNext(Next)) break;
- } while (LowParse(Next->un.Param));
- break;
- case RePlus:
- posStr = posParse;
- match = false;
- while (LowParse(Next->un.Param))
- match = true;
- if (!match) return false;
- while(!LowCheckNext(Next) && posStr < posParse) posParse--;
- break;
- case ReNGPlus:
- if (!LowParse(Next->un.Param)) return false;
- do {
- if (LowCheckNext(Next)) break;
- } while (LowParse(Next->un.Param));
- break;
- case ReQuest:
- LowParse(Next->un.Param);
- break;
- case ReNGQuest:
- if (LowCheckNext(Next)) break;
- if (!LowParse(Next->un.Param)) return false;
- break;
- case ReOr:
- OrNext = Next;
- // posStr = posParse;
- if (LowParse(Next->un.Param)) {
- while (OrNext && OrNext->Op == ReOr)
- OrNext = OrNext->Next;
- /*if (!LowCheckNext(OrNext)){
- posParse = posStr;
- OrNext = Next;
- };*/
- };
- Next = OrNext;
- break;
- case ReRangeN:
- posStr = posParse;
- i = 0;
- while (LowParse(Next->un.Param)) i++; // ???
- do {
- if (i < Next->s) {
- posParse = posStr;
- return false;
- };
- i--;
- } while(!LowCheckNext(Next) && posStr < posParse--);
- break;
- case ReNGRangeN:
- posStr = posParse;
- i = 0;
- while (LowParse(Next->un.Param)) {
- i++;
- if (i >= Next->s && LowCheckNext(Next)) // ???
- break;
- };
- if (i < Next->s) {
- posParse = posStr;
- return false;
- };
- break;
- case ReRangeNM:
- posStr = posParse;
- i = 0;
- while (i < Next->s && LowParse(Next->un.Param)) // ???
- i++;
- if (i < Next->s) {
- posParse = posStr;
- return false;
- };
- while (i < Next->e && LowParse(Next->un.Param)) // ???
- i++;
-
- while(!LowCheckNext(Next)) {
- i--;
- posParse--;
- if (i < Next->s) {
- posParse = posStr;
- return false;
- };
- };
- break;
- case ReNGRangeNM:
- posStr = posParse;
- i = 0;
- while (i < Next->s && LowParse(Next->un.Param)) // ???
- i++;
- if (i < Next->s) {
- posParse = posStr;
- return false;
- };
- while(!LowCheckNext(Next)) {
- i++;
- if (!LowParse(Next->un.Param) || i > Next->e) { // ???
- posParse = posStr;
- return false;
- };
- };
- break;
- };
- return true;
-};
-
-bool PosRegExp::LowCheckNext(PRegInfo Re) {
- PRegInfo Next;
- int tmp = posParse;
- Next = Re;
- do {
- if (Next && Next->Op == ReOr)
- while (Next && Next->Op == ReOr)
- Next = Next->Next;
- if (Next->Next && !LowParse(Next->Next)) {
- posParse = tmp;
- Ok = false;
- return false;
- };
- Next = Next->Parent;
- } while(Next);
- posParse = tmp;
- if (Ok != false) Ok = true;
- return true;
-};
-
-bool PosRegExp::LowParse(PRegInfo Re) {
- while(Re && posParse <= posEnd) {
- if (!LowParseRe(Re)) return false;
- if (Re) Re = Re->Next;
- };
- return true;
-};
-
-bool PosRegExp::QuickCheck() {
- if (!NoMoves || !FirstChar)
- return true;
- switch(FirstChar) {
- case ReSoL:
- if (posParse != posStart) return false;
- return true;
- case ReWBound:
- return IsWord(CharAt(posParse,param)) && (posParse == posStart || !IsWord(CharAt(posParse-1,param)));
- default:
- if (NoCase && LowCase(CharAt(posParse,param)) != LowCase(FirstChar)) return false;
- if (!NoCase && CharAt(posParse,param) != (char)FirstChar) return false;
- return true;
- };
-};
-
-bool PosRegExp::ParseRe(int posStr) {
- if (Error) return false;
-
- posParse = posStr;
- if (!QuickCheck()) return false;
-
- for (int i = 0; i < MatchesNum; i++)
- Matches->s[i] = Matches->e[i] = -1;
- Matches->CurMatch = CurMatch;
-
- Ok = -1;
- //try{
- do {
- if (!LowParse(Info)) {
- if (NoMoves) return false;
- } else
- return true;
- posParse = ++posStr;
- } while(posParse != posEnd+1);
- return false;
- //}__except(){
- // return true;
- //};
-}
-;
-
-bool PosRegExp::Parse(int posStr,int posSol, int posEol, PMatches Mtch, int Moves) {
- if (!this) return false;
-
- bool s = NoMoves;
- if (Moves != -1) NoMoves = Moves!=0;
- posStart = posSol;
- posEnd = posEol;
- Matches = Mtch;
- bool r = ParseRe(posStr);
- NoMoves = s;
- return r;
-};
-
-bool PosRegExp::Parse(int posStr, int posStop, PMatches Mtch) {
- if (!this) return false;
- posStart = posStr;
- posEnd = posStop;
- Matches = Mtch;
- return ParseRe(posStr);
-};
-
-bool PosRegExp::SetNoMoves(bool Moves) {
- NoMoves = Moves;
- return true;
-};
-
-bool PosRegExp::SetBkTrace(int posStr,PMatches Trace) {
- BkTrace = Trace;
- posBkStr = posStr;
- return true;
-};
-
-#define EVAL_MATCHES 16
-#define EVAL_CHUNKSIZE 256
-
-#define EVAL_LOWERCASE 1
-#define EVAL_UPPERCASE 2
-#define EVAL_LOWERCASE_NEXT 4
-#define EVAL_UPPERCASE_NEXT 8
-
-bool PosRegExp::Evaluate(char *Expr, int posStr, PMatches Mtch, char **Res) {
- int length,
- newlength,
- chunklength,
- value,
- size,
- src,
- end;
- unsigned flag;
- char ch,
- *dest,
- *pool;
-
- size = EVAL_CHUNKSIZE;
- pool = (char*) malloc (size);
- dest = pool;
- length = 0;
- flag = 0;
- while (*Expr) {
- switch (ch = *Expr++) {
- case '\\':
- switch (ch = *Expr++) {
- case 'A':
- case 'B':
- case 'C':
- case 'D':
- case 'E':
- case 'F':
- ch -= ('A' - '0');
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- value = ch - '0';
- if (Mtch->s[value] != -1 && value < EVAL_MATCHES) {
- chunklength = Mtch->e[value] - Mtch->s[value];
- if (chunklength) {
- newlength = chunklength + length;
- if (newlength > size) {
- do
- size += EVAL_CHUNKSIZE;
- while (size < newlength);
- pool = (char*) realloc (pool, size);
- dest = pool + length;
- }
- length = newlength;
- src = posStr + Mtch->s[value];
- end = posStr + Mtch->e[value];
- if (flag & EVAL_UPPERCASE) {
- if (flag & EVAL_LOWERCASE_NEXT) {
- *dest++ = tolower (CharAt(src++,param));
- flag &= ~EVAL_LOWERCASE_NEXT;
- }
- while (src < end)
- *dest++ = toupper (CharAt(src++,param));
- } else if (flag & EVAL_LOWERCASE) {
- if (flag & EVAL_UPPERCASE_NEXT) {
- *dest++ = toupper (CharAt(src++,param));
- flag &= ~EVAL_UPPERCASE_NEXT;
- }
- while (src < end)
- *dest++ = tolower (CharAt(src++,param));
- } else {
- if (flag & EVAL_LOWERCASE_NEXT) {
- *dest++ = tolower (CharAt(src++,param));
- flag &= ~EVAL_LOWERCASE_NEXT;
- } else if (flag & EVAL_UPPERCASE_NEXT) {
- *dest++ = toupper (CharAt(src++,param));
- flag &= ~EVAL_UPPERCASE_NEXT;
- }
- while (src < end)
- *dest++ = CharAt(src++,param);
- }
- }
- } else
- goto error;
- continue;
- case '\0':
- goto error;
- case 'r':
- ch = '\r';
- break;
- case 'n':
- ch = '\n';
- break;
- case 'b':
- ch = '\b';
- break;
- case 'a':
- ch = '\a';
- break;
- case 't':
- ch = '\t';
- break;
- case 'U':
- flag |= EVAL_UPPERCASE;
- continue;
- case 'u':
- flag |= EVAL_UPPERCASE_NEXT;
- continue;
- case 'L':
- flag |= EVAL_LOWERCASE;
- continue;
- case 'l':
- flag |= EVAL_LOWERCASE_NEXT;
- continue;
- case 'Q':
- case 'q':
- flag &= ~(EVAL_UPPERCASE | EVAL_LOWERCASE);
- continue;
- case 'x':
- {
- if (!*Expr)
- goto error;
- value = toupper (*Expr) - '0';
- if (value > 9)
- value = value + '0' - 'A' + 10;
- if (value > 15)
- goto error;
- ch = value << 4;
- Expr++;
- if (!*Expr)
- goto error;
- value = toupper (*Expr) - '0';
- if (value > 9)
- value = value + '0' - 'A' + 10;
- if (value > 15)
- goto error;
- Expr++;
- ch |= value;
- break;
- }
- case 'd':
- {
- if (!*Expr)
- goto error;
- value = toupper (*Expr) - '0';
- if (value > 9)
- goto error;
- ch = value * 100;
- Expr++;
- if (!*Expr)
- goto error;
- value = toupper (*Expr) - '0';
- if (value > 9)
- goto error;
- ch += value * 10;
- Expr++;
- if (!*Expr)
- goto error;
- value = toupper (*Expr) - '0';
- if (value > 9)
- goto error;
- ch += value;
- Expr++;
- break;
- }
- case 'o':
- {
- if (!*Expr)
- goto error;
- value = toupper (*Expr) - '0';
- if (value > 9)
- goto error;
- ch = value << 6;
- Expr++;
- if (!*Expr)
- goto error;
- value = toupper (*Expr) - '0';
- if (value > 9)
- goto error;
- ch += value << 3;
- Expr++;
- if (!*Expr)
- goto error;
- value = toupper (*Expr) - '0';
- if (value > 9)
- goto error;
- ch |= value;
- Expr++;
- /* break; */
- }
- /* default:
- break; */
- }
- default:
- if (++length > size) {
- do
- size += EVAL_CHUNKSIZE;
- while (size < length);
- pool = (char*) realloc (pool, size);
- dest = pool + length - 1;
- }
- if (flag & EVAL_LOWERCASE_NEXT) {
- *dest++ = tolower (ch);
- flag &= ~EVAL_LOWERCASE_NEXT;
- } else if (flag & EVAL_UPPERCASE_NEXT) {
- *dest++ = toupper (ch);
- flag &= ~EVAL_UPPERCASE_NEXT;
- } else if (flag & EVAL_UPPERCASE)
- *dest++ = toupper (ch);
- else if (flag & EVAL_LOWERCASE)
- *dest++ = tolower (ch);
- else
- *dest++ = ch;
- }
- }
- if (++length > size) {
- do
- size += EVAL_CHUNKSIZE;
- while (size < length);
- pool = (char*) realloc (pool, size);
- dest = pool + length - 1;
- }
- *dest = '\0';
- *Res = pool;
- return true;
-error:
- free (pool);
- return false;
-}
diff --git a/src/RESearch.cxx b/src/RESearch.cxx
new file mode 100644
index 000000000..a9b0d264f
--- /dev/null
+++ b/src/RESearch.cxx
@@ -0,0 +1,844 @@
+// Scintilla source code edit control
+/** @file RESearch.cxx
+ ** Regular expression search library.
+ **/
+
+/*
+ * regex - Regular expression pattern matching and replacement
+ *
+ * By: Ozan S. Yigit (oz)
+ * Dept. of Computer Science
+ * York University
+ *
+ * Translation to C++ by Neil Hodgson neilh@scintilla.org
+ * Removed all use of register.
+ * Converted to modern function prototypes.
+ * Put all global/static variables into an object so this code can be
+ * used from multiple threads etc.
+ *
+ * These routines are the PUBLIC DOMAIN equivalents of regex
+ * routines as found in 4.nBSD UN*X, with minor extensions.
+ *
+ * These routines are derived from various implementations found
+ * in software tools books, and Conroy's grep. They are NOT derived
+ * from licensed/restricted software.
+ * For more interesting/academic/complicated implementations,
+ * see Henry Spencer's regexp routines, or GNU Emacs pattern
+ * matching module.
+ *
+ * Modification history:
+ *
+ * $Log$
+ * Revision 1.1 2001/04/04 12:52:44 nyamatongwe
+ * Moved to public domain regular expresion implementation.
+ *
+ * Revision 1.4 1991/10/17 03:56:42 oz
+ * miscellaneous changes, small cleanups etc.
+ *
+ * Revision 1.3 1989/04/01 14:18:09 oz
+ * Change all references to a dfa: this is actually an nfa.
+ *
+ * Revision 1.2 88/08/28 15:36:04 oz
+ * Use a complement bitmap to represent NCL.
+ * This removes the need to have seperate
+ * code in the PMatch case block - it is
+ * just CCL code now.
+ *
+ * Use the actual CCL code in the CLO
+ * section of PMatch. No need for a recursive
+ * PMatch call.
+ *
+ * Use a bitmap table to set char bits in an
+ * 8-bit chunk.
+ *
+ * Interfaces:
+ * RESearch::Compile: compile a regular expression into a NFA.
+ *
+ * char *RESearch::Compile(s)
+ * char *s;
+ *
+ * RESearch::Execute: execute the NFA to match a pattern.
+ *
+ * int RESearch::Execute(s)
+ * char *s;
+ *
+ * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word"
+ * looks like (for \< and \>) by adding into the
+ * hidden word-syntax table.
+ *
+ * void RESearch::ModifyWord(s)
+ * char *s;
+ *
+ * RESearch::Substitute: substitute the matched portions in a new string.
+ *
+ * int RESearch::Substitute(src, dst)
+ * char *src;
+ * char *dst;
+ *
+ * re_fail: failure routine for RESearch::Execute.
+ *
+ * void re_fail(msg, op)
+ * char *msg;
+ * char op;
+ *
+ * Regular Expressions:
+ *
+ * [1] char matches itself, unless it is a special
+ * character (metachar): . \ [ ] * + ^ $
+ *
+ * [2] . matches any character.
+ *
+ * [3] \ matches the character following it, except
+ * when followed by a left or right round bracket,
+ * a digit 1 to 9 or a left or right angle bracket.
+ * (see [7], [8] and [9])
+ * It is used as an escape character for all
+ * other meta-characters, and itself. When used
+ * in a set ([4]), it is treated as an ordinary
+ * character.
+ *
+ * [4] [set] matches one of the characters in the set.
+ * If the first character in the set is "^",
+ * it matches a character NOT in the set, i.e.
+ * complements the set. A shorthand S-E is
+ * used to specify a set of characters S upto
+ * E, inclusive. The special characters "]" and
+ * "-" have no special meaning if they appear
+ * as the first chars in the set.
+ * examples: match:
+ *
+ * [a-z] any lowercase alpha
+ *
+ * [^]-] any char except ] and -
+ *
+ * [^A-Z] any char except uppercase
+ * alpha
+ *
+ * [a-zA-Z] any alpha
+ *
+ * [5] * any regular expression form [1] to [4], followed by
+ * closure char (*) matches zero or more matches of
+ * that form.
+ *
+ * [6] + same as [5], except it matches one or more.
+ *
+ * [7] a regular expression in the form [1] to [10], enclosed
+ * as \(form\) matches what form matches. The enclosure
+ * creates a set of tags, used for [8] and for
+ * pattern substution. The tagged forms are numbered
+ * starting from 1.
+ *
+ * [8] a \ followed by a digit 1 to 9 matches whatever a
+ * previously tagged regular expression ([7]) matched.
+ *
+ * [9] \< a regular expression starting with a \< construct
+ * \> and/or ending with a \> construct, restricts the
+ * pattern matching to the beginning of a word, and/or
+ * the end of a word. A word is defined to be a character
+ * string beginning and/or ending with the characters
+ * A-Z a-z 0-9 and _. It must also be preceded and/or
+ * followed by any character outside those mentioned.
+ *
+ * [10] a composite regular expression xy where x and y
+ * are in the form [1] to [10] matches the longest
+ * match of x followed by a match for y.
+ *
+ * [11] ^ a regular expression starting with a ^ character
+ * $ and/or ending with a $ character, restricts the
+ * pattern matching to the beginning of the line,
+ * or the end of line. [anchors] Elsewhere in the
+ * pattern, ^ and $ are treated as ordinary characters.
+ *
+ *
+ * Acknowledgements:
+ *
+ * HCR's Hugh Redelmeier has been most helpful in various
+ * stages of development. He convinced me to include BOW
+ * and EOW constructs, originally invented by Rob Pike at
+ * the University of Toronto.
+ *
+ * References:
+ * Software tools Kernighan & Plauger
+ * Software tools in Pascal Kernighan & Plauger
+ * Grep [rsx-11 C dist] David Conroy
+ * ed - text editor Un*x Programmer's Manual
+ * Advanced editing on Un*x B. W. Kernighan
+ * RegExp routines Henry Spencer
+ *
+ * Notes:
+ *
+ * This implementation uses a bit-set representation for character
+ * classes for speed and compactness. Each character is represented
+ * by one bit in a 128-bit block. Thus, CCL always takes a
+ * constant 16 bytes in the internal nfa, and RESearch::Execute does a single
+ * bit comparison to locate the character in the set.
+ *
+ * Examples:
+ *
+ * pattern: foo*.*
+ * compile: CHR f CHR o CLO CHR o END CLO ANY END END
+ * matches: fo foo fooo foobar fobar foxx ...
+ *
+ * pattern: fo[ob]a[rz]
+ * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
+ * matches: fobar fooar fobaz fooaz
+ *
+ * pattern: foo\\+
+ * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
+ * matches: foo\ foo\\ foo\\\ ...
+ *
+ * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
+ * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
+ * matches: foo1foo foo2foo foo3foo
+ *
+ * pattern: \(fo.*\)-\1
+ * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
+ * matches: foo-foo fo-fo fob-fob foobar-foobar ...
+ */
+
+#include "RESearch.h"
+
+#define EXTEND
+
+#define OKP 1
+#define NOP 0
+
+#define CHR 1
+#define ANY 2
+#define CCL 3
+#define BOL 4
+#define EOL 5
+#define BOT 6
+#define EOT 7
+#define BOW 8
+#define EOW 9
+#define REF 10
+#define CLO 11
+
+#define END 0
+
+/*
+ * The following defines are not meant to be changeable.
+ * They are for readability only.
+ */
+#define BLKIND 0170
+#define BITIND 07
+
+#define ASCIIB 0177
+
+const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
+
+#define badpat(x) (*nfa = END, x)
+
+RESearch::RESearch() {
+ sta = NOP; /* status of lastpat */
+ bol = 0;
+ for (int i=0; i<BITBLK; i++)
+ bittab[i] = 0;
+}
+
+void RESearch::ChSet(char c) {
+ bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
+}
+
+const char *RESearch::Compile(char *pat) {
+ char *p; /* pattern pointer */
+ char *mp=nfa; /* nfa pointer */
+ char *lp; /* saved pointer.. */
+ char *sp=nfa; /* another one.. */
+
+ int tagi = 0; /* tag stack index */
+ int tagc = 1; /* actual tag count */
+
+ int n;
+ char mask; /* xor mask -CCL/NCL */
+ int c1, c2;
+
+ if (!pat || !*pat)
+ if (sta)
+ return 0;
+ else
+ return badpat("No previous regular expression");
+ sta = NOP;
+
+ for (p = pat; *p; p++) {
+ lp = mp;
+ switch(*p) {
+
+ case '.': /* match any char.. */
+ *mp++ = ANY;
+ break;
+
+ case '^': /* match beginning.. */
+ if (p == pat)
+ *mp++ = BOL;
+ else {
+ *mp++ = CHR;
+ *mp++ = *p;
+ }
+ break;
+
+ case '$': /* match endofline.. */
+ if (!*(p+1))
+ *mp++ = EOL;
+ else {
+ *mp++ = CHR;
+ *mp++ = *p;
+ }
+ break;
+
+ case '[': /* match char class..*/
+ *mp++ = CCL;
+
+ if (*++p == '^') {
+ mask = '\377';
+ p++;
+ }
+ else
+ mask = 0;
+
+ if (*p == '-') /* real dash */
+ ChSet(*p++);
+ if (*p == ']') /* real brac */
+ ChSet(*p++);
+ while (*p && *p != ']') {
+ if (*p == '-' && *(p+1) && *(p+1) != ']') {
+ p++;
+ c1 = *(p-2) + 1;
+ c2 = *p++;
+ while (c1 <= c2)
+ ChSet(static_cast<char>(c1++));
+ }
+#ifdef EXTEND
+ else if (*p == '\\' && *(p+1)) {
+ p++;
+ ChSet(*p++);
+ }
+#endif
+ else
+ ChSet(*p++);
+ }
+ if (!*p)
+ return badpat("Missing ]");
+
+ for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
+ *mp++ = static_cast<char>(mask ^ bittab[n]);
+
+ break;
+
+ case '*': /* match 0 or more.. */
+ case '+': /* match 1 or more.. */
+ if (p == pat)
+ return badpat("Empty closure");
+ lp = sp; /* previous opcode */
+ if (*lp == CLO) /* equivalence.. */
+ break;
+ switch(*lp) {
+
+ case BOL:
+ case BOT:
+ case EOT:
+ case BOW:
+ case EOW:
+ case REF:
+ return badpat("Illegal closure");
+ default:
+ break;
+ }
+
+ if (*p == '+')
+ for (sp = mp; lp < sp; lp++)
+ *mp++ = *lp;
+
+ *mp++ = END;
+ *mp++ = END;
+ sp = mp;
+ while (--mp > lp)
+ *mp = mp[-1];
+ *mp = CLO;
+ mp = sp;
+ break;
+
+ case '\\': /* tags, backrefs .. */
+ switch(*++p) {
+
+ case '(':
+ if (tagc < MAXTAG) {
+ tagstk[++tagi] = tagc;
+ *mp++ = BOT;
+ *mp++ = static_cast<char>(tagc++);
+ }
+ else
+ return badpat("Too many \\(\\) pairs");
+ break;
+ case ')':
+ if (*sp == BOT)
+ return badpat("Null pattern inside \\(\\)");
+ if (tagi > 0) {
+ *mp++ = static_cast<char>(EOT);
+ *mp++ = static_cast<char>(tagstk[tagi--]);
+ }
+ else
+ return badpat("Unmatched \\)");
+ break;
+ case '<':
+ *mp++ = BOW;
+ break;
+ case '>':
+ if (*sp == BOW)
+ return badpat("Null pattern inside \\<\\>");
+ *mp++ = EOW;
+ break;
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ n = *p-'0';
+ if (tagi > 0 && tagstk[tagi] == n)
+ return badpat("Cyclical reference");
+ if (tagc > n) {
+ *mp++ = static_cast<char>(REF);
+ *mp++ = static_cast<char>(n);
+ }
+ else
+ return badpat("Undetermined reference");
+ break;
+#ifdef EXTEND
+ case 'a':
+ *mp++ = CHR;
+ *mp++ = '\a';
+ break;
+ case 'b':
+ *mp++ = CHR;
+ *mp++ = '\b';
+ break;
+ case 'n':
+ *mp++ = CHR;
+ *mp++ = '\n';
+ break;
+ case 'f':
+ *mp++ = CHR;
+ *mp++ = '\f';
+ break;
+ case 'r':
+ *mp++ = CHR;
+ *mp++ = '\r';
+ break;
+ case 't':
+ *mp++ = CHR;
+ *mp++ = '\t';
+ break;
+ case 'v':
+ *mp++ = CHR;
+ *mp++ = '\v';
+ break;
+#endif
+ default:
+ *mp++ = CHR;
+ *mp++ = *p;
+ }
+ break;
+
+ default : /* an ordinary char */
+ *mp++ = CHR;
+ *mp++ = *p;
+ break;
+ }
+ sp = lp;
+ }
+ if (tagi > 0)
+ return badpat("Unmatched \\(");
+ *mp = END;
+ sta = OKP;
+ return 0;
+}
+
+/*
+ * RESearch::Execute:
+ * execute nfa to find a match.
+ *
+ * special cases: (nfa[0])
+ * BOL
+ * Match only once, starting from the
+ * beginning.
+ * CHR
+ * First locate the character without
+ * calling PMatch, and if found, call
+ * PMatch for the remaining string.
+ * END
+ * RESearch::Compile failed, poor luser did not
+ * check for it. Fail fast.
+ *
+ * If a match is found, bopat[0] and eopat[0] are set
+ * to the beginning and the end of the matched fragment,
+ * respectively.
+ *
+ */
+
+int RESearch::Execute(CharacterIndexer &ci, int lp) {
+ char c;
+ int ep = 0;
+ char *ap = nfa;
+
+ bol = lp;
+ failure = 0;
+
+ for (int i=0;i<MAXTAG;i++) {
+ bopat[i] = NOTFOUND;
+ eopat[i] = NOTFOUND;
+ }
+
+ switch(*ap) {
+
+ case BOL: /* anchored: match from BOL only */
+ ep = PMatch(ci, lp, ap);
+ break;
+ case CHR: /* ordinary char: locate it fast */
+ c = *(ap+1);
+ while (ci.CharAt(lp) && ci.CharAt(lp) != c)
+ lp++;
+ if (!ci.CharAt(lp)) /* if EOS, fail, else fall thru. */
+ return 0;
+ default: /* regular matching all the way. */
+ while (ci.CharAt(lp)) {
+ ep = PMatch(ci, lp, ap);
+ if (ep != NOTFOUND)
+ break;
+ lp++;
+ }
+ break;
+ case END: /* munged automaton. fail always */
+ return 0;
+ }
+ if (ep == NOTFOUND)
+ return 0;
+
+ bopat[0] = lp;
+ eopat[0] = ep;
+ return 1;
+}
+
+/*
+ * PMatch: internal routine for the hard part
+ *
+ * This code is partly snarfed from an early grep written by
+ * David Conroy. The backref and tag stuff, and various other
+ * innovations are by oz.
+ *
+ * special case optimizations: (nfa[n], nfa[n+1])
+ * CLO ANY
+ * We KNOW .* will match everything upto the
+ * end of line. Thus, directly go to the end of
+ * line, without recursive PMatch calls. As in
+ * the other closure cases, the remaining pattern
+ * must be matched by moving backwards on the
+ * string recursively, to find a match for xy
+ * (x is ".*" and y is the remaining pattern)
+ * where the match satisfies the LONGEST match for
+ * x followed by a match for y.
+ * CLO CHR
+ * We can again scan the string forward for the
+ * single char and at the point of failure, we
+ * execute the remaining nfa recursively, same as
+ * above.
+ *
+ * At the end of a successful match, bopat[n] and eopat[n]
+ * are set to the beginning and end of subpatterns matched
+ * by tagged expressions (n = 1 to 9).
+ *
+ */
+
+extern void re_fail(char *,char);
+
+/*
+ * character classification table for word boundary operators BOW
+ * and EOW. the reason for not using ctype macros is that we can
+ * let the user add into our own table. see RESearch::ModifyWord. This table
+ * is not in the bitset form, since we may wish to extend it in the
+ * future for other character classifications.
+ *
+ * TRUE for 0-9 A-Z a-z _
+ */
+static char chrtyp[MAXCHR] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+ 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 0, 0, 0, 0, 0
+ };
+
+#define inascii(x) (0177&(x))
+#define iswordc(x) chrtyp[inascii(x)]
+#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
+
+/*
+ * skip values for CLO XXX to skip past the closure
+ */
+
+#define ANYSKIP 2 /* [CLO] ANY END ... */
+#define CHRSKIP 3 /* [CLO] CHR chr END ... */
+#define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */
+
+int RESearch::PMatch(CharacterIndexer &ci, int lp, char *ap) {
+ int op, c, n;
+ int e; /* extra pointer for CLO */
+ int bp; /* beginning of subpat.. */
+ int ep; /* ending of subpat.. */
+ int are; /* to save the line ptr. */
+
+ while ((op = *ap++) != END)
+ switch(op) {
+
+ case CHR:
+ if (ci.CharAt(lp++) != *ap++)
+ return NOTFOUND;
+ break;
+ case ANY:
+ if (!ci.CharAt(lp++))
+ return NOTFOUND;
+ break;
+ case CCL:
+ c = ci.CharAt(lp++);
+ if (!isinset(ap,c))
+ return NOTFOUND;
+ ap += BITBLK;
+ break;
+ case BOL:
+ if (lp != bol)
+ return NOTFOUND;
+ break;
+ case EOL:
+ if (ci.CharAt(lp))
+ return NOTFOUND;
+ break;
+ case BOT:
+ bopat[*ap++] = lp;
+ break;
+ case EOT:
+ eopat[*ap++] = lp;
+ break;
+ case BOW:
+ if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
+ return NOTFOUND;
+ break;
+ case EOW:
+ if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
+ return NOTFOUND;
+ break;
+ case REF:
+ n = *ap++;
+ bp = bopat[n];
+ ep = eopat[n];
+ while (bp < ep)
+ if (ci.CharAt(bp++) != ci.CharAt(lp++))
+ return NOTFOUND;
+ break;
+ case CLO:
+ are = lp;
+ switch(*ap) {
+
+ case ANY:
+ while (ci.CharAt(lp))
+ lp++;
+ n = ANYSKIP;
+ break;
+ case CHR:
+ c = *(ap+1);
+ while (ci.CharAt(lp) && c == ci.CharAt(lp))
+ lp++;
+ n = CHRSKIP;
+ break;
+ case CCL:
+ while (((c = ci.CharAt(lp)) != 0) && isinset(ap+1,c))
+ lp++;
+ n = CCLSKIP;
+ break;
+ default:
+ failure = true;
+ //re_fail("closure: bad nfa.", *ap);
+ return NOTFOUND;
+ }
+
+ ap += n;
+
+ while (lp >= are) {
+ if ((e = PMatch(ci, lp, ap)) != NOTFOUND)
+ return e;
+ --lp;
+ }
+ return NOTFOUND;
+ default:
+ //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
+ return NOTFOUND;
+ }
+ return lp;
+}
+
+/*
+ * RESearch::ModifyWord:
+ * add new characters into the word table to change RESearch::Execute's
+ * understanding of what a word should look like. Note that we
+ * only accept additions into the word definition.
+ *
+ * If the string parameter is 0 or null string, the table is
+ * reset back to the default containing A-Z a-z 0-9 _. [We use
+ * the compact bitset representation for the default table]
+ */
+
+static char deftab[16] = {
+ 0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
+ '\376', '\377', '\377', 007
+};
+
+void RESearch::ModifyWord(char *s) {
+ int i;
+
+ if (!s || !*s) {
+ for (i = 0; i < MAXCHR; i++)
+ if (!isinset(deftab,i))
+ iswordc(i) = 0;
+ }
+ else
+ while(*s)
+ iswordc(*s++) = 1;
+}
+
+/*
+ * RESearch::Substitute:
+ * substitute the matched portions of the src in dst.
+ *
+ * & substitute the entire matched pattern.
+ *
+ * \digit substitute a subpattern, with the given tag number.
+ * Tags are numbered from 1 to 9. If the particular
+ * tagged subpattern does not exist, null is substituted.
+ */
+int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
+ char c;
+ int pin;
+ int bp;
+ int ep;
+
+ if (!*src || !bopat[0])
+ return 0;
+
+ while ((c = *src++) != 0) {
+ switch(c) {
+
+ case '&':
+ pin = 0;
+ break;
+
+ case '\\':
+ c = *src++;
+ if (c >= '0' && c <= '9') {
+ pin = c - '0';
+ break;
+ }
+
+ default:
+ *dst++ = c;
+ continue;
+ }
+
+ if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
+ while (ci.CharAt(bp) && bp < ep)
+ *dst++ = ci.CharAt(bp++);
+ if (bp < ep)
+ return 0;
+ }
+ }
+ *dst = (char) 0;
+ return 1;
+}
+
+#ifdef DEBUG
+/*
+ * symbolic - produce a symbolic dump of the nfa
+ */
+void symbolic(char *s) {
+ printf("pattern: %s\n", s);
+ printf("nfacode:\n");
+ nfadump(nfa);
+}
+
+static void nfadump(char *ap) {
+ int n;
+
+ while (*ap != END)
+ switch(*ap++) {
+ case CLO:
+ printf("CLOSURE");
+ nfadump(ap);
+ switch(*ap) {
+ case CHR:
+ n = CHRSKIP;
+ break;
+ case ANY:
+ n = ANYSKIP;
+ break;
+ case CCL:
+ n = CCLSKIP;
+ break;
+ }
+ ap += n;
+ break;
+ case CHR:
+ printf("\tCHR %c\n",*ap++);
+ break;
+ case ANY:
+ printf("\tANY .\n");
+ break;
+ case BOL:
+ printf("\tBOL -\n");
+ break;
+ case EOL:
+ printf("\tEOL -\n");
+ break;
+ case BOT:
+ printf("BOT: %d\n",*ap++);
+ break;
+ case EOT:
+ printf("EOT: %d\n",*ap++);
+ break;
+ case BOW:
+ printf("BOW\n");
+ break;
+ case EOW:
+ printf("EOW\n");
+ break;
+ case REF:
+ printf("REF: %d\n",*ap++);
+ break;
+ case CCL:
+ printf("\tCCL [");
+ for (n = 0; n < MAXCHR; n++)
+ if (isinset(ap,(char)n)) {
+ if (n < ' ')
+ printf("^%c", n ^ 0x040);
+ else
+ printf("%c", n);
+ }
+ printf("]\n");
+ ap += BITBLK;
+ break;
+ default:
+ printf("bad nfa. opcode %o\n", ap[-1]);
+ exit(1);
+ break;
+ }
+}
+#endif
diff --git a/src/RESearch.h b/src/RESearch.h
new file mode 100644
index 000000000..f9cf7fdc5
--- /dev/null
+++ b/src/RESearch.h
@@ -0,0 +1,54 @@
+// Scintilla source code edit control
+/** @file RESearch.h
+ ** Interface to the regular expression search library.
+ **/
+// Written by Neil Hodgson <neilh@scintilla.org>
+// Based on the work of Ozan S. Yigit.
+// This file is in the public domain.
+
+#ifndef RESEARCH_H
+#define RESEARCH_H
+
+/*
+ * The following defines are not meant to be changeable.
+ * They are for readability only.
+ */
+#define MAXCHR 128
+#define CHRBIT 8
+#define BITBLK MAXCHR/CHRBIT
+
+class CharacterIndexer {
+public:
+ virtual char CharAt(int index)=0;
+};
+
+class RESearch {
+
+public:
+ RESearch();
+ void ChSet(char c);
+ const char *Compile(char *pat);
+ int Execute(CharacterIndexer &ci, int lp);
+ void ModifyWord(char *s);
+ int Substitute(CharacterIndexer &ci, char *src, char *dst);
+
+ enum {MAXTAG=10};
+ enum {MAXNFA=1024};
+ enum {NOTFOUND=-1};
+
+ int bopat[MAXTAG];
+ int eopat[MAXTAG];
+
+private:
+ int PMatch(CharacterIndexer &ci, int lp, char *ap);
+
+ int bol;
+ int tagstk[MAXTAG]; /* subpat tag stack..*/
+ char nfa[MAXNFA]; /* automaton.. */
+ int sta;
+ char bittab[BITBLK]; /* bit table for CCL */
+ /* pre-set bits... */
+ int failure;
+};
+
+#endif