aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/RESearch.cxx301
-rw-r--r--src/RESearch.h16
2 files changed, 156 insertions, 161 deletions
diff --git a/src/RESearch.cxx b/src/RESearch.cxx
index f1fda7304..00e183060 100644
--- a/src/RESearch.cxx
+++ b/src/RESearch.cxx
@@ -4,7 +4,7 @@
**/
/*
- * regex - Regular expression pattern matching and replacement
+ * regex - Regular expression pattern matching and replacement
*
* By: Ozan S. Yigit (oz)
* Dept. of Computer Science
@@ -15,7 +15,7 @@
* Removed all use of register.
* Converted to modern function prototypes.
* Put all global/static variables into an object so this code can be
- * used from multiple threads etc.
+ * used from multiple threads, etc.
*
* These routines are the PUBLIC DOMAIN equivalents of regex
* routines as found in 4.nBSD UN*X, with minor extensions.
@@ -30,64 +30,62 @@
* Modification history removed.
*
* Interfaces:
- * RESearch::Compile: compile a regular expression into a NFA.
+ * RESearch::Compile: compile a regular expression into a NFA.
*
- * char *RESearch::Compile(s)
- * char *s;
+ * const char *RESearch::Compile(const char *pat, int length,
+ * bool caseSensitive, bool posix)
*
- * RESearch::Execute: execute the NFA to match a pattern.
+ * Returns a short error string if they fail.
*
- * int RESearch::Execute(s)
- * char *s;
+ * RESearch::Execute: execute the NFA to match a pattern.
*
- * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word"
- * looks like (for \< and \>) by adding into the
- * hidden word-syntax table.
+ * int RESearch::Execute(characterIndexer &ci, int lp, int endp)
*
- * void RESearch::ModifyWord(s)
- * char *s;
+ * RESearch::Substitute: substitute the matched portions in a new string.
*
- * RESearch::Substitute: substitute the matched portions in a new string.
+ * int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst)
*
- * int RESearch::Substitute(src, dst)
- * char *src;
- * char *dst;
+ * re_fail: failure routine for RESearch::Execute. (no longer used)
*
- * re_fail: failure routine for RESearch::Execute.
- *
- * void re_fail(msg, op)
- * char *msg;
- * char op;
+ * void re_fail(char *msg, char op)
*
* Regular Expressions:
*
* [1] char matches itself, unless it is a special
* character (metachar): . \ [ ] * + ^ $
+ * and ( ) if posix option.
*
* [2] . matches any character.
*
- * [3] \ matches the character following it, except
- * when followed by a left or right round bracket,
- * a digit 1 to 9 or a left or right angle bracket.
- * (see [7], [8] and [9])
- * It is used as an escape character for all
- * other meta-characters, and itself. When used
- * in a set ([4]), it is treated as an ordinary
- * character.
+ * [3] \ matches the character following it, except:
+ * - \a, \b, \f, \n, \t, \v match the
+ * corresponding C escape char;
+ * - if not in posix mode, when followed by a
+ * left or right round bracket (see [7]);
+ * - when followed by a digit 1 to 9 (see [8]);
+ * - when followed by a left or right angle bracket
+ * (see [9]).
+ * It is used as an escape character for all
+ * other meta-characters, and itself. When used
+ * in a set ([4]), it is treated as an ordinary
+ * character (except for escape chars).
*
* [4] [set] matches one of the characters in the set.
* If the first character in the set is "^",
* it matches a character NOT in the set, i.e.
- * complements the set. A shorthand S-E is
- * used to specify a set of characters S upto
- * E, inclusive. The special characters "]" and
- * "-" have no special meaning if they appear
- * as the first chars in the set.
+ * complements the set. A shorthand S-E (start-end)
+ * is used to specify a set of characters S upto
+ * E, inclusive. The special characters "]" and
+ * "-" have no special meaning if they appear
+ * as the first chars in the set. To include both,
+ * put - first: [-]A-Z]:
+ * [-]|] matches these 2 chars,
+ * []-|] matches from ] to | chars.
* examples: match:
*
* [a-z] any lowercase alpha
*
- * [^]-] any char except ] and -
+ * [^-]] any char except - and ]
*
* [^A-Z] any char except uppercase
* alpha
@@ -101,77 +99,77 @@
* [6] + same as [5], except it matches one or more.
*
* [7] a regular expression in the form [1] to [10], enclosed
- * as \(form\) matches what form matches. The enclosure
- * creates a set of tags, used for [8] and for
- * pattern substution. The tagged forms are numbered
- * starting from 1.
+ * as \(form\) (or (form) with posix flag) matches what
+ * form matches. The enclosure creates a set of tags,
+ * used for [8] and for pattern substitution.
+ * The tagged forms are numbered starting from 1.
*
* [8] a \ followed by a digit 1 to 9 matches whatever a
* previously tagged regular expression ([7]) matched.
*
- * [9] \< a regular expression starting with a \< construct
- * \> and/or ending with a \> construct, restricts the
- * pattern matching to the beginning of a word, and/or
- * the end of a word. A word is defined to be a character
- * string beginning and/or ending with the characters
- * A-Z a-z 0-9 and _. It must also be preceded and/or
- * followed by any character outside those mentioned.
+ * [9] \< a regular expression starting with a \< construct
+ * \> and/or ending with a \> construct, restricts the
+ * pattern matching to the beginning of a word, and/or
+ * the end of a word. A word is defined to be a character
+ * string beginning and/or ending with the characters
+ * A-Z a-z 0-9 and _. It must also be preceded and/or
+ * followed by any character outside those mentioned.
*
* [10] a composite regular expression xy where x and y
* are in the form [1] to [10] matches the longest
* match of x followed by a match for y.
*
- * [11] ^ a regular expression starting with a ^ character
- * $ and/or ending with a $ character, restricts the
+ * [11] ^ a regular expression starting with a ^ character
+ * $ and/or ending with a $ character, restricts the
* pattern matching to the beginning of the line,
* or the end of line. [anchors] Elsewhere in the
- * pattern, ^ and $ are treated as ordinary characters.
+ * pattern, ^ and $ are treated as ordinary characters.
*
*
* Acknowledgements:
*
- * HCR's Hugh Redelmeier has been most helpful in various
- * stages of development. He convinced me to include BOW
- * and EOW constructs, originally invented by Rob Pike at
- * the University of Toronto.
+ * HCR's Hugh Redelmeier has been most helpful in various
+ * stages of development. He convinced me to include BOW
+ * and EOW constructs, originally invented by Rob Pike at
+ * the University of Toronto.
*
* References:
- * Software tools Kernighan & Plauger
+ * Software tools Kernighan & Plauger
* Software tools in Pascal Kernighan & Plauger
* Grep [rsx-11 C dist] David Conroy
- * ed - text editor Un*x Programmer's Manual
- * Advanced editing on Un*x B. W. Kernighan
- * RegExp routines Henry Spencer
+ * ed - text editor Un*x Programmer's Manual
+ * Advanced editing on Un*x B. W. Kernighan
+ * RegExp routines Henry Spencer
*
* Notes:
*
* This implementation uses a bit-set representation for character
* classes for speed and compactness. Each character is represented
- * by one bit in a 128-bit block. Thus, CCL always takes a
- * constant 16 bytes in the internal nfa, and RESearch::Execute does a single
+ * by one bit in a 256-bit block. Thus, CCL always takes a
+ * constant 32 bytes in the internal nfa, and RESearch::Execute does a single
* bit comparison to locate the character in the set.
*
* Examples:
*
- * pattern: foo*.*
- * compile: CHR f CHR o CLO CHR o END CLO ANY END END
- * matches: fo foo fooo foobar fobar foxx ...
+ * pattern: foo*.*
+ * compile: CHR f CHR o CLO CHR o END CLO ANY END END
+ * matches: fo foo fooo foobar fobar foxx ...
*
- * pattern: fo[ob]a[rz]
- * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
- * matches: fobar fooar fobaz fooaz
+ * pattern: fo[ob]a[rz]
+ * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
+ * matches: fobar fooar fobaz fooaz
*
- * pattern: foo\\+
- * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
- * matches: foo\ foo\\ foo\\\ ...
+ * pattern: foo\\+
+ * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
+ * matches: foo\ foo\\ foo\\\ ...
*
- * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
- * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
- * matches: foo1foo foo2foo foo3foo
+ * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
+ * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
+ * matches: foo1foo foo2foo foo3foo
*
- * pattern: \(fo.*\)-\1
- * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
- * matches: foo-foo fo-fo fob-fob foobar-foobar ...
+ * pattern: \(fo.*\)-\1
+ * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
+ * matches: foo-foo fo-fo fob-fob foobar-foobar ...
*/
#include "CharClassify.h"
@@ -192,8 +190,8 @@
#define EOL 5
#define BOT 6
#define EOT 7
-#define BOW 8
-#define EOW 9
+#define BOW 8
+#define EOW 9
#define REF 10
#define CLO 11
@@ -203,17 +201,15 @@
* The following defines are not meant to be changeable.
* They are for readability only.
*/
-#define BLKIND 0370
-#define BITIND 07
-
-#define ASCIIB 0177
+#define BLKIND 0370
+#define BITIND 07
const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
#define badpat(x) (*nfa = END, x)
/*
- * character classification table for word boundary operators BOW
+ * Character classification table for word boundary operators BOW
* and EOW is passed in by the creator of this object (Scintilla
* Document). The Document default state is that word chars are:
* 0-9,a-z, A-Z and _
@@ -229,7 +225,7 @@ RESearch::~RESearch() {
}
void RESearch::Init() {
- sta = NOP; /* status of lastpat */
+ sta = NOP; /* status of lastpat */
bol = 0;
for (int i=0; i<MAXTAG; i++)
pat[i] = 0;
@@ -299,15 +295,15 @@ const char escapeValue(char ch) {
const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {
char *mp=nfa; /* nfa pointer */
- char *lp; /* saved pointer.. */
- char *sp=nfa; /* another one.. */
+ char *lp; /* saved pointer */
+ char *sp=nfa; /* another one */
char *mpMax = mp + MAXNFA - BITBLK - 10;
int tagi = 0; /* tag stack index */
int tagc = 1; /* actual tag count */
int n;
- char mask; /* xor mask -CCL/NCL */
+ char mask; /* xor mask -CCL/NCL */
int c1, c2;
if (!pat || !length)
@@ -317,18 +313,18 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
return badpat("No previous regular expression");
sta = NOP;
- const char *p=pat; /* pattern pointer */
+ const char *p=pat; /* pattern pointer */
for (int i=0; i<length; i++, p++) {
if (mp > mpMax)
return badpat("Pattern too long");
lp = mp;
switch(*p) {
- case '.': /* match any char.. */
+ case '.': /* match any char */
*mp++ = ANY;
break;
- case '^': /* match beginning.. */
+ case '^': /* match beginning */
if (p == pat)
*mp++ = BOL;
else {
@@ -337,7 +333,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
}
break;
- case '$': /* match endofline.. */
+ case '$': /* match endofline */
if (!*(p+1))
*mp++ = EOL;
else {
@@ -346,7 +342,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
}
break;
- case '[': /* match char class..*/
+ case '[': /* match char class */
*mp++ = CCL;
i++;
@@ -357,7 +353,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
} else
mask = 0;
- if (*p == '-') { /* real dash */
+ if (*p == '-') { /* real dash */
i++;
ChSet(*p++);
}
@@ -398,12 +394,12 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
break;
- case '*': /* match 0 or more.. */
- case '+': /* match 1 or more.. */
+ case '*': /* match 0 or more... */
+ case '+': /* match 1 or more... */
if (p == pat)
return badpat("Empty closure");
lp = sp; /* previous opcode */
- if (*lp == CLO) /* equivalence.. */
+ if (*lp == CLO) /* equivalence... */
break;
switch(*lp) {
@@ -431,7 +427,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
mp = sp;
break;
- case '\\': /* tags, backrefs .. */
+ case '\\': /* tags, backrefs... */
i++;
switch(*++p) {
@@ -497,7 +493,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
}
break;
- default : /* an ordinary char */
+ default : /* an ordinary char */
if (posix && *p == '(') {
if (tagc < MAXTAG) {
tagstk[++tagi] = tagc;
@@ -538,23 +534,23 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b
/*
* RESearch::Execute:
- * execute nfa to find a match.
- *
- * special cases: (nfa[0])
- * BOL
- * Match only once, starting from the
- * beginning.
- * CHR
- * First locate the character without
- * calling PMatch, and if found, call
- * PMatch for the remaining string.
- * END
- * RESearch::Compile failed, poor luser did not
- * check for it. Fail fast.
- *
- * If a match is found, bopat[0] and eopat[0] are set
- * to the beginning and the end of the matched fragment,
- * respectively.
+ * execute nfa to find a match.
+ *
+ * special cases: (nfa[0])
+ * BOL
+ * Match only once, starting from the
+ * beginning.
+ * CHR
+ * First locate the character without
+ * calling PMatch, and if found, call
+ * PMatch for the remaining string.
+ * END
+ * RESearch::Compile failed, poor luser did not
+ * check for it. Fail fast.
+ *
+ * If a match is found, bopat[0] and eopat[0] are set
+ * to the beginning and the end of the matched fragment,
+ * respectively.
*
*/
@@ -585,7 +581,7 @@ int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
c = *(ap+1);
while ((lp < endp) && (ci.CharAt(lp) != c))
lp++;
- if (lp >= endp) /* if EOS, fail, else fall thru. */
+ if (lp >= endp) /* if EOS, fail, else fall thru. */
return 0;
default: /* regular matching all the way. */
while (lp < endp) {
@@ -609,51 +605,50 @@ int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
/*
* PMatch: internal routine for the hard part
*
- * This code is partly snarfed from an early grep written by
- * David Conroy. The backref and tag stuff, and various other
- * innovations are by oz.
- *
- * special case optimizations: (nfa[n], nfa[n+1])
- * CLO ANY
- * We KNOW .* will match everything upto the
- * end of line. Thus, directly go to the end of
- * line, without recursive PMatch calls. As in
- * the other closure cases, the remaining pattern
- * must be matched by moving backwards on the
- * string recursively, to find a match for xy
- * (x is ".*" and y is the remaining pattern)
- * where the match satisfies the LONGEST match for
- * x followed by a match for y.
- * CLO CHR
- * We can again scan the string forward for the
- * single char and at the point of failure, we
- * execute the remaining nfa recursively, same as
- * above.
- *
- * At the end of a successful match, bopat[n] and eopat[n]
- * are set to the beginning and end of subpatterns matched
- * by tagged expressions (n = 1 to 9).
- *
+ * This code is partly snarfed from an early grep written by
+ * David Conroy. The backref and tag stuff, and various other
+ * innovations are by oz.
+ *
+ * special case optimizations: (nfa[n], nfa[n+1])
+ * CLO ANY
+ * We KNOW .* will match everything upto the
+ * end of line. Thus, directly go to the end of
+ * line, without recursive PMatch calls. As in
+ * the other closure cases, the remaining pattern
+ * must be matched by moving backwards on the
+ * string recursively, to find a match for xy
+ * (x is ".*" and y is the remaining pattern)
+ * where the match satisfies the LONGEST match for
+ * x followed by a match for y.
+ * CLO CHR
+ * We can again scan the string forward for the
+ * single char and at the point of failure, we
+ * execute the remaining nfa recursively, same as
+ * above.
+ *
+ * At the end of a successful match, bopat[n] and eopat[n]
+ * are set to the beginning and end of subpatterns matched
+ * by tagged expressions (n = 1 to 9).
*/
extern void re_fail(char *,char);
-#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
+#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
/*
* skip values for CLO XXX to skip past the closure
*/
-#define ANYSKIP 2 /* [CLO] ANY END ... */
-#define CHRSKIP 3 /* [CLO] CHR chr END ... */
-#define CCLSKIP 34 /* [CLO] CCL 32bytes END ... */
+#define ANYSKIP 2 /* [CLO] ANY END */
+#define CHRSKIP 3 /* [CLO] CHR chr END */
+#define CCLSKIP 34 /* [CLO] CCL 32 bytes END */
int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
int op, c, n;
- int e; /* extra pointer for CLO */
- int bp; /* beginning of subpat.. */
- int ep; /* ending of subpat.. */
- int are; /* to save the line ptr. */
+ int e; /* extra pointer for CLO */
+ int bp; /* beginning of subpat... */
+ int ep; /* ending of subpat... */
+ int are; /* to save the line ptr. */
while ((op = *ap++) != END)
switch(op) {
@@ -745,13 +740,13 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
/*
* RESearch::Substitute:
- * substitute the matched portions of the src in dst.
+ * substitute the matched portions of the src in dst.
*
- * & substitute the entire matched pattern.
+ * & substitute the entire matched pattern.
*
- * \digit substitute a subpattern, with the given tag number.
- * Tags are numbered from 1 to 9. If the particular
- * tagged subpattern does not exist, null is substituted.
+ * \digit substitute a subpattern, with the given tag number.
+ * Tags are numbered from 1 to 9. If the particular
+ * tagged subpattern does not exist, null is substituted.
*/
int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
char c;
diff --git a/src/RESearch.h b/src/RESearch.h
index 25205951f..aa8557918 100644
--- a/src/RESearch.h
+++ b/src/RESearch.h
@@ -29,11 +29,7 @@ class RESearch {
public:
RESearch(CharClassify *charClassTable);
~RESearch();
- void Init();
- void Clear();
bool GrabMatches(CharacterIndexer &ci);
- void ChSet(char c);
- void ChSetWithCase(char c, bool caseSensitive);
const char *Compile(const char *pat, int length, bool caseSensitive, bool posix);
int Execute(CharacterIndexer &ci, int lp, int endp);
int Substitute(CharacterIndexer &ci, char *src, char *dst);
@@ -47,14 +43,18 @@ public:
char *pat[MAXTAG];
private:
+ void Init();
+ void Clear();
+ void ChSet(char c);
+ void ChSetWithCase(char c, bool caseSensitive);
+
int PMatch(CharacterIndexer &ci, int lp, int endp, char *ap);
int bol;
- int tagstk[MAXTAG]; /* subpat tag stack..*/
- char nfa[MAXNFA]; /* automaton.. */
+ int tagstk[MAXTAG]; /* subpat tag stack */
+ char nfa[MAXNFA]; /* automaton */
int sta;
- char bittab[BITBLK]; /* bit table for CCL */
- /* pre-set bits... */
+ char bittab[BITBLK]; /* bit table for CCL pre-set bits */
int failure;
CharClassify *charClass;
bool iswordc(unsigned char x) {