diff options
author | nyamatongwe <devnull@localhost> | 2006-02-27 00:00:25 +0000 |
---|---|---|
committer | nyamatongwe <devnull@localhost> | 2006-02-27 00:00:25 +0000 |
commit | cca359f1d365a3ee3db3892fbdec8e141b212afc (patch) | |
tree | 53ce864408dc58548d84fc2b64b716ac4fc598b8 /src | |
parent | 38a5a1bfddd9beab8370970084514eba5cfd8486 (diff) | |
download | scintilla-mirror-cca359f1d365a3ee3db3892fbdec8e141b212afc.tar.gz |
Patch from Philippe makes some methods private, improve comments and
whitespace.
Diffstat (limited to 'src')
-rw-r--r-- | src/RESearch.cxx | 301 | ||||
-rw-r--r-- | src/RESearch.h | 16 |
2 files changed, 156 insertions, 161 deletions
diff --git a/src/RESearch.cxx b/src/RESearch.cxx index f1fda7304..00e183060 100644 --- a/src/RESearch.cxx +++ b/src/RESearch.cxx @@ -4,7 +4,7 @@ **/ /* - * regex - Regular expression pattern matching and replacement + * regex - Regular expression pattern matching and replacement * * By: Ozan S. Yigit (oz) * Dept. of Computer Science @@ -15,7 +15,7 @@ * Removed all use of register. * Converted to modern function prototypes. * Put all global/static variables into an object so this code can be - * used from multiple threads etc. + * used from multiple threads, etc. * * These routines are the PUBLIC DOMAIN equivalents of regex * routines as found in 4.nBSD UN*X, with minor extensions. @@ -30,64 +30,62 @@ * Modification history removed. * * Interfaces: - * RESearch::Compile: compile a regular expression into a NFA. + * RESearch::Compile: compile a regular expression into a NFA. * - * char *RESearch::Compile(s) - * char *s; + * const char *RESearch::Compile(const char *pat, int length, + * bool caseSensitive, bool posix) * - * RESearch::Execute: execute the NFA to match a pattern. + * Returns a short error string if they fail. * - * int RESearch::Execute(s) - * char *s; + * RESearch::Execute: execute the NFA to match a pattern. * - * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word" - * looks like (for \< and \>) by adding into the - * hidden word-syntax table. + * int RESearch::Execute(characterIndexer &ci, int lp, int endp) * - * void RESearch::ModifyWord(s) - * char *s; + * RESearch::Substitute: substitute the matched portions in a new string. * - * RESearch::Substitute: substitute the matched portions in a new string. + * int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) * - * int RESearch::Substitute(src, dst) - * char *src; - * char *dst; + * re_fail: failure routine for RESearch::Execute. (no longer used) * - * re_fail: failure routine for RESearch::Execute. - * - * void re_fail(msg, op) - * char *msg; - * char op; + * void re_fail(char *msg, char op) * * Regular Expressions: * * [1] char matches itself, unless it is a special * character (metachar): . \ [ ] * + ^ $ + * and ( ) if posix option. * * [2] . matches any character. * - * [3] \ matches the character following it, except - * when followed by a left or right round bracket, - * a digit 1 to 9 or a left or right angle bracket. - * (see [7], [8] and [9]) - * It is used as an escape character for all - * other meta-characters, and itself. When used - * in a set ([4]), it is treated as an ordinary - * character. + * [3] \ matches the character following it, except: + * - \a, \b, \f, \n, \t, \v match the + * corresponding C escape char; + * - if not in posix mode, when followed by a + * left or right round bracket (see [7]); + * - when followed by a digit 1 to 9 (see [8]); + * - when followed by a left or right angle bracket + * (see [9]). + * It is used as an escape character for all + * other meta-characters, and itself. When used + * in a set ([4]), it is treated as an ordinary + * character (except for escape chars). * * [4] [set] matches one of the characters in the set. * If the first character in the set is "^", * it matches a character NOT in the set, i.e. - * complements the set. A shorthand S-E is - * used to specify a set of characters S upto - * E, inclusive. The special characters "]" and - * "-" have no special meaning if they appear - * as the first chars in the set. + * complements the set. A shorthand S-E (start-end) + * is used to specify a set of characters S upto + * E, inclusive. The special characters "]" and + * "-" have no special meaning if they appear + * as the first chars in the set. To include both, + * put - first: [-]A-Z]: + * [-]|] matches these 2 chars, + * []-|] matches from ] to | chars. * examples: match: * * [a-z] any lowercase alpha * - * [^]-] any char except ] and - + * [^-]] any char except - and ] * * [^A-Z] any char except uppercase * alpha @@ -101,77 +99,77 @@ * [6] + same as [5], except it matches one or more. * * [7] a regular expression in the form [1] to [10], enclosed - * as \(form\) matches what form matches. The enclosure - * creates a set of tags, used for [8] and for - * pattern substution. The tagged forms are numbered - * starting from 1. + * as \(form\) (or (form) with posix flag) matches what + * form matches. The enclosure creates a set of tags, + * used for [8] and for pattern substitution. + * The tagged forms are numbered starting from 1. * * [8] a \ followed by a digit 1 to 9 matches whatever a * previously tagged regular expression ([7]) matched. * - * [9] \< a regular expression starting with a \< construct - * \> and/or ending with a \> construct, restricts the - * pattern matching to the beginning of a word, and/or - * the end of a word. A word is defined to be a character - * string beginning and/or ending with the characters - * A-Z a-z 0-9 and _. It must also be preceded and/or - * followed by any character outside those mentioned. + * [9] \< a regular expression starting with a \< construct + * \> and/or ending with a \> construct, restricts the + * pattern matching to the beginning of a word, and/or + * the end of a word. A word is defined to be a character + * string beginning and/or ending with the characters + * A-Z a-z 0-9 and _. It must also be preceded and/or + * followed by any character outside those mentioned. * * [10] a composite regular expression xy where x and y * are in the form [1] to [10] matches the longest * match of x followed by a match for y. * - * [11] ^ a regular expression starting with a ^ character - * $ and/or ending with a $ character, restricts the + * [11] ^ a regular expression starting with a ^ character + * $ and/or ending with a $ character, restricts the * pattern matching to the beginning of the line, * or the end of line. [anchors] Elsewhere in the - * pattern, ^ and $ are treated as ordinary characters. + * pattern, ^ and $ are treated as ordinary characters. * * * Acknowledgements: * - * HCR's Hugh Redelmeier has been most helpful in various - * stages of development. He convinced me to include BOW - * and EOW constructs, originally invented by Rob Pike at - * the University of Toronto. + * HCR's Hugh Redelmeier has been most helpful in various + * stages of development. He convinced me to include BOW + * and EOW constructs, originally invented by Rob Pike at + * the University of Toronto. * * References: - * Software tools Kernighan & Plauger + * Software tools Kernighan & Plauger * Software tools in Pascal Kernighan & Plauger * Grep [rsx-11 C dist] David Conroy - * ed - text editor Un*x Programmer's Manual - * Advanced editing on Un*x B. W. Kernighan - * RegExp routines Henry Spencer + * ed - text editor Un*x Programmer's Manual + * Advanced editing on Un*x B. W. Kernighan + * RegExp routines Henry Spencer * * Notes: * * This implementation uses a bit-set representation for character * classes for speed and compactness. Each character is represented - * by one bit in a 128-bit block. Thus, CCL always takes a - * constant 16 bytes in the internal nfa, and RESearch::Execute does a single + * by one bit in a 256-bit block. Thus, CCL always takes a + * constant 32 bytes in the internal nfa, and RESearch::Execute does a single * bit comparison to locate the character in the set. * * Examples: * - * pattern: foo*.* - * compile: CHR f CHR o CLO CHR o END CLO ANY END END - * matches: fo foo fooo foobar fobar foxx ... + * pattern: foo*.* + * compile: CHR f CHR o CLO CHR o END CLO ANY END END + * matches: fo foo fooo foobar fobar foxx ... * - * pattern: fo[ob]a[rz] - * compile: CHR f CHR o CCL bitset CHR a CCL bitset END - * matches: fobar fooar fobaz fooaz + * pattern: fo[ob]a[rz] + * compile: CHR f CHR o CCL bitset CHR a CCL bitset END + * matches: fobar fooar fobaz fooaz * - * pattern: foo\\+ - * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END - * matches: foo\ foo\\ foo\\\ ... + * pattern: foo\\+ + * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END + * matches: foo\ foo\\ foo\\\ ... * - * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo) - * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END - * matches: foo1foo foo2foo foo3foo + * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo) + * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END + * matches: foo1foo foo2foo foo3foo * - * pattern: \(fo.*\)-\1 - * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END - * matches: foo-foo fo-fo fob-fob foobar-foobar ... + * pattern: \(fo.*\)-\1 + * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END + * matches: foo-foo fo-fo fob-fob foobar-foobar ... */ #include "CharClassify.h" @@ -192,8 +190,8 @@ #define EOL 5 #define BOT 6 #define EOT 7 -#define BOW 8 -#define EOW 9 +#define BOW 8 +#define EOW 9 #define REF 10 #define CLO 11 @@ -203,17 +201,15 @@ * The following defines are not meant to be changeable. * They are for readability only. */ -#define BLKIND 0370 -#define BITIND 07 - -#define ASCIIB 0177 +#define BLKIND 0370 +#define BITIND 07 const char bitarr[] = {1,2,4,8,16,32,64,'\200'}; #define badpat(x) (*nfa = END, x) /* - * character classification table for word boundary operators BOW + * Character classification table for word boundary operators BOW * and EOW is passed in by the creator of this object (Scintilla * Document). The Document default state is that word chars are: * 0-9,a-z, A-Z and _ @@ -229,7 +225,7 @@ RESearch::~RESearch() { } void RESearch::Init() { - sta = NOP; /* status of lastpat */ + sta = NOP; /* status of lastpat */ bol = 0; for (int i=0; i<MAXTAG; i++) pat[i] = 0; @@ -299,15 +295,15 @@ const char escapeValue(char ch) { const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) { char *mp=nfa; /* nfa pointer */ - char *lp; /* saved pointer.. */ - char *sp=nfa; /* another one.. */ + char *lp; /* saved pointer */ + char *sp=nfa; /* another one */ char *mpMax = mp + MAXNFA - BITBLK - 10; int tagi = 0; /* tag stack index */ int tagc = 1; /* actual tag count */ int n; - char mask; /* xor mask -CCL/NCL */ + char mask; /* xor mask -CCL/NCL */ int c1, c2; if (!pat || !length) @@ -317,18 +313,18 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b return badpat("No previous regular expression"); sta = NOP; - const char *p=pat; /* pattern pointer */ + const char *p=pat; /* pattern pointer */ for (int i=0; i<length; i++, p++) { if (mp > mpMax) return badpat("Pattern too long"); lp = mp; switch(*p) { - case '.': /* match any char.. */ + case '.': /* match any char */ *mp++ = ANY; break; - case '^': /* match beginning.. */ + case '^': /* match beginning */ if (p == pat) *mp++ = BOL; else { @@ -337,7 +333,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b } break; - case '$': /* match endofline.. */ + case '$': /* match endofline */ if (!*(p+1)) *mp++ = EOL; else { @@ -346,7 +342,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b } break; - case '[': /* match char class..*/ + case '[': /* match char class */ *mp++ = CCL; i++; @@ -357,7 +353,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b } else mask = 0; - if (*p == '-') { /* real dash */ + if (*p == '-') { /* real dash */ i++; ChSet(*p++); } @@ -398,12 +394,12 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b break; - case '*': /* match 0 or more.. */ - case '+': /* match 1 or more.. */ + case '*': /* match 0 or more... */ + case '+': /* match 1 or more... */ if (p == pat) return badpat("Empty closure"); lp = sp; /* previous opcode */ - if (*lp == CLO) /* equivalence.. */ + if (*lp == CLO) /* equivalence... */ break; switch(*lp) { @@ -431,7 +427,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b mp = sp; break; - case '\\': /* tags, backrefs .. */ + case '\\': /* tags, backrefs... */ i++; switch(*++p) { @@ -497,7 +493,7 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b } break; - default : /* an ordinary char */ + default : /* an ordinary char */ if (posix && *p == '(') { if (tagc < MAXTAG) { tagstk[++tagi] = tagc; @@ -538,23 +534,23 @@ const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, b /* * RESearch::Execute: - * execute nfa to find a match. - * - * special cases: (nfa[0]) - * BOL - * Match only once, starting from the - * beginning. - * CHR - * First locate the character without - * calling PMatch, and if found, call - * PMatch for the remaining string. - * END - * RESearch::Compile failed, poor luser did not - * check for it. Fail fast. - * - * If a match is found, bopat[0] and eopat[0] are set - * to the beginning and the end of the matched fragment, - * respectively. + * execute nfa to find a match. + * + * special cases: (nfa[0]) + * BOL + * Match only once, starting from the + * beginning. + * CHR + * First locate the character without + * calling PMatch, and if found, call + * PMatch for the remaining string. + * END + * RESearch::Compile failed, poor luser did not + * check for it. Fail fast. + * + * If a match is found, bopat[0] and eopat[0] are set + * to the beginning and the end of the matched fragment, + * respectively. * */ @@ -585,7 +581,7 @@ int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) { c = *(ap+1); while ((lp < endp) && (ci.CharAt(lp) != c)) lp++; - if (lp >= endp) /* if EOS, fail, else fall thru. */ + if (lp >= endp) /* if EOS, fail, else fall thru. */ return 0; default: /* regular matching all the way. */ while (lp < endp) { @@ -609,51 +605,50 @@ int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) { /* * PMatch: internal routine for the hard part * - * This code is partly snarfed from an early grep written by - * David Conroy. The backref and tag stuff, and various other - * innovations are by oz. - * - * special case optimizations: (nfa[n], nfa[n+1]) - * CLO ANY - * We KNOW .* will match everything upto the - * end of line. Thus, directly go to the end of - * line, without recursive PMatch calls. As in - * the other closure cases, the remaining pattern - * must be matched by moving backwards on the - * string recursively, to find a match for xy - * (x is ".*" and y is the remaining pattern) - * where the match satisfies the LONGEST match for - * x followed by a match for y. - * CLO CHR - * We can again scan the string forward for the - * single char and at the point of failure, we - * execute the remaining nfa recursively, same as - * above. - * - * At the end of a successful match, bopat[n] and eopat[n] - * are set to the beginning and end of subpatterns matched - * by tagged expressions (n = 1 to 9). - * + * This code is partly snarfed from an early grep written by + * David Conroy. The backref and tag stuff, and various other + * innovations are by oz. + * + * special case optimizations: (nfa[n], nfa[n+1]) + * CLO ANY + * We KNOW .* will match everything upto the + * end of line. Thus, directly go to the end of + * line, without recursive PMatch calls. As in + * the other closure cases, the remaining pattern + * must be matched by moving backwards on the + * string recursively, to find a match for xy + * (x is ".*" and y is the remaining pattern) + * where the match satisfies the LONGEST match for + * x followed by a match for y. + * CLO CHR + * We can again scan the string forward for the + * single char and at the point of failure, we + * execute the remaining nfa recursively, same as + * above. + * + * At the end of a successful match, bopat[n] and eopat[n] + * are set to the beginning and end of subpatterns matched + * by tagged expressions (n = 1 to 9). */ extern void re_fail(char *,char); -#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND]) +#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND]) /* * skip values for CLO XXX to skip past the closure */ -#define ANYSKIP 2 /* [CLO] ANY END ... */ -#define CHRSKIP 3 /* [CLO] CHR chr END ... */ -#define CCLSKIP 34 /* [CLO] CCL 32bytes END ... */ +#define ANYSKIP 2 /* [CLO] ANY END */ +#define CHRSKIP 3 /* [CLO] CHR chr END */ +#define CCLSKIP 34 /* [CLO] CCL 32 bytes END */ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) { int op, c, n; - int e; /* extra pointer for CLO */ - int bp; /* beginning of subpat.. */ - int ep; /* ending of subpat.. */ - int are; /* to save the line ptr. */ + int e; /* extra pointer for CLO */ + int bp; /* beginning of subpat... */ + int ep; /* ending of subpat... */ + int are; /* to save the line ptr. */ while ((op = *ap++) != END) switch(op) { @@ -745,13 +740,13 @@ int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) { /* * RESearch::Substitute: - * substitute the matched portions of the src in dst. + * substitute the matched portions of the src in dst. * - * & substitute the entire matched pattern. + * & substitute the entire matched pattern. * - * \digit substitute a subpattern, with the given tag number. - * Tags are numbered from 1 to 9. If the particular - * tagged subpattern does not exist, null is substituted. + * \digit substitute a subpattern, with the given tag number. + * Tags are numbered from 1 to 9. If the particular + * tagged subpattern does not exist, null is substituted. */ int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) { char c; diff --git a/src/RESearch.h b/src/RESearch.h index 25205951f..aa8557918 100644 --- a/src/RESearch.h +++ b/src/RESearch.h @@ -29,11 +29,7 @@ class RESearch { public: RESearch(CharClassify *charClassTable); ~RESearch(); - void Init(); - void Clear(); bool GrabMatches(CharacterIndexer &ci); - void ChSet(char c); - void ChSetWithCase(char c, bool caseSensitive); const char *Compile(const char *pat, int length, bool caseSensitive, bool posix); int Execute(CharacterIndexer &ci, int lp, int endp); int Substitute(CharacterIndexer &ci, char *src, char *dst); @@ -47,14 +43,18 @@ public: char *pat[MAXTAG]; private: + void Init(); + void Clear(); + void ChSet(char c); + void ChSetWithCase(char c, bool caseSensitive); + int PMatch(CharacterIndexer &ci, int lp, int endp, char *ap); int bol; - int tagstk[MAXTAG]; /* subpat tag stack..*/ - char nfa[MAXNFA]; /* automaton.. */ + int tagstk[MAXTAG]; /* subpat tag stack */ + char nfa[MAXNFA]; /* automaton */ int sta; - char bittab[BITBLK]; /* bit table for CCL */ - /* pre-set bits... */ + char bittab[BITBLK]; /* bit table for CCL pre-set bits */ int failure; CharClassify *charClass; bool iswordc(unsigned char x) { |