diff options
Diffstat (limited to 'lexers/LexPerl.cxx')
-rw-r--r-- | lexers/LexPerl.cxx | 342 |
1 files changed, 282 insertions, 60 deletions
diff --git a/lexers/LexPerl.cxx b/lexers/LexPerl.cxx index 5efc1d636..8a0f6422e 100644 --- a/lexers/LexPerl.cxx +++ b/lexers/LexPerl.cxx @@ -69,6 +69,10 @@ using namespace Scintilla; #define BACK_OPERATOR 1 // whitespace/comments are insignificant #define BACK_KEYWORD 2 // operators/keywords are needed for disambiguation +// all interpolated styles are different from their parent styles by a constant difference +// we also assume SCE_PL_STRING_VAR is the interpolated style with the smallest value +#define INTERPOLATE_SHIFT (SCE_PL_STRING_VAR - SCE_PL_STRING) + static bool isPerlKeyword(unsigned int start, unsigned int end, WordList &keywords, LexAccessor &styler) { // old-style keyword matcher; needed because GetCurrent() needs // current segment to be committed, but we may abandon early... @@ -352,11 +356,19 @@ struct OptionSetPerl : public OptionSet<OptionsPerl> { }; class LexerPerl : public ILexer { + CharacterSet setWordStart; + CharacterSet setWord; + CharacterSet setSpecialVar; + CharacterSet setControlVar; WordList keywords; OptionsPerl options; OptionSetPerl osPerl; public: - LexerPerl() { + LexerPerl() : + setWordStart(CharacterSet::setAlpha, "_", 0x80, true), + setWord(CharacterSet::setAlphaNum, "_", 0x80, true), + setSpecialVar(CharacterSet::setNone, "\"$;<>&`'+,./\\%:=~!?@[]"), + setControlVar(CharacterSet::setNone, "ACDEFHILMNOPRSTVWX") { } ~LexerPerl() { } @@ -390,6 +402,7 @@ public: static ILexer *LexerFactoryPerl() { return new LexerPerl(); } + void InterpolateSegment(StyleContext &sc, int maxSeg, bool isPattern=false); }; int SCI_METHOD LexerPerl::PropertySet(const char *key, const char *val) { @@ -418,6 +431,90 @@ int SCI_METHOD LexerPerl::WordListSet(int n, const char *wl) { return firstModification; } +void LexerPerl::InterpolateSegment(StyleContext &sc, int maxSeg, bool isPattern) { + // interpolate a segment (with no active backslashes or delimiters within) + // switch in or out of an interpolation style or continue current style + // commit variable patterns if found, trim segment, repeat until done + while (maxSeg > 0) { + bool isVar = false; + int sLen = 0; + if ((maxSeg > 1) && (sc.ch == '$' || sc.ch == '@')) { + // $#[$]*word [$@][$]*word (where word or {word} is always present) + bool braces = false; + sLen = 1; + if (sc.ch == '$' && sc.chNext == '#') { // starts with $# + sLen++; + } + while ((maxSeg > sLen) && (sc.GetRelative(sLen) == '$')) // >0 $ dereference within + sLen++; + if ((maxSeg > sLen) && (sc.GetRelative(sLen) == '{')) { // { start for {word} + sLen++; + braces = true; + } + if (maxSeg > sLen) { + int c = sc.GetRelative(sLen); + if (setWordStart.Contains(c)) { // word (various) + sLen++; + isVar = true; + while ((maxSeg > sLen) && setWord.Contains(sc.GetRelative(sLen))) + sLen++; + } else if (braces && IsADigit(c) && (sLen == 2)) { // digit for ${digit} + sLen++; + isVar = true; + } + } + if (braces) { + if ((maxSeg > sLen) && (sc.GetRelative(sLen) == '}')) { // } end for {word} + sLen++; + } else + isVar = false; + } + } + if (!isVar && (maxSeg > 1)) { // $- or @-specific variable patterns + sLen = 1; + int c = sc.chNext; + if (sc.ch == '$') { + if (IsADigit(c)) { // $[0-9] and slurp trailing digits + sLen++; + isVar = true; + while ((maxSeg > sLen) && IsADigit(sc.GetRelative(sLen))) + sLen++; + } else if (setSpecialVar.Contains(c)) { // $ special variables + sLen++; + isVar = true; + } else if (!isPattern && ((c == '(') || (c == ')') || (c == '|'))) { // $ additional + sLen++; + isVar = true; + } else if (c == '^') { // $^A control-char style + sLen++; + if ((maxSeg > sLen) && setControlVar.Contains(sc.GetRelative(sLen))) { + sLen++; + isVar = true; + } + } + } else if (sc.ch == '@') { + if (!isPattern && ((c == '+') || (c == '-'))) { // @ specials non-pattern + sLen++; + isVar = true; + } + } + } + if (isVar) { // commit as interpolated variable or normal character + if (sc.state < SCE_PL_STRING_VAR) + sc.SetState(sc.state + INTERPOLATE_SHIFT); + sc.Forward(sLen); + maxSeg -= sLen; + } else { + if (sc.state >= SCE_PL_STRING_VAR) + sc.SetState(sc.state - INTERPOLATE_SHIFT); + sc.Forward(); + maxSeg--; + } + } + if (sc.state >= SCE_PL_STRING_VAR) + sc.SetState(sc.state - INTERPOLATE_SHIFT); +} + void SCI_METHOD LexerPerl::Lex(unsigned int startPos, int length, int initStyle, IDocument *pAccess) { LexAccessor styler(pAccess); @@ -426,8 +523,6 @@ void SCI_METHOD LexerPerl::Lex(unsigned int startPos, int length, int initStyle, reWords.Set("elsif if split while"); // charset classes - CharacterSet setWordStart(CharacterSet::setAlpha, "_", 0x80, true); - CharacterSet setWord(CharacterSet::setAlphaNum, "_", 0x80, true); CharacterSet setSingleCharOp(CharacterSet::setNone, "rwxoRWXOezsfdlpSbctugkTBMAC"); // lexing of "%*</" operators is non-trivial; these are missing in the set below CharacterSet setPerlOperator(CharacterSet::setNone, "^&\\()-+=|{}[]:;>,?!.~"); @@ -512,10 +607,13 @@ void SCI_METHOD LexerPerl::Lex(unsigned int startPos, int length, int initStyle, // Includes strings (may be multi-line), numbers (additional state), format // bodies, as well as POD sections. if (initStyle == SCE_PL_HERE_Q - || initStyle == SCE_PL_HERE_QQ - || initStyle == SCE_PL_HERE_QX - || initStyle == SCE_PL_FORMAT + || initStyle == SCE_PL_HERE_QQ + || initStyle == SCE_PL_HERE_QX + || initStyle == SCE_PL_FORMAT + || initStyle == SCE_PL_HERE_QQ_VAR + || initStyle == SCE_PL_HERE_QX_VAR ) { + // backtrack through multiple styles to reach the delimiter start int delim = (initStyle == SCE_PL_FORMAT) ? SCE_PL_FORMAT_IDENT:SCE_PL_HERE_DELIM; while ((startPos > 1) && (styler.StyleAt(startPos) != delim)) { startPos--; @@ -523,15 +621,34 @@ void SCI_METHOD LexerPerl::Lex(unsigned int startPos, int length, int initStyle, startPos = styler.LineStart(styler.GetLine(startPos)); initStyle = styler.StyleAt(startPos - 1); } - if (initStyle == SCE_PL_STRING_Q - || initStyle == SCE_PL_STRING_QQ - || initStyle == SCE_PL_STRING_QX - || initStyle == SCE_PL_STRING_QR + if (initStyle == SCE_PL_STRING + || initStyle == SCE_PL_STRING_QQ + || initStyle == SCE_PL_BACKTICKS + || initStyle == SCE_PL_STRING_QX + || initStyle == SCE_PL_REGEX + || initStyle == SCE_PL_STRING_QR + || initStyle == SCE_PL_REGSUBST + || initStyle == SCE_PL_STRING_VAR + || initStyle == SCE_PL_STRING_QQ_VAR + || initStyle == SCE_PL_BACKTICKS_VAR + || initStyle == SCE_PL_STRING_QX_VAR + || initStyle == SCE_PL_REGEX_VAR + || initStyle == SCE_PL_STRING_QR_VAR + || initStyle == SCE_PL_REGSUBST_VAR + ) { + // for interpolation, must backtrack through a mix of two different styles + int otherStyle = (initStyle >= SCE_PL_STRING_VAR) ? + initStyle - INTERPOLATE_SHIFT : initStyle + INTERPOLATE_SHIFT; + while (startPos > 1) { + int st = styler.StyleAt(startPos - 1); + if ((st != initStyle) && (st != otherStyle)) + break; + startPos--; + } + initStyle = SCE_PL_DEFAULT; + } else if (initStyle == SCE_PL_STRING_Q || initStyle == SCE_PL_STRING_QW - || initStyle == SCE_PL_REGEX - || initStyle == SCE_PL_REGSUBST - || initStyle == SCE_PL_STRING - || initStyle == SCE_PL_BACKTICKS + || initStyle == SCE_PL_XLAT || initStyle == SCE_PL_CHARACTER || initStyle == SCE_PL_NUMBER || initStyle == SCE_PL_IDENTIFIER @@ -762,19 +879,49 @@ void SCI_METHOD LexerPerl::Lex(unsigned int startPos, int length, int initStyle, break; case SCE_PL_HERE_Q: case SCE_PL_HERE_QQ: - case SCE_PL_HERE_QX: { - // also implies HereDoc.State == 2 - sc.Complete(); - if (HereDoc.DelimiterLength == 0 || sc.Match(HereDoc.Delimiter)) { + case SCE_PL_HERE_QX: + // also implies HereDoc.State == 2 + sc.Complete(); + if (HereDoc.DelimiterLength == 0 || sc.Match(HereDoc.Delimiter)) { + int c = sc.GetRelative(HereDoc.DelimiterLength); + if (c == '\r' || c == '\n') { // peek first, do not consume match sc.Forward(HereDoc.DelimiterLength); - if (sc.atLineEnd || ((sc.ch == '\r' && sc.chNext == '\n'))) { sc.SetState(SCE_PL_DEFAULT); backFlag = BACK_NONE; HereDoc.State = 0; + if (!sc.atLineEnd) + sc.Forward(); + break; } } + if (sc.state == SCE_PL_HERE_Q) { // \EOF and 'EOF' non-interpolated while (!sc.atLineEnd) sc.Forward(); + break; + } + while (!sc.atLineEnd) { // "EOF" and `EOF` interpolated + int s = 0, endType = 0; + int maxSeg = endPos - sc.currentPos; + while (s < maxSeg) { // scan to break string into segments + int c = sc.GetRelative(s); + if (c == '\\') { + endType = 1; break; + } else if (c == '\r' || c == '\n') { + endType = 2; break; + } + s++; + } + if (s > 0) // process non-empty segments + InterpolateSegment(sc, s); + if (endType == 1) { + sc.Forward(); + // \ at end-of-line does not appear to have any effect, skip + if (sc.ch != '\r' && sc.ch != '\n') + sc.Forward(); + } else if (endType == 2) { + if (!sc.atLineEnd) + sc.Forward(); + } } break; case SCE_PL_POD: @@ -826,45 +973,89 @@ void SCI_METHOD LexerPerl::Lex(unsigned int startPos, int length, int initStyle, sc.SetState(SCE_PL_DEFAULT); } else if (!Quote.Up && !IsASpace(sc.ch)) { Quote.Open(sc.ch); - } else if (sc.ch == '\\' && Quote.Up != '\\') { - sc.Forward(); - } else if (sc.ch == Quote.Down) { - Quote.Count--; - if (Quote.Count == 0) - Quote.Rep--; - } else if (sc.ch == Quote.Up) { - Quote.Count++; + } else { + int s = 0, endType = 0; + int maxSeg = endPos - sc.currentPos; + while (s < maxSeg) { // scan to break string into segments + int c = sc.GetRelative(s); + if (IsASpace(c)) { + break; + } else if (c == '\\' && Quote.Up != '\\') { + endType = 1; break; + } else if (c == Quote.Down) { + Quote.Count--; + if (Quote.Count == 0) { + Quote.Rep--; + break; + } + } else if (c == Quote.Up) + Quote.Count++; + s++; + } + if (s > 0) { // process non-empty segments + if (Quote.Up != '\'') { + InterpolateSegment(sc, s, true); + } else // non-interpolated path + sc.Forward(s); + } + if (endType == 1) + sc.Forward(); } break; case SCE_PL_REGSUBST: + case SCE_PL_XLAT: if (Quote.Rep <= 0) { if (!setModifiers.Contains(sc.ch)) sc.SetState(SCE_PL_DEFAULT); } else if (!Quote.Up && !IsASpace(sc.ch)) { Quote.Open(sc.ch); - } else if (sc.ch == '\\' && Quote.Up != '\\') { - sc.Forward(); - } else if (Quote.Count == 0 && Quote.Rep == 1) { - // We matched something like s(...) or tr{...}, Perl 5.10 - // appears to allow almost any character for use as the - // next delimiters. Whitespace and comments are accepted in - // between, but we'll limit to whitespace here. - // For '#', if no whitespace in between, it's a delimiter. - if (IsASpace(sc.ch)) { - // Keep going - } else if (sc.ch == '#' && IsASpaceOrTab(sc.chPrev)) { - sc.SetState(SCE_PL_DEFAULT); - } else { - Quote.Open(sc.ch); + } else { + int s = 0, endType = 0; + int maxSeg = endPos - sc.currentPos; + bool isPattern = (Quote.Rep == 2); + while (s < maxSeg) { // scan to break string into segments + int c = sc.GetRelative(s); + if (c == '\\' && Quote.Up != '\\') { + endType = 2; break; + } else if (Quote.Count == 0 && Quote.Rep == 1) { + // We matched something like s(...) or tr{...}, Perl 5.10 + // appears to allow almost any character for use as the + // next delimiters. Whitespace and comments are accepted in + // between, but we'll limit to whitespace here. + // For '#', if no whitespace in between, it's a delimiter. + if (IsASpace(c)) { + // Keep going + } else if (c == '#' && IsASpaceOrTab(sc.GetRelative(s - 1))) { + endType = 3; + } else + Quote.Open(c); + break; + } else if (c == Quote.Down) { + Quote.Count--; + if (Quote.Count == 0) { + Quote.Rep--; + endType = 1; + } + if (Quote.Up == Quote.Down) + Quote.Count++; + if (endType == 1) + break; + } else if (c == Quote.Up) { + Quote.Count++; + } else if (IsASpace(c)) + break; + s++; } - } else if (sc.ch == Quote.Down) { - Quote.Count--; - if (Quote.Count == 0) - Quote.Rep--; - if (Quote.Up == Quote.Down) - Quote.Count++; - } else if (sc.ch == Quote.Up) { - Quote.Count++; + if (s > 0) { // process non-empty segments + if (sc.state == SCE_PL_REGSUBST && Quote.Up != '\'') { + InterpolateSegment(sc, s, isPattern); + } else // non-interpolated path + sc.Forward(s); + } + if (endType == 2) { + sc.Forward(); + } else if (endType == 3) + sc.SetState(SCE_PL_DEFAULT); } break; case SCE_PL_STRING_Q: @@ -876,14 +1067,45 @@ void SCI_METHOD LexerPerl::Lex(unsigned int startPos, int length, int initStyle, case SCE_PL_BACKTICKS: if (!Quote.Down && !IsASpace(sc.ch)) { Quote.Open(sc.ch); - } else if (sc.ch == '\\' && Quote.Up != '\\') { - sc.Forward(); - } else if (sc.ch == Quote.Down) { - Quote.Count--; - if (Quote.Count == 0) + } else { + int s = 0, endType = 0; + int maxSeg = endPos - sc.currentPos; + while (s < maxSeg) { // scan to break string into segments + int c = sc.GetRelative(s); + if (IsASpace(c)) { + break; + } else if (c == '\\' && Quote.Up != '\\') { + endType = 2; break; + } else if (c == Quote.Down) { + Quote.Count--; + if (Quote.Count == 0) { + endType = 3; break; + } + } else if (c == Quote.Up) + Quote.Count++; + s++; + } + if (s > 0) { // process non-empty segments + switch (sc.state) { + case SCE_PL_STRING: + case SCE_PL_STRING_QQ: + case SCE_PL_BACKTICKS: + InterpolateSegment(sc, s); + break; + case SCE_PL_STRING_QX: + if (Quote.Up != '\'') { + InterpolateSegment(sc, s); + break; + } + // (continued for ' delim) + default: // non-interpolated path + sc.Forward(s); + } + } + if (endType == 2) { + sc.Forward(); + } else if (endType == 3) sc.ForwardSetState(SCE_PL_DEFAULT); - } else if (sc.ch == Quote.Up) { - Quote.Count++; } break; case SCE_PL_SUB_PROTOTYPE: { @@ -906,8 +1128,8 @@ void SCI_METHOD LexerPerl::Lex(unsigned int startPos, int length, int initStyle, if (sc.Match('.')) { sc.Forward(); if (sc.atLineEnd || ((sc.ch == '\r' && sc.chNext == '\n'))) - sc.SetState(SCE_PL_DEFAULT); - } + sc.SetState(SCE_PL_DEFAULT); + } while (!sc.atLineEnd) sc.Forward(); } @@ -1026,10 +1248,10 @@ void SCI_METHOD LexerPerl::Lex(unsigned int startPos, int length, int initStyle, sc.ChangeState(SCE_PL_STRING_Q); Quote.New(); } else if (sc.ch == 'y' && !setWord.Contains(sc.chNext)) { - sc.ChangeState(SCE_PL_REGSUBST); + sc.ChangeState(SCE_PL_XLAT); Quote.New(2); } else if (sc.Match('t', 'r') && !setWord.Contains(sc.GetRelative(2))) { - sc.ChangeState(SCE_PL_REGSUBST); + sc.ChangeState(SCE_PL_XLAT); Quote.New(2); sc.Forward(); fw++; |