diff options
author | nyamatongwe <unknown> | 2005-09-13 23:55:48 +0000 |
---|---|---|
committer | nyamatongwe <unknown> | 2005-09-13 23:55:48 +0000 |
commit | 7f1a58873d0662d0a82a703c85756f48d90d3a9a (patch) | |
tree | 1083ab7f3893b8a4dfa22f93ff7092b750b5f028 | |
parent | a110498be71cc77ffe566c1dff1e5c33516cfbfe (diff) | |
download | scintilla-mirror-7f1a58873d0662d0a82a703c85756f48d90d3a9a.tar.gz |
Update from Eric Promislow improves here doc recognition.
-rw-r--r-- | src/LexRuby.cxx | 360 |
1 files changed, 323 insertions, 37 deletions
diff --git a/src/LexRuby.cxx b/src/LexRuby.cxx index e8a4d2917..f5ad020f4 100644 --- a/src/LexRuby.cxx +++ b/src/LexRuby.cxx @@ -28,13 +28,32 @@ static inline bool isEOLChar(char ch) { return (ch == '\r') || (ch == '\n'); } -static inline bool isRubyOperatorChar(char ch) { - return strchr("%^&*\\()-+=|{}[]:;<>,/?!.~",ch) != NULL; +#define isSafeASCII(ch) ((unsigned int)(ch) <= 127) +// This one's redundant, but makes for more readable code +#define isHighBitChar(ch) ((unsigned int)(ch) > 127) + +static inline bool isSafeAlpha(char ch) { + return (isSafeASCII(ch) && isalpha(ch)) || ch == '_'; } +static inline bool isSafeAlnum(char ch) { + return (isSafeASCII(ch) && isalnum(ch)) || ch == '_'; +} -static inline bool isSafeAlpha(char ch) { - return ((unsigned int) ch <= 127) && isalpha(ch); +static inline bool isSafeAlnumOrHigh(char ch) { + return isHighBitChar(ch) || isalnum(ch) || ch == '_'; +} + +static inline bool isSafeDigit(char ch) { + return isSafeASCII(ch) && isdigit(ch); +} + +static inline bool isSafeWordcharOrHigh(char ch) { + return isHighBitChar(ch) || iswordchar(ch); +} + +static bool inline iswhitespace(char ch) { + return ch == ' ' || ch == '\t'; } #define MAX_KEYWORD_LENGTH 200 @@ -248,7 +267,258 @@ static bool RE_CanFollowKeyword(const char *keyword) { } return false; } + +// Look at chars up to but not including endPos +// Don't look at styles in case we're looking forward + +static int skipWhitespace(int startPos, + int endPos, + Accessor &styler) { + for (int i = startPos; i < endPos; i++) { + if (!iswhitespace(styler[i])) { + return i; + } + } + return endPos; +} + +// This routine looks for false positives like +// undef foo, << +// There aren't too many. +// +// iPrev points to the start of << + +static bool sureThisIsHeredoc(int iPrev, + Accessor &styler, + char *prevWord) { + + // Not so fast, since Ruby's so dynamic. Check the context + // to make sure we're OK. + int prevStyle; + int lineStart = styler.GetLine(iPrev); + int lineStartPosn = styler.LineStart(lineStart); + styler.Flush(); + + // Find the first word after some whitespace + int firstWordPosn = skipWhitespace(lineStartPosn, iPrev, styler); + if (firstWordPosn >= iPrev) { + // Have something like {^ <<} + //XXX Look at the first previous non-comment non-white line + // to establish the context. Not too likely though. + return true; + } else { + switch (prevStyle = styler.StyleAt(firstWordPosn)) { + case SCE_RB_WORD: + case SCE_RB_WORD_DEMOTED: + case SCE_RB_IDENTIFIER: + break; + default: + return true; + } + } + int firstWordEndPosn = firstWordPosn; + char *dst = prevWord; + for (;;) { + if (firstWordEndPosn >= iPrev || + styler.StyleAt(firstWordEndPosn) != prevStyle) { + *dst = 0; + break; + } + *dst++ = styler[firstWordEndPosn]; + firstWordEndPosn += 1; + } + //XXX Write a style-aware thing to regex scintilla buffer objects + if (!strcmp(prevWord, "undef") + || !strcmp(prevWord, "def") + || !strcmp(prevWord, "alias")) { + // These keywords are what we were looking for + return false; + } + return true; +} + +// Routine that saves us from allocating a buffer for the here-doc target +// targetEndPos points one past the end of the current target +static bool haveTargetMatch(int currPos, + int lengthDoc, + int targetStartPos, + int targetEndPos, + Accessor &styler) { + if (lengthDoc - currPos < targetEndPos - targetStartPos) { + return false; + } + int i, j; + for (i = targetStartPos, j = currPos; + i < targetEndPos && j < lengthDoc; + i++, j++) { + if (styler[i] != styler[j]) { + return false; + } + } + return true; +} + +// We need a check because the form +// [identifier] <<[target] +// is ambiguous. The Ruby lexer/parser resolves it by +// looking to see if [identifier] names a variable or a +// function. If it's the first, it's the start of a here-doc. +// If it's a var, it's an operator. This lexer doesn't +// maintain a symbol table, so it looks ahead to see what's +// going on, in cases where we have +// ^[white-space]*[identifier([.|::]identifier)*][white-space]*<<[target] +// +// If there's no occurrence of [target] on a line, assume we don't. + +// return true == yes, we have no heredocs + +static bool sureThisIsNotHeredoc(int lt2StartPos, + Accessor &styler) { + int prevStyle; + // Use full document, not just part we're styling + int lengthDoc = styler.Length(); + int lineStart = styler.GetLine(lt2StartPos); + int lineStartPosn = styler.LineStart(lineStart); + styler.Flush(); + const bool definitely_not_a_here_doc = true; + const bool looks_like_a_here_doc = false; + + // Find the first word after some whitespace + int firstWordPosn = skipWhitespace(lineStartPosn, lt2StartPos, styler); + if (firstWordPosn >= lt2StartPos) { + return definitely_not_a_here_doc; + } + prevStyle = styler.StyleAt(firstWordPosn); + // If we have '<<' following a keyword, it's not a heredoc + if (prevStyle != SCE_RB_IDENTIFIER) { + return definitely_not_a_here_doc; + } + int newStyle = prevStyle; + // Some compilers incorrectly warn about uninit newStyle + for (firstWordPosn += 1; firstWordPosn <= lt2StartPos; firstWordPosn += 1) { + // Inner loop looks at the name + for (; firstWordPosn <= lt2StartPos; firstWordPosn += 1) { + newStyle = styler.StyleAt(firstWordPosn); + if (newStyle != prevStyle) { + break; + } + } + // Do we have '::' or '.'? + if (firstWordPosn < lt2StartPos && newStyle == SCE_RB_OPERATOR) { + char ch = styler[firstWordPosn]; + if (ch == '.') { + // yes + } else if (ch == ':') { + if (styler.StyleAt(++firstWordPosn) != SCE_RB_OPERATOR) { + return definitely_not_a_here_doc; + } else if (styler[firstWordPosn] != ':') { + return definitely_not_a_here_doc; + } + } else { + break; + } + } else { + break; + } + } + // Skip next batch of white-space + firstWordPosn = skipWhitespace(firstWordPosn, lt2StartPos, styler); + if (firstWordPosn != lt2StartPos) { + // Have [[^ws[identifier]ws[*something_else*]ws<< + return definitely_not_a_here_doc; + } + // OK, now 'j' will point to the current spot moving ahead + int j = firstWordPosn + 1; + if (styler.StyleAt(j) != SCE_RB_OPERATOR || styler[j] != '<') { + // This shouldn't happen + return definitely_not_a_here_doc; + } + int nextLineStartPosn = styler.LineStart(lineStart + 1); + if (nextLineStartPosn >= lengthDoc) { + return definitely_not_a_here_doc; + } + j = skipWhitespace(j + 1, nextLineStartPosn, styler); + if (j >= lengthDoc) { + return definitely_not_a_here_doc; + } + bool allow_indent; + int target_start, target_end; + // From this point on no more styling, since we're looking ahead + if (styler[j] == '-') { + allow_indent = true; + j++; + } else { + allow_indent = false; + } + + // Allow for quoted targets. + char target_quote = 0; + switch (styler[j]) { + case '\'': + case '"': + case '`': + target_quote = styler[j]; + j += 1; + } + if (isSafeAlnum(styler[j])) { + // Init target_end because some compilers think it won't + // be initialized by the time it's used + target_start = target_end = j; + j++; + } else { + return definitely_not_a_here_doc; + } + for (; j < lengthDoc; j++) { + if (!isSafeAlnum(styler[j])) { + if (target_quote && styler[j] != target_quote) { + // unquoted end + return definitely_not_a_here_doc; + } + + // And for now make sure that it's a newline + // don't handle arbitrary expressions yet + + target_end = j; + if (target_quote) { + // Now we can move to the character after the string delimiter. + j += 1; + } + j = skipWhitespace(j, lengthDoc, styler); + if (j >= lengthDoc) { + return definitely_not_a_here_doc; + } else { + char ch = styler[j]; + if (ch == '#' || isEOLChar(ch)) { + // This is OK, so break and continue; + break; + } else { + return definitely_not_a_here_doc; + } + } + } + } + + // Just look at the start of each line + int last_line = styler.GetLine(lengthDoc - 1); + // But don't go too far + if (last_line > lineStart + 50) { + last_line = lineStart + 50; + } + for (int line_num = lineStart + 1; line_num <= last_line; line_num++) { + if (allow_indent) { + j = skipWhitespace(styler.LineStart(line_num), lengthDoc, styler); + } else { + j = styler.LineStart(line_num); + } + // target_end is one past the end + if (haveTargetMatch(j, lengthDoc, target_start, target_end, styler)) { + // We got it + return looks_like_a_here_doc; + } + } + return definitely_not_a_here_doc; +} //todo: if we aren't looking at a stdio character, // move to the start of the first line that is not in a @@ -417,11 +687,11 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, // Regular transitions if (state == SCE_RB_DEFAULT) { - if (isdigit(ch)) { + if (isSafeDigit(ch)) { styler.ColourTo(i - 1, state); state = SCE_RB_NUMBER; numDots = 0; - } else if (iswordstart(ch)) { + } else if (isHighBitChar(ch) || iswordstart(ch)) { styler.ColourTo(i - 1, state); state = SCE_RB_WORD; } else if (ch == '#') { @@ -435,7 +705,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, && styler.SafeGetCharAt(i + 3) == 'g' && styler.SafeGetCharAt(i + 4) == 'i' && styler.SafeGetCharAt(i + 5) == 'n' - && !iswordchar(styler.SafeGetCharAt(i + 6))) { + && !isSafeWordcharOrHigh(styler.SafeGetCharAt(i + 6))) { styler.ColourTo(i - 1, state); state = SCE_RB_POD; } else { @@ -480,23 +750,35 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, Quote.Open(ch); } else if (ch == '<' && chNext == '<' && chNext2 != '=') { - // Recognise the '<<' symbol - either a here document or a binary op - + // Recognise the '<<' symbol - either a here document or a binary op styler.ColourTo(i - 1, state); i++; chNext = chNext2; styler.ColourTo(i, SCE_RB_OPERATOR); - - if (preferRE) { - state = SCE_RB_HERE_DELIM; - HereDoc.State = 0; + + if (! (strchr("\"\'`_-", chNext2) || isSafeAlpha(chNext2))) { + // It's definitely not a here-doc, + // based on Ruby's lexer/parser in the + // heredoc_identifier routine. + // Nothing else to do. + } else if (preferRE) { + if (sureThisIsHeredoc(i - 1, styler, prevWord)) { + state = SCE_RB_HERE_DELIM; + HereDoc.State = 0; + } + // else leave it in default state } else { - // leave state as default - // We don't have all the heuristics Perl has for indications - // of a here-doc, because '<<' is overloadable and used - // for so many other classes. - preferRE = true; + if (sureThisIsNotHeredoc(i - 1, styler)) { + // leave state as default + // We don't have all the heuristics Perl has for indications + // of a here-doc, because '<<' is overloadable and used + // for so many other classes. + } else { + state = SCE_RB_HERE_DELIM; + HereDoc.State = 0; + } } + preferRE = (state != SCE_RB_HERE_DELIM); } else if (ch == ':') { styler.ColourTo(i - 1, state); if (chNext == ':') { @@ -505,7 +787,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, advance_char(i, ch, chNext, chNext2); // pass by ref state = SCE_RB_DEFAULT; preferRE = false; - } else if (iswordchar(chNext)) { + } else if (isSafeWordcharOrHigh(chNext)) { state = SCE_RB_SYMBOL; } else if (strchr("[*!~+-*/%=<>&^|", chNext)) { // Do the operator analysis in-line, looking ahead @@ -592,7 +874,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, } else if (ch == '%') { styler.ColourTo(i - 1, state); bool have_string = false; - if (strchr(q_chars, chNext) && !iswordchar(chNext2)) { + if (strchr(q_chars, chNext) && !isSafeWordcharOrHigh(chNext2)) { Quote.New(); const char *hit = strchr(q_chars, chNext); if (hit != NULL) { @@ -603,7 +885,9 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, chNext = styler.SafeGetCharAt(i + 1); have_string = true; } - } else if (!iswordchar(chNext)) { + } else if (!isSafeWordcharOrHigh(chNext)) { + // Ruby doesn't allow high bit chars here, + // but the editor host might state = SCE_RB_STRING_QQ; Quote.Open(chNext); advance_char(i, ch, chNext, chNext2); // pass by ref @@ -614,7 +898,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, // stay in default preferRE = true; } - } else if (isoperator(ch)) { + } else if (isoperator(ch) || ch == '.') { styler.ColourTo(i - 1, state); styler.ColourTo(i, SCE_RB_OPERATOR); // If we're ending an expression or block, @@ -625,7 +909,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, // we aren't ending an object exp'n, and ops // like : << / are unary operators. - preferRE = (strchr(")}]", ch) == NULL); + preferRE = (strchr(")}].", ch) == NULL); // Stay in default state } else if (isEOLChar(ch)) { // Make sure it's a true line-end, with no backslash @@ -636,7 +920,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, } } } else if (state == SCE_RB_WORD) { - if (ch == '.' || !iswordchar(ch)) { + if (ch == '.' || !isSafeWordcharOrHigh(ch)) { // Words include x? in all contexts, // and <letters>= after either 'def' or a dot // Move along until a complete word is on our left @@ -645,7 +929,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, // but we don't for now. if (ch == '=' - && iswordchar(chPrev) + && isSafeWordcharOrHigh(chPrev) && (chNext == '(' || strchr(" \t\n\r", chNext) != NULL) && (!strcmp(prevWord, "def") @@ -654,8 +938,8 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, // This means that <name>=<name> is always lexed as // <name>, (op, =), <name> } else if ((ch == '?' || ch == '!') - && iswordchar(chPrev) - && !iswordchar(chNext)) { + && isSafeWordcharOrHigh(chPrev) + && !isSafeWordcharOrHigh(chNext)) { // <name>? is a name -- Get it the next time // But <name>?<name> is always lexed as // <name>, (op, ?), <name> @@ -691,11 +975,16 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, default: preferRE = false; } + if (ch == '.') { + // We might be redefining an operator-method + preferRE = false; + } + // And if it's the first redo_char(i, ch, chNext, chNext2, state); // pass by ref } } } else if (state == SCE_RB_NUMBER) { - if (isalnum(ch) || ch == '_') { + if (isSafeAlnumOrHigh(ch) || ch == '_') { // Keep going } else if (ch == '.' && ++numDots == 1) { // Keep going @@ -763,7 +1052,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0'; } } else { // an unquoted here-doc delimiter - if (isalnum(ch) || ch == '_') { + if (isSafeAlnumOrHigh(ch) || ch == '_') { HereDoc.Delimiter[HereDoc.DelimiterLength++] = ch; HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0'; } else { @@ -813,13 +1102,13 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, } else if (state == SCE_RB_CLASS_VAR || state == SCE_RB_INSTANCE_VAR || state == SCE_RB_SYMBOL) { - if (!iswordchar(ch)) { + if (!isSafeWordcharOrHigh(ch)) { styler.ColourTo(i - 1, state); redo_char(i, ch, chNext, chNext2, state); // pass by ref preferRE = false; } } else if (state == SCE_RB_GLOBAL) { - if (!iswordchar(ch)) { + if (!isSafeWordcharOrHigh(ch)) { // handle special globals here as well if (chPrev == '$') { if (ch == '-') { @@ -927,7 +1216,8 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle, } } -// Helper functions for folding +// Helper functions for folding, disambiguation keywords +// Assert that there are no high-bit chars static void getPrevWord(int pos, char *prevWord, @@ -966,10 +1256,6 @@ static bool keywordIsAmbiguous(const char *prevWord) } } -static bool inline iswhitespace(char ch) { - return ch == ' ' || ch == '\t'; -} - // Demote keywords in the following conditions: // if, while, unless, until modify a statement // do after a while or until, as a noise word (like then after if) @@ -1253,4 +1539,4 @@ static const char * const rubyWordListDesc[] = { 0 }; -LexerModule lmRuby(SCLEX_RUBY, ColouriseRbDoc, "ruby", FoldRbDoc, rubyWordListDesc, 6); +LexerModule lmRuby(SCLEX_RUBY, ColouriseRbDoc, "ruby", FoldRbDoc, rubyWordListDesc); |