1 files changed, 323 insertions, 37 deletions
diff --git a/src/LexRuby.cxx b/src/LexRuby.cxx
index e8a4d2917..f5ad020f4 100644
--- a/src/LexRuby.cxx
+++ b/src/LexRuby.cxx
@@ -28,13 +28,32 @@ static inline bool isEOLChar(char ch) {
 	return (ch == '\r') || (ch == '\n');
 }
 
-static inline bool isRubyOperatorChar(char ch) {
-	return strchr("%^&*\\()-+=|{}[]:;<>,/?!.~",ch) != NULL;
+#define isSafeASCII(ch) ((unsigned int)(ch) <= 127)
+// This one's redundant, but makes for more readable code
+#define isHighBitChar(ch) ((unsigned int)(ch) > 127)
+
+static inline bool isSafeAlpha(char ch) {
+    return (isSafeASCII(ch) && isalpha(ch)) || ch == '_';
 }
 
+static inline bool isSafeAlnum(char ch) {
+    return (isSafeASCII(ch) && isalnum(ch)) || ch == '_';
+}
 
-static inline bool isSafeAlpha(char ch) {
-    return ((unsigned int) ch <= 127) && isalpha(ch);
+static inline bool isSafeAlnumOrHigh(char ch) {
+    return isHighBitChar(ch) || isalnum(ch) || ch == '_';
+}
+
+static inline bool isSafeDigit(char ch) {
+    return isSafeASCII(ch) && isdigit(ch);
+}
+
+static inline bool isSafeWordcharOrHigh(char ch) {
+    return isHighBitChar(ch) || iswordchar(ch);
+}
+
+static bool inline iswhitespace(char ch) {
+	return ch == ' ' || ch == '\t';
 }
 
 #define MAX_KEYWORD_LENGTH 200
@@ -248,7 +267,258 @@ static bool RE_CanFollowKeyword(const char *keyword) {
     }
     return false;
 }
+
+// Look at chars up to but not including endPos
+// Don't look at styles in case we're looking forward
+
+static int skipWhitespace(int startPos,
+                           int endPos,
+                           Accessor &styler) {
+    for (int i = startPos; i < endPos; i++) {
+        if (!iswhitespace(styler[i])) {
+            return i;
+        }
+    }
+    return endPos;
+}
+    
+// This routine looks for false positives like
+// undef foo, <<
+// There aren't too many.
+//
+// iPrev points to the start of <<
+
+static bool sureThisIsHeredoc(int iPrev, 
+                              Accessor &styler,
+                              char *prevWord) {
+                    
+    // Not so fast, since Ruby's so dynamic.  Check the context
+    // to make sure we're OK.
+    int prevStyle;
+    int lineStart = styler.GetLine(iPrev);
+    int lineStartPosn = styler.LineStart(lineStart);
+    styler.Flush();
+
+    // Find the first word after some whitespace
+    int firstWordPosn = skipWhitespace(lineStartPosn, iPrev, styler);
+    if (firstWordPosn >= iPrev) {
+        // Have something like {^     <<}
+		//XXX Look at the first previous non-comment non-white line
+		// to establish the context.  Not too likely though.
+        return true;
+    } else {
+        switch (prevStyle = styler.StyleAt(firstWordPosn)) {
+        case SCE_RB_WORD:
+        case SCE_RB_WORD_DEMOTED:
+        case SCE_RB_IDENTIFIER:
+            break;
+        default:
+            return true;
+        }
+    }
+    int firstWordEndPosn = firstWordPosn;
+    char *dst = prevWord;
+    for (;;) {
+        if (firstWordEndPosn >= iPrev ||
+            styler.StyleAt(firstWordEndPosn) != prevStyle) {
+            *dst = 0;
+            break;
+        }
+        *dst++ = styler[firstWordEndPosn];
+        firstWordEndPosn += 1;
+    }
+    //XXX Write a style-aware thing to regex scintilla buffer objects
+    if (!strcmp(prevWord, "undef")
+        || !strcmp(prevWord, "def")
+        || !strcmp(prevWord, "alias")) {
+        // These keywords are what we were looking for
+        return false;
+    }
+    return true;
+}
+
+// Routine that saves us from allocating a buffer for the here-doc target
+// targetEndPos points one past the end of the current target
+static bool haveTargetMatch(int currPos,
+                            int lengthDoc,
+                            int targetStartPos,
+                            int targetEndPos,
+                            Accessor &styler) {
+    if (lengthDoc - currPos < targetEndPos - targetStartPos) {
+        return false;
+    }
+    int i, j;
+    for (i = targetStartPos, j = currPos;
+         i < targetEndPos && j < lengthDoc;
+         i++, j++) {
+        if (styler[i] != styler[j]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// We need a check because the form
+// [identifier] <<[target]
+// is ambiguous.  The Ruby lexer/parser resolves it by
+// looking to see if [identifier] names a variable or a
+// function.  If it's the first, it's the start of a here-doc.
+// If it's a var, it's an operator.  This lexer doesn't
+// maintain a symbol table, so it looks ahead to see what's
+// going on, in cases where we have
+// ^[white-space]*[identifier([.|::]identifier)*][white-space]*<<[target]
+//
+// If there's no occurrence of [target] on a line, assume we don't.
+
+// return true == yes, we have no heredocs
+
+static bool sureThisIsNotHeredoc(int lt2StartPos,
+                                 Accessor &styler) {
+    int prevStyle;
+     // Use full document, not just part we're styling
+    int lengthDoc = styler.Length();
+    int lineStart = styler.GetLine(lt2StartPos);
+    int lineStartPosn = styler.LineStart(lineStart);
+    styler.Flush();
+    const bool definitely_not_a_here_doc = true;
+    const bool looks_like_a_here_doc = false;
+    
+    // Find the first word after some whitespace
+    int firstWordPosn = skipWhitespace(lineStartPosn, lt2StartPos, styler);
+    if (firstWordPosn >= lt2StartPos) {
+        return definitely_not_a_here_doc;
+    }
+    prevStyle = styler.StyleAt(firstWordPosn);
+    // If we have '<<' following a keyword, it's not a heredoc
+    if (prevStyle != SCE_RB_IDENTIFIER) {
+        return definitely_not_a_here_doc;
+    }
+    int newStyle = prevStyle;
+    // Some compilers incorrectly warn about uninit newStyle
+    for (firstWordPosn += 1; firstWordPosn <= lt2StartPos; firstWordPosn += 1) {
+        // Inner loop looks at the name
+        for (; firstWordPosn <= lt2StartPos; firstWordPosn += 1) {
+            newStyle = styler.StyleAt(firstWordPosn);
+            if (newStyle != prevStyle) {
+                break;
+            }
+        }
+        // Do we have '::' or '.'?
+        if (firstWordPosn < lt2StartPos && newStyle == SCE_RB_OPERATOR) {
+            char ch = styler[firstWordPosn];
+            if (ch == '.') {
+                // yes
+            } else if (ch == ':') {
+                if (styler.StyleAt(++firstWordPosn) != SCE_RB_OPERATOR) {
+                    return definitely_not_a_here_doc;
+                } else if (styler[firstWordPosn] != ':') {
+                    return definitely_not_a_here_doc;
+                }
+            } else {
+                break;
+            }
+        } else {
+            break;
+        }
+    }
+    // Skip next batch of white-space
+    firstWordPosn = skipWhitespace(firstWordPosn, lt2StartPos, styler);
+    if (firstWordPosn != lt2StartPos) {
+        // Have [[^ws[identifier]ws[*something_else*]ws<<
+        return definitely_not_a_here_doc;
+    }
+    // OK, now 'j' will point to the current spot moving ahead
+	int j = firstWordPosn + 1;
+    if (styler.StyleAt(j) != SCE_RB_OPERATOR || styler[j] != '<') {
+        // This shouldn't happen
+        return definitely_not_a_here_doc;
+    }
+    int nextLineStartPosn = styler.LineStart(lineStart + 1);
+    if (nextLineStartPosn >= lengthDoc) {
+        return definitely_not_a_here_doc;
+    }
+    j = skipWhitespace(j + 1, nextLineStartPosn, styler);
+    if (j >= lengthDoc) {
+        return definitely_not_a_here_doc;
+    }
+    bool allow_indent;
+    int target_start, target_end;
+    // From this point on no more styling, since we're looking ahead
+    if (styler[j] == '-') {
+        allow_indent = true;
+        j++;
+    } else {
+        allow_indent = false;
+    }
+
+    // Allow for quoted targets.
+    char target_quote = 0;
+    switch (styler[j]) {
+    case '\'':
+    case '"':
+    case '`':
+        target_quote = styler[j];
+        j += 1;
+    }
     
+    if (isSafeAlnum(styler[j])) {
+        // Init target_end because some compilers think it won't
+        // be initialized by the time it's used
+        target_start = target_end = j;
+        j++;
+    } else {
+        return definitely_not_a_here_doc;
+    }
+    for (; j < lengthDoc; j++) {
+        if (!isSafeAlnum(styler[j])) {
+            if (target_quote && styler[j] != target_quote) {
+                // unquoted end
+                return definitely_not_a_here_doc;
+            }
+
+            // And for now make sure that it's a newline
+            // don't handle arbitrary expressions yet
+            
+            target_end = j;
+			if (target_quote) {
+				// Now we can move to the character after the string delimiter.
+				j += 1;
+			}
+            j = skipWhitespace(j, lengthDoc, styler);
+            if (j >= lengthDoc) {
+                return definitely_not_a_here_doc;
+            } else {
+                char ch = styler[j];
+                if (ch == '#' || isEOLChar(ch)) {
+                    // This is OK, so break and continue;
+                    break;
+                } else {
+                    return definitely_not_a_here_doc;
+                }
+            }
+        }
+    }
+
+    // Just look at the start of each line
+    int last_line = styler.GetLine(lengthDoc - 1);
+    // But don't go too far
+    if (last_line > lineStart + 50) {
+        last_line = lineStart + 50;
+    }
+    for (int line_num = lineStart + 1; line_num <= last_line; line_num++) {
+        if (allow_indent) {
+            j = skipWhitespace(styler.LineStart(line_num), lengthDoc, styler);
+        } else {
+            j = styler.LineStart(line_num);
+        }
+        // target_end is one past the end
+        if (haveTargetMatch(j, lengthDoc, target_start, target_end, styler)) {
+            // We got it
+            return looks_like_a_here_doc;
+        }
+    }
+    return definitely_not_a_here_doc;
+}
 
 //todo: if we aren't looking at a stdio character,
 // move to the start of the first line that is not in a 
@@ -417,11 +687,11 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
 
         // Regular transitions
 		if (state == SCE_RB_DEFAULT) {
-            if (isdigit(ch)) {
+            if (isSafeDigit(ch)) {
             	styler.ColourTo(i - 1, state);
 				state = SCE_RB_NUMBER;
                 numDots = 0;
-            } else if (iswordstart(ch)) {
+            } else if (isHighBitChar(ch) || iswordstart(ch)) {
             	styler.ColourTo(i - 1, state);
 				state = SCE_RB_WORD;
 			} else if (ch == '#') {
@@ -435,7 +705,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
                     && styler.SafeGetCharAt(i + 3) == 'g'
                     && styler.SafeGetCharAt(i + 4) == 'i'
                     && styler.SafeGetCharAt(i + 5) == 'n'
-                    && !iswordchar(styler.SafeGetCharAt(i + 6))) {
+                    && !isSafeWordcharOrHigh(styler.SafeGetCharAt(i + 6))) {
                     styler.ColourTo(i - 1, state);
                     state = SCE_RB_POD;
 				} else {
@@ -480,23 +750,35 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
                 Quote.Open(ch);
 			} else if (ch == '<' && chNext == '<' && chNext2 != '=') {
 
-            // Recognise the '<<' symbol - either a here document or a binary op
-                
+                // Recognise the '<<' symbol - either a here document or a binary op
 				styler.ColourTo(i - 1, state);
                 i++;
                 chNext = chNext2;
 				styler.ColourTo(i, SCE_RB_OPERATOR);
-                
-                if (preferRE) {
-                    state = SCE_RB_HERE_DELIM;
-				    HereDoc.State = 0;
+
+                if (! (strchr("\"\'`_-", chNext2) || isSafeAlpha(chNext2))) {
+                    // It's definitely not a here-doc,
+                    // based on Ruby's lexer/parser in the
+                    // heredoc_identifier routine.
+                    // Nothing else to do.
+                } else if (preferRE) {
+                    if (sureThisIsHeredoc(i - 1, styler, prevWord)) {
+                        state = SCE_RB_HERE_DELIM;
+                        HereDoc.State = 0;
+                    }
+                    // else leave it in default state
                 } else {
-                    // leave state as default
-                    // We don't have all the heuristics Perl has for indications
-                    // of a here-doc, because '<<' is overloadable and used
-                    // for so many other classes.
-					preferRE = true;
+                    if (sureThisIsNotHeredoc(i - 1, styler)) {
+                        // leave state as default
+                        // We don't have all the heuristics Perl has for indications
+                        // of a here-doc, because '<<' is overloadable and used
+                        // for so many other classes.
+                    } else {
+                        state = SCE_RB_HERE_DELIM;
+                        HereDoc.State = 0;
+                    }
                 }
+                preferRE = (state != SCE_RB_HERE_DELIM);
             } else if (ch == ':') {
 				styler.ColourTo(i - 1, state);
                 if (chNext == ':') {
@@ -505,7 +787,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
                     advance_char(i, ch, chNext, chNext2); // pass by ref
                     state = SCE_RB_DEFAULT;
 					preferRE = false;
-                } else if (iswordchar(chNext)) {
+                } else if (isSafeWordcharOrHigh(chNext)) {
 					state = SCE_RB_SYMBOL;
                 } else if (strchr("[*!~+-*/%=<>&^|", chNext)) {
                     // Do the operator analysis in-line, looking ahead
@@ -592,7 +874,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
             } else if (ch == '%') {
                 styler.ColourTo(i - 1, state);
                 bool have_string = false;
-                if (strchr(q_chars, chNext) && !iswordchar(chNext2)) {
+                if (strchr(q_chars, chNext) && !isSafeWordcharOrHigh(chNext2)) {
                     Quote.New();
                     const char *hit = strchr(q_chars, chNext);
                     if (hit != NULL) {
@@ -603,7 +885,9 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
 						chNext = styler.SafeGetCharAt(i + 1);
                         have_string = true;
                     }
-                } else if (!iswordchar(chNext)) {
+                } else if (!isSafeWordcharOrHigh(chNext)) {
+                    // Ruby doesn't allow high bit chars here,
+                    // but the editor host might
                     state = SCE_RB_STRING_QQ;
                     Quote.Open(chNext);
                     advance_char(i, ch, chNext, chNext2); // pass by ref
@@ -614,7 +898,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
                     // stay in default
                     preferRE = true;
                 }
-            } else if (isoperator(ch)) {
+            } else if (isoperator(ch) || ch == '.') {
 				styler.ColourTo(i - 1, state);
 				styler.ColourTo(i, SCE_RB_OPERATOR);
                 // If we're ending an expression or block,
@@ -625,7 +909,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
                 // we aren't ending an object exp'n, and ops
                 // like : << / are unary operators.
                 
-                preferRE = (strchr(")}]", ch) == NULL);
+                preferRE = (strchr(")}].", ch) == NULL);
                 // Stay in default state
             } else if (isEOLChar(ch)) {
                 // Make sure it's a true line-end, with no backslash
@@ -636,7 +920,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
                 }
             }
         } else if (state == SCE_RB_WORD) {
-            if (ch == '.' || !iswordchar(ch)) {
+            if (ch == '.' || !isSafeWordcharOrHigh(ch)) {
                 // Words include x? in all contexts,
                 // and <letters>= after either 'def' or a dot
                 // Move along until a complete word is on our left
@@ -645,7 +929,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
                 // but we don't for now.
                 
                 if (ch == '='
-                    && iswordchar(chPrev)
+                    && isSafeWordcharOrHigh(chPrev)
                     && (chNext == '('
                         || strchr(" \t\n\r", chNext) != NULL)
                     && (!strcmp(prevWord, "def")
@@ -654,8 +938,8 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
                     // This means that <name>=<name> is always lexed as
                     // <name>, (op, =), <name>
                 } else if ((ch == '?' || ch == '!')
-                           && iswordchar(chPrev)
-                           && !iswordchar(chNext)) {
+                           && isSafeWordcharOrHigh(chPrev)
+                           && !isSafeWordcharOrHigh(chNext)) {
                     // <name>? is a name -- Get it the next time
                     // But <name>?<name> is always lexed as
                     // <name>, (op, ?), <name>
@@ -691,11 +975,16 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
                         default:
                             preferRE = false;
                     }
+                    if (ch == '.') {
+                        // We might be redefining an operator-method
+                        preferRE = false;
+                    }
+                    // And if it's the first 
                     redo_char(i, ch, chNext, chNext2, state); // pass by ref
                 }
             }
         } else if (state == SCE_RB_NUMBER) {
-            if (isalnum(ch) || ch == '_') {
+            if (isSafeAlnumOrHigh(ch) || ch == '_') {
                 // Keep going
             } else if (ch == '.' && ++numDots == 1) {
                 // Keep going
@@ -763,7 +1052,7 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
 						HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0';
                     }
                 } else { // an unquoted here-doc delimiter
-					if (isalnum(ch) || ch == '_') {
+					if (isSafeAlnumOrHigh(ch) || ch == '_') {
 						HereDoc.Delimiter[HereDoc.DelimiterLength++] = ch;
 						HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0';
 					} else {
@@ -813,13 +1102,13 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
         } else if (state == SCE_RB_CLASS_VAR
                    || state == SCE_RB_INSTANCE_VAR
                    || state == SCE_RB_SYMBOL) {
-            if (!iswordchar(ch)) {
+            if (!isSafeWordcharOrHigh(ch)) {
                 styler.ColourTo(i - 1, state);
                 redo_char(i, ch, chNext, chNext2, state); // pass by ref
                 preferRE = false;
             }
         } else if (state == SCE_RB_GLOBAL) {
-            if (!iswordchar(ch)) {
+            if (!isSafeWordcharOrHigh(ch)) {
                 // handle special globals here as well
                 if (chPrev == '$') {
                     if (ch == '-') {
@@ -927,7 +1216,8 @@ static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
     }
 }
 
-// Helper functions for folding
+// Helper functions for folding, disambiguation keywords
+// Assert that there are no high-bit chars 
 
 static void getPrevWord(int pos,
                         char *prevWord,
@@ -966,10 +1256,6 @@ static bool keywordIsAmbiguous(const char *prevWord)
     }
 }
 
-static bool inline iswhitespace(char ch) {
-	return ch == ' ' || ch == '\t';
-}
-
 // Demote keywords in the following conditions:
 // if, while, unless, until modify a statement
 // do after a while or until, as a noise word (like then after if) 
@@ -1253,4 +1539,4 @@ static const char * const rubyWordListDesc[] = {
 	0
 };
 
-LexerModule lmRuby(SCLEX_RUBY, ColouriseRbDoc, "ruby", FoldRbDoc, rubyWordListDesc, 6);
+LexerModule lmRuby(SCLEX_RUBY, ColouriseRbDoc, "ruby", FoldRbDoc, rubyWordListDesc);