1 files changed, 203 insertions, 0 deletions
diff --git a/lexers/LexStata.cxx b/lexers/LexStata.cxx
new file mode 100644
index 000000000..78410c627
--- /dev/null
+++ b/lexers/LexStata.cxx
@@ -0,0 +1,203 @@
+// Scintilla source code edit control
+/** @file LexStata.cxx
+ ** Lexer for Stata
+ **/
+// Author: Luke Rasmussen (luke.rasmussen@gmail.com)
+//
+// The License.txt file describes the conditions under which this software may
+// be distributed.
+//
+// Developed as part of the StatTag project at Northwestern University Feinberg
+// School of Medicine with funding from Northwestern University Clinical and
+// Translational Sciences Institute through CTSA grant UL1TR001422.  This work
+// has not been reviewed or endorsed by NCATS or the NIH.
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <assert.h>
+#include <ctype.h>
+
+#include "ILexer.h"
+#include "Scintilla.h"
+#include "SciLexer.h"
+
+#include "WordList.h"
+#include "LexAccessor.h"
+#include "Accessor.h"
+#include "StyleContext.h"
+#include "CharacterSet.h"
+#include "LexerModule.h"
+
+using namespace Scintilla;
+
+// Colourise a range of a Stata document: numbers, identifiers (promoted to a
+// keyword or type when found in keywordlists[0] / keywordlists[1]), operators,
+// "..." strings, and three comment forms: '*' with no code before it, '//', '/* */'.
+static void ColouriseStataDoc(Sci_PositionU startPos, Sci_Position length, int initStyle, WordList *keywordlists[],
+    Accessor &styler) {
+    WordList &keywords = *keywordlists[0];
+    WordList &types = *keywordlists[1];
+    CharacterSet setWordStart(CharacterSet::setAlpha, "_", 0x80, true);
+    CharacterSet setWord(CharacterSet::setAlphaNum, "._", 0x80, true);
+
+    StyleContext sc(startPos, length, initStyle, styler);
+    bool lineHasNonCommentChar = false;	// '*' only opens a comment while this is false
+    for (; sc.More(); sc.Forward()) {
+        if (sc.atLineStart) {
+            lineHasNonCommentChar = false;
+        }
+
+        // Determine if the current state should terminate.
+        switch (sc.state) {
+            case SCE_STATA_OPERATOR:
+                sc.SetState(SCE_STATA_DEFAULT);	// operators are styled one character at a time
+                break;
+            case SCE_STATA_NUMBER:
+                // Any word character continues the number, which covers hex digits and suffixes.
+                if (!setWord.Contains(sc.ch)) {
+                    sc.SetState(SCE_STATA_DEFAULT);
+                }
+                break;
+            case SCE_STATA_IDENTIFIER:
+                if (!setWord.Contains(sc.ch) || (sc.ch == '.')) {	// '.' is in setWord but ends an identifier
+                    char s[1000];
+                    sc.GetCurrent(s, sizeof(s));
+                    if (keywords.InList(s)) {
+                        sc.ChangeState(SCE_STATA_WORD);
+                    }
+                    else if (types.InList(s)) {
+                        sc.ChangeState(SCE_STATA_TYPE);
+                    }
+                    sc.SetState(SCE_STATA_DEFAULT);
+                }
+                break;
+            case SCE_STATA_COMMENTBLOCK:
+                if (sc.Match('*', '/')) {
+                    sc.Forward();
+                    sc.ForwardSetState(SCE_STATA_DEFAULT);
+                }
+                break;
+            case SCE_STATA_COMMENT:
+            case SCE_STATA_COMMENTLINE:
+                if (sc.atLineStart) {
+                    sc.SetState(SCE_STATA_DEFAULT);
+                }
+                break;
+            case SCE_STATA_STRING:
+                if (sc.ch == '\\') {
+                    // Per Stata documentation, the following characters are the only ones that can
+                    // be escaped (not our typical set of quotes, etc.):
+                    // https://www.stata.com/support/faqs/programming/backslashes-and-macros/
+                    if (sc.chNext == '$' || sc.chNext == '`' || sc.chNext == '\\') {
+                        sc.Forward();
+                    }
+                }
+                else if (sc.ch == '\"') {
+                    sc.ForwardSetState(SCE_STATA_DEFAULT);
+                }
+                break;
+        }
+
+        // Determine if a new state should be entered.
+        if (sc.state == SCE_STATA_DEFAULT) {
+            if (IsADigit(sc.ch) || (sc.ch == '.' && IsADigit(sc.chNext))) {
+                lineHasNonCommentChar = true;
+                sc.SetState(SCE_STATA_NUMBER);
+            }
+            else if (setWordStart.Contains(sc.ch)) {
+                lineHasNonCommentChar = true;
+                sc.SetState(SCE_STATA_IDENTIFIER);
+            }
+            else if (sc.Match('*') && !lineHasNonCommentChar) {
+                sc.SetState(SCE_STATA_COMMENT);
+            }
+            else if (sc.Match('/', '*')) {
+                sc.SetState(SCE_STATA_COMMENTBLOCK);
+                sc.Forward();	// Eat the * so it isn't used for the end of the comment
+            }
+            else if (sc.Match('/', '/')) {
+                sc.SetState(SCE_STATA_COMMENTLINE);
+            }
+            else if (sc.ch == '\"') {
+                lineHasNonCommentChar = true;
+                sc.SetState(SCE_STATA_STRING);
+            }
+            else if (isoperator(sc.ch)) {
+                lineHasNonCommentChar = true;
+                sc.SetState(SCE_STATA_OPERATOR);
+            }
+        }
+    }
+
+    sc.Complete();
+}
+
+// Fold on '{' / '}' operators.  Each line's level word keeps the level the line
+// starts at in its low bits and the next line's level shifted left by 16, so a
+// later pass can resume from the previous line and "} else {" can be adjusted.
+static void FoldStataDoc(Sci_PositionU startPos, Sci_Position length, int, WordList *[],
+    Accessor &styler) {
+    bool foldCompact = styler.GetPropertyInt("fold.compact", 1) != 0;
+    bool foldAtElse = styler.GetPropertyInt("fold.at.else", 0) != 0;
+    Sci_PositionU endPos = startPos + length;
+    int visibleChars = 0;
+    Sci_Position lineCurrent = styler.GetLine(startPos);
+    int levelCurrent = SC_FOLDLEVELBASE;
+    if (lineCurrent > 0)
+        levelCurrent = styler.LevelAt(lineCurrent - 1) >> 16;	// next-line level saved by the previous line
+    int levelMinCurrent = levelCurrent;
+    int levelNext = levelCurrent;
+    char chNext = styler[startPos];
+    int styleNext = styler.StyleAt(startPos);
+    for (Sci_PositionU i = startPos; i < endPos; i++) {
+        char ch = chNext;
+        chNext = styler.SafeGetCharAt(i + 1);
+        int style = styleNext;
+        styleNext = styler.StyleAt(i + 1);
+        bool atEOL = (ch == '\r' && chNext != '\n') || (ch == '\n');
+        if (style == SCE_STATA_OPERATOR) {	// braces in any other style never fold
+            if (ch == '{') {
+                // Measure the minimum before a '{' to allow
+                // folding on "} else {"
+                if (levelMinCurrent > levelNext) {
+                    levelMinCurrent = levelNext;
+                }
+                levelNext++;
+            }
+            else if (ch == '}') {
+                levelNext--;
+            }
+        }
+        if (atEOL) {
+            int levelUse = levelCurrent;
+            if (foldAtElse) {
+                levelUse = levelMinCurrent;
+            }
+            int lev = levelUse | levelNext << 16;
+            if (visibleChars == 0 && foldCompact)
+                lev |= SC_FOLDLEVELWHITEFLAG;
+            if (levelUse < levelNext)
+                lev |= SC_FOLDLEVELHEADERFLAG;
+            if (lev != styler.LevelAt(lineCurrent)) {
+                styler.SetLevel(lineCurrent, lev);
+            }
+            lineCurrent++;
+            levelCurrent = levelNext;
+            levelMinCurrent = levelCurrent;
+            visibleChars = 0;
+        }
+        if (!isspacechar(ch))
+            visibleChars++;
+    }
+}
+
+
+static const char * const StataWordLists[] = {
+    "Language Keywords",	// presumably describes keywordlists[0], which ColouriseStataDoc maps to SCE_STATA_WORD
+    "Types",	// presumably describes keywordlists[1], which ColouriseStataDoc maps to SCE_STATA_TYPE
+    0,	// null entry; presumably the end-of-list marker for LexerModule -- TODO confirm
+};
+
+LexerModule lmStata(SCLEX_STATA, ColouriseStataDoc, "stata", FoldStataDoc, StataWordLists);	// ties SCLEX_STATA / "stata" to the colouriser, folder and word-list names