diff options
author | nyamatongwe <devnull@localhost> | 2013-01-19 12:34:07 +1100 |
---|---|---|
committer | nyamatongwe <devnull@localhost> | 2013-01-19 12:34:07 +1100 |
commit | b7591665bdf69d4a1693a734dc1d69a66b62e569 (patch) | |
tree | 308d64c1439f745955ff5f3e9a7d303e75ef77ef | |
parent | ce2de4557e269d91c3e9445713e6115e797e177a (diff) | |
download | scintilla-mirror-b7591665bdf69d4a1693a734dc1d69a66b62e569.tar.gz |
Implement Unicode line ends and substyles in C++ lexer.
-rw-r--r-- | lexers/LexCPP.cxx | 93 |
-rw-r--r-- | test/simpleTests.py | 195 |
2 files changed, 267 insertions, 21 deletions
diff --git a/lexers/LexCPP.cxx b/lexers/LexCPP.cxx index 87482176d..f30cdfbdd 100644 --- a/lexers/LexCPP.cxx +++ b/lexers/LexCPP.cxx @@ -30,6 +30,7 @@ #include "LexerModule.h" #include "OptionSet.h" #include "SparseState.h" +#include "SubStyles.h" #ifdef SCI_NAMESPACE using namespace Scintilla; @@ -87,7 +88,8 @@ static std::string GetRestOfLine(LexAccessor &styler, int start, bool allowSpace std::string restOfLine; int i =0; char ch = styler.SafeGetCharAt(start, '\n'); - while ((ch != '\r') && (ch != '\n')) { + int endLine = styler.LineEnd(styler.GetLine(start)); + while (((start+i) < endLine) && (ch != '\r')) { char chNext = styler.SafeGetCharAt(start + i + 1, '\n'); if (ch == '/' && (chNext == '/' || chNext == '*')) break; @@ -310,7 +312,9 @@ struct OptionSetCPP : public OptionSet<OptionsCPP> { } }; -class LexerCPP : public ILexer { +static const char styleSubable[] = {SCE_C_IDENTIFIER, SCE_C_COMMENTDOCKEYWORD, 0}; + +class LexerCPP : public ILexerWithSubStyles { bool caseSensitive; CharacterSet setWord; CharacterSet setNegationOp; @@ -329,6 +333,8 @@ class LexerCPP : public ILexer { OptionSetCPP osCPP; SparseState<std::string> rawStringTerminators; enum { activeFlag = 0x40 }; + enum { ssIdentifier, ssDocKeyword }; + SubStyles subStyles; public: LexerCPP(bool caseSensitive_) : caseSensitive(caseSensitive_), @@ -336,7 +342,8 @@ public: setNegationOp(CharacterSet::setNone, "!"), setArithmethicOp(CharacterSet::setNone, "+-/*%"), setRelOp(CharacterSet::setNone, "=!<>"), - setLogicalOp(CharacterSet::setNone, "|&") { + setLogicalOp(CharacterSet::setNone, "|&"), + subStyles(styleSubable, 0x80, 0x40, activeFlag) { } virtual ~LexerCPP() { } @@ -344,7 +351,7 @@ public: delete this; } int SCI_METHOD Version() const { - return lvOriginal; + return lvSubStyles; } const char * SCI_METHOD PropertyNames() { return osCPP.PropertyNames(); @@ -367,6 +374,32 @@ public: return 0; } + int SCI_METHOD LineEndTypesSupported() { + return SC_LINE_END_TYPE_UNICODE; + }; + + int SCI_METHOD 
AllocateSubStyles(int styleBase, int numberStyles) { + return subStyles.Allocate(styleBase, numberStyles); + } + int SCI_METHOD SubStylesStart(int styleBase) { + return subStyles.Start(styleBase); + } + int SCI_METHOD SubStylesLength(int styleBase) { + return subStyles.Length(styleBase); + } + void SCI_METHOD FreeSubStyles() { + subStyles.Free(); + } + void SCI_METHOD SetIdentifiers(int style, const char *identifiers) { + subStyles.SetIdentifiers(style, identifiers); + } + int SCI_METHOD DistanceToSecondaryStyles() { + return activeFlag; + } + const char * SCI_METHOD GetSubStyleBases() { + return styleSubable; + } + static ILexer *LexerFactoryCPP() { return new LexerCPP(true); } @@ -479,15 +512,10 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle, (MaskActive(initStyle) == SCE_C_COMMENTLINEDOC)) { // Set continuationLine if last character of previous line is '\' if (lineCurrent > 0) { - int chBack = styler.SafeGetCharAt(startPos-1, 0); - int chBack2 = styler.SafeGetCharAt(startPos-2, 0); - int lineEndChar = '!'; - if (chBack2 == '\r' && chBack == '\n') { - lineEndChar = styler.SafeGetCharAt(startPos-3, 0); - } else if (chBack == '\n' || chBack == '\r') { - lineEndChar = chBack2; + int endLinePrevious = styler.LineEnd(lineCurrent - 1); + if (endLinePrevious > 0) { + continuationLine = styler.SafeGetCharAt(endLinePrevious-1) == '\\'; } - continuationLine = lineEndChar == '\\'; } } @@ -501,7 +529,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle, } } - StyleContext sc(startPos, length, initStyle, styler, 0x7f); + StyleContext sc(startPos, length, initStyle, styler, static_cast<char>(0xff)); LinePPState preproc = vlls.ForLine(lineCurrent); bool definitionsChanged = false; @@ -527,6 +555,11 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle, int activitySet = preproc.IsInactive() ? 
activeFlag : 0; + const WordClassifier &classifierIdentifiers = subStyles.Classifier(SCE_C_IDENTIFIER); + const WordClassifier &classifierDocKeyWords = subStyles.Classifier(SCE_C_COMMENTDOCKEYWORD); + + int lineEndNext = styler.LineEnd(lineCurrent); + for (; sc.More();) { if (sc.atLineStart) { @@ -554,6 +587,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle, if (sc.atLineEnd) { lineCurrent++; + lineEndNext = styler.LineEnd(lineCurrent); vlls.Add(lineCurrent, preproc); if (rawStringTerminator != "") { rawSTNew.Set(lineCurrent-1, rawStringTerminator); @@ -562,11 +596,13 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle, // Handle line continuation generically. if (sc.ch == '\\') { - if (sc.chNext == '\n' || sc.chNext == '\r') { + if (static_cast<int>((sc.currentPos+1)) >= lineEndNext) { lineCurrent++; + lineEndNext = styler.LineEnd(lineCurrent); vlls.Add(lineCurrent, preproc); sc.Forward(); if (sc.ch == '\r' && sc.chNext == '\n') { + // Even in UTF-8, \r and \n are separate sc.Forward(); } continuationLine = true; @@ -591,7 +627,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle, } break; case SCE_C_IDENTIFIER: - if (!setWord.Contains(sc.ch) || (sc.ch == '.')) { + if (sc.atLineStart || sc.atLineEnd || !setWord.Contains(sc.ch) || (sc.ch == '.')) { char s[1000]; if (caseSensitive) { sc.GetCurrent(s, sizeof(s)); @@ -605,6 +641,11 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle, sc.ChangeState(SCE_C_WORD2|activitySet); } else if (keywords4.InList(s)) { sc.ChangeState(SCE_C_GLOBALCLASS|activitySet); + } else { + int subStyle = classifierIdentifiers.ValueFor(s); + if (subStyle >= 0) { + sc.ChangeState(subStyle|activitySet); + } } const bool literalString = sc.ch == '\"'; if (literalString || sc.ch == '\'') { @@ -697,8 +738,15 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle, } else { sc.GetCurrentLowered(s, 
sizeof(s)); } - if (!IsASpace(sc.ch) || !keywords3.InList(s + 1)) { + if (!IsASpace(sc.ch)) { sc.ChangeState(SCE_C_COMMENTDOCKEYWORDERROR|activitySet); + } else if (!keywords3.InList(s + 1)) { + int subStyleCDKW = classifierDocKeyWords.ValueFor(s+1); + if (subStyleCDKW >= 0) { + sc.ChangeState(subStyleCDKW|activitySet); + } else { + sc.ChangeState(SCE_C_COMMENTDOCKEYWORDERROR|activitySet); + } } sc.SetState(styleBeforeDCKeyword|activitySet); } @@ -755,7 +803,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle, while ((sc.ch < 0x80) && islower(sc.ch)) sc.Forward(); // gobble regex flags sc.SetState(SCE_C_DEFAULT|activitySet); - } else if (sc.ch == '\\' && (sc.chNext != '\n' && sc.chNext != '\r')) { + } else if (sc.ch == '\\' && (static_cast<int>(sc.currentPos+1) < lineEndNext)) { // Gobble up the escaped character sc.Forward(); } else if (sc.ch == '[') { @@ -787,7 +835,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle, } break; case SCE_C_UUID: - if (sc.ch == '\r' || sc.ch == '\n' || sc.ch == ')') { + if (sc.atLineEnd || sc.ch == ')') { sc.SetState(SCE_C_DEFAULT|activitySet); } } @@ -795,6 +843,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle, if (sc.atLineEnd && !atLineEndBeforeSwitch) { // State exit processing consumed characters up to end of line. 
lineCurrent++; + lineEndNext = styler.LineEnd(lineCurrent); vlls.Add(lineCurrent, preproc); } @@ -816,7 +865,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle, } else { sc.SetState(SCE_C_NUMBER|activitySet); } - } else if (setWordStart.Contains(sc.ch) || (sc.ch == '@')) { + } else if (!sc.atLineEnd && (setWordStart.Contains(sc.ch) || (sc.ch == '@'))) { if (lastWordWasUUID) { sc.SetState(SCE_C_UUID|activitySet); lastWordWasUUID = false; @@ -945,7 +994,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle, } } } - } else if (isoperator(static_cast<char>(sc.ch))) { + } else if (isoperator(sc.ch)) { sc.SetState(SCE_C_OPERATOR|activitySet); } } @@ -980,6 +1029,7 @@ void SCI_METHOD LexerCPP::Fold(unsigned int startPos, int length, int initStyle, int levelCurrent = SC_FOLDLEVELBASE; if (lineCurrent > 0) levelCurrent = styler.LevelAt(lineCurrent-1) >> 16; + unsigned int lineStartNext = styler.LineStart(lineCurrent+1); int levelMinCurrent = levelCurrent; int levelNext = levelCurrent; char chNext = styler[startPos]; @@ -992,7 +1042,7 @@ void SCI_METHOD LexerCPP::Fold(unsigned int startPos, int length, int initStyle, int stylePrev = style; style = styleNext; styleNext = MaskActive(styler.StyleAt(i + 1)); - bool atEOL = (ch == '\r' && chNext != '\n') || (ch == '\n'); + bool atEOL = i == (lineStartNext-1); if (options.foldComment && options.foldCommentMultiline && IsStreamCommentStyle(style)) { if (!IsStreamCommentStyle(stylePrev) && (stylePrev != SCE_C_COMMENTLINEDOC)) { levelNext++; @@ -1060,6 +1110,7 @@ void SCI_METHOD LexerCPP::Fold(unsigned int startPos, int length, int initStyle, styler.SetLevel(lineCurrent, lev); } lineCurrent++; + lineStartNext = styler.LineStart(lineCurrent+1); levelCurrent = levelNext; levelMinCurrent = levelCurrent; if (atEOL && (i == static_cast<unsigned int>(styler.Length()-1))) { diff --git a/test/simpleTests.py b/test/simpleTests.py index 9085bcf87..04c9ed145 100644 --- 
a/test/simpleTests.py +++ b/test/simpleTests.py @@ -282,6 +282,201 @@ class TestSimple(unittest.TestCase): self.assertEquals(self.ed.Contents(), b"x" + lineEnds[lineEndType] + b"y") self.assertEquals(self.ed.LineLength(0), 1 + len(lineEnds[lineEndType])) + # Several tests for unicode line ends U+2028 and U+2029 + + def testUnicodeLineEnds(self): + # Add two lines separated with U+2028 and ensure it is seen as two lines + # Then remove U+2028 and should be just 1 lines + self.ed.Lexer = self.ed.SCLEX_CPP + self.ed.SetCodePage(65001) + self.ed.SetLineEndTypesAllowed(1) + self.ed.AddText(5, b"x\xe2\x80\xa8y") + self.assertEquals(self.ed.LineCount, 2) + self.assertEquals(self.ed.GetLineEndPosition(0), 1) + self.assertEquals(self.ed.GetLineEndPosition(1), 5) + self.assertEquals(self.ed.LineLength(0), 4) + self.assertEquals(self.ed.LineLength(1), 1) + self.ed.TargetStart = 1 + self.ed.TargetEnd = 4 + self.ed.ReplaceTarget(0, b"") + self.assertEquals(self.ed.LineCount, 1) + self.assertEquals(self.ed.LineLength(0), 2) + self.assertEquals(self.ed.GetLineEndPosition(0), 2) + + def testUnicodeLineEndsWithCodePage0(self): + # Try the Unicode line ends when not in Unicode mode -> should remain 1 line + self.ed.SetCodePage(0) + self.ed.AddText(5, b"x\xe2\x80\xa8y") + self.assertEquals(self.ed.LineCount, 1) + self.ed.AddText(4, b"x\xc2\x85y") + self.assertEquals(self.ed.LineCount, 1) + + def testUnicodeLineEndsSwitchToUnicodeAndBack(self): + # Add the Unicode line ends when not in Unicode mode + self.ed.SetCodePage(0) + self.ed.AddText(5, b"x\xe2\x80\xa8y") + self.assertEquals(self.ed.LineCount, 1) + # Into UTF-8 mode - should now be interpreting as two lines + self.ed.Lexer = self.ed.SCLEX_CPP + self.ed.SetCodePage(65001) + self.ed.SetLineEndTypesAllowed(1) + self.assertEquals(self.ed.LineCount, 2) + # Back to code page 0 and 1 line + self.ed.SetCodePage(0) + self.assertEquals(self.ed.LineCount, 1) + + def testUFragmentedEOLCompletion(self): + # Add 2 starting bytes of UTF-8 
line end then complete it + self.ed.ClearAll() + self.ed.AddText(4, b"x\xe2\x80y") + self.assertEquals(self.ed.LineCount, 1) + self.assertEquals(self.ed.GetLineEndPosition(0), 4) + self.ed.SetSel(3,3) + self.ed.AddText(1, b"\xa8") + self.assertEquals(self.ed.Contents(), b"x\xe2\x80\xa8y") + self.assertEquals(self.ed.LineCount, 2) + + # Add 1 starting bytes of UTF-8 line end then complete it + self.ed.ClearAll() + self.ed.AddText(3, b"x\xe2y") + self.assertEquals(self.ed.LineCount, 1) + self.assertEquals(self.ed.GetLineEndPosition(0), 3) + self.ed.SetSel(2,2) + self.ed.AddText(2, b"\x80\xa8") + self.assertEquals(self.ed.Contents(), b"x\xe2\x80\xa8y") + self.assertEquals(self.ed.LineCount, 2) + + def testUFragmentedEOLStart(self): + # Add end of UTF-8 line end then insert start + self.ed.Lexer = self.ed.SCLEX_CPP + self.ed.SetCodePage(65001) + self.ed.SetLineEndTypesAllowed(1) + self.assertEquals(self.ed.LineCount, 1) + self.ed.AddText(4, b"x\x80\xa8y") + self.assertEquals(self.ed.LineCount, 1) + self.ed.SetSel(1,1) + self.ed.AddText(1, b"\xe2") + self.assertEquals(self.ed.LineCount, 2) + + def testUBreakApartEOL(self): + # Add two lines separated by U+2029 then remove and add back each byte ensuring + # only one line after each removal of any byte in line end and 2 lines after reinsertion + self.ed.Lexer = self.ed.SCLEX_CPP + self.ed.SetCodePage(65001) + self.ed.SetLineEndTypesAllowed(1) + text = b"x\xe2\x80\xa9y"; + self.ed.AddText(5, text) + self.assertEquals(self.ed.LineCount, 2) + + for i in range(len(text)): + self.ed.TargetStart = i + self.ed.TargetEnd = i + 1 + self.ed.ReplaceTarget(0, b"") + if i in [0, 4]: + # Removing text characters does not change number of lines + self.assertEquals(self.ed.LineCount, 2) + else: + # Removing byte from line end, removes 1 line + self.assertEquals(self.ed.LineCount, 1) + + self.ed.TargetEnd = i + self.ed.ReplaceTarget(1, text[i:i+1]) + self.assertEquals(self.ed.LineCount, 2) + + def testURemoveEOLFragment(self): + # Add 
UTF-8 line end then delete each byte causing line end to disappear + self.ed.Lexer = self.ed.SCLEX_CPP + self.ed.SetCodePage(65001) + self.ed.SetLineEndTypesAllowed(1) + for i in range(3): + self.ed.ClearAll() + self.ed.AddText(5, b"x\xe2\x80\xa8y") + self.assertEquals(self.ed.LineCount, 2) + self.ed.TargetStart = i+1 + self.ed.TargetEnd = i+2 + self.ed.ReplaceTarget(0, b"") + self.assertEquals(self.ed.LineCount, 1) + + # Several tests for unicode NEL line ends U+0085 + + def testNELLineEnds(self): + # Add two lines separated with U+0085 and ensure it is seen as two lines + # Then remove U+0085 and should be just 1 lines + self.ed.Lexer = self.ed.SCLEX_CPP + self.ed.SetCodePage(65001) + self.ed.SetLineEndTypesAllowed(1) + self.ed.AddText(4, b"x\xc2\x85y") + self.assertEquals(self.ed.LineCount, 2) + self.assertEquals(self.ed.GetLineEndPosition(0), 1) + self.assertEquals(self.ed.GetLineEndPosition(1), 4) + self.assertEquals(self.ed.LineLength(0), 3) + self.assertEquals(self.ed.LineLength(1), 1) + self.ed.TargetStart = 1 + self.ed.TargetEnd = 3 + self.ed.ReplaceTarget(0, b"") + self.assertEquals(self.ed.LineCount, 1) + self.assertEquals(self.ed.LineLength(0), 2) + self.assertEquals(self.ed.GetLineEndPosition(0), 2) + + def testNELFragmentedEOLCompletion(self): + # Add starting byte of UTF-8 NEL then complete it + self.ed.AddText(3, b"x\xc2y") + self.assertEquals(self.ed.LineCount, 1) + self.assertEquals(self.ed.GetLineEndPosition(0), 3) + self.ed.SetSel(2,2) + self.ed.AddText(1, b"\x85") + self.assertEquals(self.ed.Contents(), b"x\xc2\x85y") + self.assertEquals(self.ed.LineCount, 2) + + def testNELFragmentedEOLStart(self): + # Add end of UTF-8 NEL then insert start + self.ed.Lexer = self.ed.SCLEX_CPP + self.ed.SetCodePage(65001) + self.ed.SetLineEndTypesAllowed(1) + self.assertEquals(self.ed.LineCount, 1) + self.ed.AddText(4, b"x\x85y") + self.assertEquals(self.ed.LineCount, 1) + self.ed.SetSel(1,1) + self.ed.AddText(1, b"\xc2") + self.assertEquals(self.ed.LineCount, 
2) + + def testNELBreakApartEOL(self): + # Add two lines separated by U+0085 then remove and add back each byte ensuring + # only one line after each removal of any byte in line end and 2 lines after reinsertion + self.ed.Lexer = self.ed.SCLEX_CPP + self.ed.SetCodePage(65001) + self.ed.SetLineEndTypesAllowed(1) + text = b"x\xc2\x85y"; + self.ed.AddText(4, text) + self.assertEquals(self.ed.LineCount, 2) + + for i in range(len(text)): + self.ed.TargetStart = i + self.ed.TargetEnd = i + 1 + self.ed.ReplaceTarget(0, b"") + if i in [0, 3]: + # Removing text characters does not change number of lines + self.assertEquals(self.ed.LineCount, 2) + else: + # Removing byte from line end, removes 1 line + self.assertEquals(self.ed.LineCount, 1) + + self.ed.TargetEnd = i + self.ed.ReplaceTarget(1, text[i:i+1]) + self.assertEquals(self.ed.LineCount, 2) + + def testNELRemoveEOLFragment(self): + # Add UTF-8 NEL then delete each byte causing line end to disappear + self.ed.SetCodePage(65001) + for i in range(2): + self.ed.ClearAll() + self.ed.AddText(4, b"x\xc2\x85y") + self.assertEquals(self.ed.LineCount, 2) + self.ed.TargetStart = i+1 + self.ed.TargetEnd = i+2 + self.ed.ReplaceTarget(0, b"") + self.assertEquals(self.ed.LineCount, 1) + def testGoto(self): self.ed.AddText(5, b"a\nb\nc") self.assertEquals(self.ed.CurrentPos, 5) |