aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authornyamatongwe <unknown>2013-01-19 12:34:07 +1100
committernyamatongwe <unknown>2013-01-19 12:34:07 +1100
commitb0b690ee11bb21e262561d89683444db4e697cf5 (patch)
tree4e7042ebb4d5e0bab1721fe0ec8cfac754f22da8
parent5d17740fdedcea321a23ffd3350aa7adbf4c2329 (diff)
downloadscintilla-mirror-b0b690ee11bb21e262561d89683444db4e697cf5.tar.gz
Implement Unicode line ends and substyles in C++ lexer.
-rw-r--r--lexers/LexCPP.cxx93
-rw-r--r--test/simpleTests.py195
2 files changed, 267 insertions, 21 deletions
diff --git a/lexers/LexCPP.cxx b/lexers/LexCPP.cxx
index 87482176d..f30cdfbdd 100644
--- a/lexers/LexCPP.cxx
+++ b/lexers/LexCPP.cxx
@@ -30,6 +30,7 @@
#include "LexerModule.h"
#include "OptionSet.h"
#include "SparseState.h"
+#include "SubStyles.h"
#ifdef SCI_NAMESPACE
using namespace Scintilla;
@@ -87,7 +88,8 @@ static std::string GetRestOfLine(LexAccessor &styler, int start, bool allowSpace
std::string restOfLine;
int i =0;
char ch = styler.SafeGetCharAt(start, '\n');
- while ((ch != '\r') && (ch != '\n')) {
+ int endLine = styler.LineEnd(styler.GetLine(start));
+ while (((start+i) < endLine) && (ch != '\r')) {
char chNext = styler.SafeGetCharAt(start + i + 1, '\n');
if (ch == '/' && (chNext == '/' || chNext == '*'))
break;
@@ -310,7 +312,9 @@ struct OptionSetCPP : public OptionSet<OptionsCPP> {
}
};
-class LexerCPP : public ILexer {
+static const char styleSubable[] = {SCE_C_IDENTIFIER, SCE_C_COMMENTDOCKEYWORD, 0};
+
+class LexerCPP : public ILexerWithSubStyles {
bool caseSensitive;
CharacterSet setWord;
CharacterSet setNegationOp;
@@ -329,6 +333,8 @@ class LexerCPP : public ILexer {
OptionSetCPP osCPP;
SparseState<std::string> rawStringTerminators;
enum { activeFlag = 0x40 };
+ enum { ssIdentifier, ssDocKeyword };
+ SubStyles subStyles;
public:
LexerCPP(bool caseSensitive_) :
caseSensitive(caseSensitive_),
@@ -336,7 +342,8 @@ public:
setNegationOp(CharacterSet::setNone, "!"),
setArithmethicOp(CharacterSet::setNone, "+-/*%"),
setRelOp(CharacterSet::setNone, "=!<>"),
- setLogicalOp(CharacterSet::setNone, "|&") {
+ setLogicalOp(CharacterSet::setNone, "|&"),
+ subStyles(styleSubable, 0x80, 0x40, activeFlag) {
}
virtual ~LexerCPP() {
}
@@ -344,7 +351,7 @@ public:
delete this;
}
int SCI_METHOD Version() const {
- return lvOriginal;
+ return lvSubStyles;
}
const char * SCI_METHOD PropertyNames() {
return osCPP.PropertyNames();
@@ -367,6 +374,32 @@ public:
return 0;
}
+ int SCI_METHOD LineEndTypesSupported() {
+ return SC_LINE_END_TYPE_UNICODE;
+ };
+
+ int SCI_METHOD AllocateSubStyles(int styleBase, int numberStyles) {
+ return subStyles.Allocate(styleBase, numberStyles);
+ }
+ int SCI_METHOD SubStylesStart(int styleBase) {
+ return subStyles.Start(styleBase);
+ }
+ int SCI_METHOD SubStylesLength(int styleBase) {
+ return subStyles.Length(styleBase);
+ }
+ void SCI_METHOD FreeSubStyles() {
+ subStyles.Free();
+ }
+ void SCI_METHOD SetIdentifiers(int style, const char *identifiers) {
+ subStyles.SetIdentifiers(style, identifiers);
+ }
+ int SCI_METHOD DistanceToSecondaryStyles() {
+ return activeFlag;
+ }
+ const char * SCI_METHOD GetSubStyleBases() {
+ return styleSubable;
+ }
+
static ILexer *LexerFactoryCPP() {
return new LexerCPP(true);
}
@@ -479,15 +512,10 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle,
(MaskActive(initStyle) == SCE_C_COMMENTLINEDOC)) {
// Set continuationLine if last character of previous line is '\'
if (lineCurrent > 0) {
- int chBack = styler.SafeGetCharAt(startPos-1, 0);
- int chBack2 = styler.SafeGetCharAt(startPos-2, 0);
- int lineEndChar = '!';
- if (chBack2 == '\r' && chBack == '\n') {
- lineEndChar = styler.SafeGetCharAt(startPos-3, 0);
- } else if (chBack == '\n' || chBack == '\r') {
- lineEndChar = chBack2;
+ int endLinePrevious = styler.LineEnd(lineCurrent - 1);
+ if (endLinePrevious > 0) {
+ continuationLine = styler.SafeGetCharAt(endLinePrevious-1) == '\\';
}
- continuationLine = lineEndChar == '\\';
}
}
@@ -501,7 +529,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle,
}
}
- StyleContext sc(startPos, length, initStyle, styler, 0x7f);
+ StyleContext sc(startPos, length, initStyle, styler, static_cast<char>(0xff));
LinePPState preproc = vlls.ForLine(lineCurrent);
bool definitionsChanged = false;
@@ -527,6 +555,11 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle,
int activitySet = preproc.IsInactive() ? activeFlag : 0;
+ const WordClassifier &classifierIdentifiers = subStyles.Classifier(SCE_C_IDENTIFIER);
+ const WordClassifier &classifierDocKeyWords = subStyles.Classifier(SCE_C_COMMENTDOCKEYWORD);
+
+ int lineEndNext = styler.LineEnd(lineCurrent);
+
for (; sc.More();) {
if (sc.atLineStart) {
@@ -554,6 +587,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle,
if (sc.atLineEnd) {
lineCurrent++;
+ lineEndNext = styler.LineEnd(lineCurrent);
vlls.Add(lineCurrent, preproc);
if (rawStringTerminator != "") {
rawSTNew.Set(lineCurrent-1, rawStringTerminator);
@@ -562,11 +596,13 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle,
// Handle line continuation generically.
if (sc.ch == '\\') {
- if (sc.chNext == '\n' || sc.chNext == '\r') {
+ if (static_cast<int>((sc.currentPos+1)) >= lineEndNext) {
lineCurrent++;
+ lineEndNext = styler.LineEnd(lineCurrent);
vlls.Add(lineCurrent, preproc);
sc.Forward();
if (sc.ch == '\r' && sc.chNext == '\n') {
+ // Even in UTF-8, \r and \n are separate
sc.Forward();
}
continuationLine = true;
@@ -591,7 +627,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle,
}
break;
case SCE_C_IDENTIFIER:
- if (!setWord.Contains(sc.ch) || (sc.ch == '.')) {
+ if (sc.atLineStart || sc.atLineEnd || !setWord.Contains(sc.ch) || (sc.ch == '.')) {
char s[1000];
if (caseSensitive) {
sc.GetCurrent(s, sizeof(s));
@@ -605,6 +641,11 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle,
sc.ChangeState(SCE_C_WORD2|activitySet);
} else if (keywords4.InList(s)) {
sc.ChangeState(SCE_C_GLOBALCLASS|activitySet);
+ } else {
+ int subStyle = classifierIdentifiers.ValueFor(s);
+ if (subStyle >= 0) {
+ sc.ChangeState(subStyle|activitySet);
+ }
}
const bool literalString = sc.ch == '\"';
if (literalString || sc.ch == '\'') {
@@ -697,8 +738,15 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle,
} else {
sc.GetCurrentLowered(s, sizeof(s));
}
- if (!IsASpace(sc.ch) || !keywords3.InList(s + 1)) {
+ if (!IsASpace(sc.ch)) {
sc.ChangeState(SCE_C_COMMENTDOCKEYWORDERROR|activitySet);
+ } else if (!keywords3.InList(s + 1)) {
+ int subStyleCDKW = classifierDocKeyWords.ValueFor(s+1);
+ if (subStyleCDKW >= 0) {
+ sc.ChangeState(subStyleCDKW|activitySet);
+ } else {
+ sc.ChangeState(SCE_C_COMMENTDOCKEYWORDERROR|activitySet);
+ }
}
sc.SetState(styleBeforeDCKeyword|activitySet);
}
@@ -755,7 +803,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle,
while ((sc.ch < 0x80) && islower(sc.ch))
sc.Forward(); // gobble regex flags
sc.SetState(SCE_C_DEFAULT|activitySet);
- } else if (sc.ch == '\\' && (sc.chNext != '\n' && sc.chNext != '\r')) {
+ } else if (sc.ch == '\\' && (static_cast<int>(sc.currentPos+1) < lineEndNext)) {
// Gobble up the escaped character
sc.Forward();
} else if (sc.ch == '[') {
@@ -787,7 +835,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle,
}
break;
case SCE_C_UUID:
- if (sc.ch == '\r' || sc.ch == '\n' || sc.ch == ')') {
+ if (sc.atLineEnd || sc.ch == ')') {
sc.SetState(SCE_C_DEFAULT|activitySet);
}
}
@@ -795,6 +843,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle,
if (sc.atLineEnd && !atLineEndBeforeSwitch) {
// State exit processing consumed characters up to end of line.
lineCurrent++;
+ lineEndNext = styler.LineEnd(lineCurrent);
vlls.Add(lineCurrent, preproc);
}
@@ -816,7 +865,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle,
} else {
sc.SetState(SCE_C_NUMBER|activitySet);
}
- } else if (setWordStart.Contains(sc.ch) || (sc.ch == '@')) {
+ } else if (!sc.atLineEnd && (setWordStart.Contains(sc.ch) || (sc.ch == '@'))) {
if (lastWordWasUUID) {
sc.SetState(SCE_C_UUID|activitySet);
lastWordWasUUID = false;
@@ -945,7 +994,7 @@ void SCI_METHOD LexerCPP::Lex(unsigned int startPos, int length, int initStyle,
}
}
}
- } else if (isoperator(static_cast<char>(sc.ch))) {
+ } else if (isoperator(sc.ch)) {
sc.SetState(SCE_C_OPERATOR|activitySet);
}
}
@@ -980,6 +1029,7 @@ void SCI_METHOD LexerCPP::Fold(unsigned int startPos, int length, int initStyle,
int levelCurrent = SC_FOLDLEVELBASE;
if (lineCurrent > 0)
levelCurrent = styler.LevelAt(lineCurrent-1) >> 16;
+ unsigned int lineStartNext = styler.LineStart(lineCurrent+1);
int levelMinCurrent = levelCurrent;
int levelNext = levelCurrent;
char chNext = styler[startPos];
@@ -992,7 +1042,7 @@ void SCI_METHOD LexerCPP::Fold(unsigned int startPos, int length, int initStyle,
int stylePrev = style;
style = styleNext;
styleNext = MaskActive(styler.StyleAt(i + 1));
- bool atEOL = (ch == '\r' && chNext != '\n') || (ch == '\n');
+ bool atEOL = i == (lineStartNext-1);
if (options.foldComment && options.foldCommentMultiline && IsStreamCommentStyle(style)) {
if (!IsStreamCommentStyle(stylePrev) && (stylePrev != SCE_C_COMMENTLINEDOC)) {
levelNext++;
@@ -1060,6 +1110,7 @@ void SCI_METHOD LexerCPP::Fold(unsigned int startPos, int length, int initStyle,
styler.SetLevel(lineCurrent, lev);
}
lineCurrent++;
+ lineStartNext = styler.LineStart(lineCurrent+1);
levelCurrent = levelNext;
levelMinCurrent = levelCurrent;
if (atEOL && (i == static_cast<unsigned int>(styler.Length()-1))) {
diff --git a/test/simpleTests.py b/test/simpleTests.py
index 9085bcf87..04c9ed145 100644
--- a/test/simpleTests.py
+++ b/test/simpleTests.py
@@ -282,6 +282,201 @@ class TestSimple(unittest.TestCase):
self.assertEquals(self.ed.Contents(), b"x" + lineEnds[lineEndType] + b"y")
self.assertEquals(self.ed.LineLength(0), 1 + len(lineEnds[lineEndType]))
+ # Several tests for unicode line ends U+2028 and U+2029
+
+ def testUnicodeLineEnds(self):
+ # Add two lines separated with U+2028 and ensure it is seen as two lines
+ # Then remove U+2028 and there should be just 1 line
+ self.ed.Lexer = self.ed.SCLEX_CPP
+ self.ed.SetCodePage(65001)
+ self.ed.SetLineEndTypesAllowed(1)
+ self.ed.AddText(5, b"x\xe2\x80\xa8y")
+ self.assertEquals(self.ed.LineCount, 2)
+ self.assertEquals(self.ed.GetLineEndPosition(0), 1)
+ self.assertEquals(self.ed.GetLineEndPosition(1), 5)
+ self.assertEquals(self.ed.LineLength(0), 4)
+ self.assertEquals(self.ed.LineLength(1), 1)
+ self.ed.TargetStart = 1
+ self.ed.TargetEnd = 4
+ self.ed.ReplaceTarget(0, b"")
+ self.assertEquals(self.ed.LineCount, 1)
+ self.assertEquals(self.ed.LineLength(0), 2)
+ self.assertEquals(self.ed.GetLineEndPosition(0), 2)
+
+ def testUnicodeLineEndsWithCodePage0(self):
+ # Try the Unicode line ends when not in Unicode mode -> should remain 1 line
+ self.ed.SetCodePage(0)
+ self.ed.AddText(5, b"x\xe2\x80\xa8y")
+ self.assertEquals(self.ed.LineCount, 1)
+ self.ed.AddText(4, b"x\xc2\x85y")
+ self.assertEquals(self.ed.LineCount, 1)
+
+ def testUnicodeLineEndsSwitchToUnicodeAndBack(self):
+ # Add the Unicode line ends when not in Unicode mode
+ self.ed.SetCodePage(0)
+ self.ed.AddText(5, b"x\xe2\x80\xa8y")
+ self.assertEquals(self.ed.LineCount, 1)
+ # Into UTF-8 mode - should now be interpreting as two lines
+ self.ed.Lexer = self.ed.SCLEX_CPP
+ self.ed.SetCodePage(65001)
+ self.ed.SetLineEndTypesAllowed(1)
+ self.assertEquals(self.ed.LineCount, 2)
+ # Back to code page 0 and 1 line
+ self.ed.SetCodePage(0)
+ self.assertEquals(self.ed.LineCount, 1)
+
+ def testUFragmentedEOLCompletion(self):
+ # Add 2 starting bytes of UTF-8 line end then complete it
+ self.ed.ClearAll()
+ self.ed.AddText(4, b"x\xe2\x80y")
+ self.assertEquals(self.ed.LineCount, 1)
+ self.assertEquals(self.ed.GetLineEndPosition(0), 4)
+ self.ed.SetSel(3,3)
+ self.ed.AddText(1, b"\xa8")
+ self.assertEquals(self.ed.Contents(), b"x\xe2\x80\xa8y")
+ self.assertEquals(self.ed.LineCount, 2)
+
+ # Add 1 starting byte of UTF-8 line end then complete it
+ self.ed.ClearAll()
+ self.ed.AddText(3, b"x\xe2y")
+ self.assertEquals(self.ed.LineCount, 1)
+ self.assertEquals(self.ed.GetLineEndPosition(0), 3)
+ self.ed.SetSel(2,2)
+ self.ed.AddText(2, b"\x80\xa8")
+ self.assertEquals(self.ed.Contents(), b"x\xe2\x80\xa8y")
+ self.assertEquals(self.ed.LineCount, 2)
+
+ def testUFragmentedEOLStart(self):
+ # Add end of UTF-8 line end then insert start
+ self.ed.Lexer = self.ed.SCLEX_CPP
+ self.ed.SetCodePage(65001)
+ self.ed.SetLineEndTypesAllowed(1)
+ self.assertEquals(self.ed.LineCount, 1)
+ self.ed.AddText(4, b"x\x80\xa8y")
+ self.assertEquals(self.ed.LineCount, 1)
+ self.ed.SetSel(1,1)
+ self.ed.AddText(1, b"\xe2")
+ self.assertEquals(self.ed.LineCount, 2)
+
+ def testUBreakApartEOL(self):
+ # Add two lines separated by U+2029 then remove and add back each byte ensuring
+ # only one line after each removal of any byte in line end and 2 lines after reinsertion
+ self.ed.Lexer = self.ed.SCLEX_CPP
+ self.ed.SetCodePage(65001)
+ self.ed.SetLineEndTypesAllowed(1)
+ text = b"x\xe2\x80\xa9y";
+ self.ed.AddText(5, text)
+ self.assertEquals(self.ed.LineCount, 2)
+
+ for i in range(len(text)):
+ self.ed.TargetStart = i
+ self.ed.TargetEnd = i + 1
+ self.ed.ReplaceTarget(0, b"")
+ if i in [0, 4]:
+ # Removing text characters does not change number of lines
+ self.assertEquals(self.ed.LineCount, 2)
+ else:
+ # Removing a byte from the line end removes 1 line
+ self.assertEquals(self.ed.LineCount, 1)
+
+ self.ed.TargetEnd = i
+ self.ed.ReplaceTarget(1, text[i:i+1])
+ self.assertEquals(self.ed.LineCount, 2)
+
+ def testURemoveEOLFragment(self):
+ # Add UTF-8 line end then delete each byte causing line end to disappear
+ self.ed.Lexer = self.ed.SCLEX_CPP
+ self.ed.SetCodePage(65001)
+ self.ed.SetLineEndTypesAllowed(1)
+ for i in range(3):
+ self.ed.ClearAll()
+ self.ed.AddText(5, b"x\xe2\x80\xa8y")
+ self.assertEquals(self.ed.LineCount, 2)
+ self.ed.TargetStart = i+1
+ self.ed.TargetEnd = i+2
+ self.ed.ReplaceTarget(0, b"")
+ self.assertEquals(self.ed.LineCount, 1)
+
+ # Several tests for unicode NEL line ends U+0085
+
+ def testNELLineEnds(self):
+ # Add two lines separated with U+0085 and ensure it is seen as two lines
+ # Then remove U+0085 and there should be just 1 line
+ self.ed.Lexer = self.ed.SCLEX_CPP
+ self.ed.SetCodePage(65001)
+ self.ed.SetLineEndTypesAllowed(1)
+ self.ed.AddText(4, b"x\xc2\x85y")
+ self.assertEquals(self.ed.LineCount, 2)
+ self.assertEquals(self.ed.GetLineEndPosition(0), 1)
+ self.assertEquals(self.ed.GetLineEndPosition(1), 4)
+ self.assertEquals(self.ed.LineLength(0), 3)
+ self.assertEquals(self.ed.LineLength(1), 1)
+ self.ed.TargetStart = 1
+ self.ed.TargetEnd = 3
+ self.ed.ReplaceTarget(0, b"")
+ self.assertEquals(self.ed.LineCount, 1)
+ self.assertEquals(self.ed.LineLength(0), 2)
+ self.assertEquals(self.ed.GetLineEndPosition(0), 2)
+
+ def testNELFragmentedEOLCompletion(self):
+ # Add starting byte of UTF-8 NEL then complete it
+ self.ed.AddText(3, b"x\xc2y")
+ self.assertEquals(self.ed.LineCount, 1)
+ self.assertEquals(self.ed.GetLineEndPosition(0), 3)
+ self.ed.SetSel(2,2)
+ self.ed.AddText(1, b"\x85")
+ self.assertEquals(self.ed.Contents(), b"x\xc2\x85y")
+ self.assertEquals(self.ed.LineCount, 2)
+
+ def testNELFragmentedEOLStart(self):
+ # Add end of UTF-8 NEL then insert start
+ self.ed.Lexer = self.ed.SCLEX_CPP
+ self.ed.SetCodePage(65001)
+ self.ed.SetLineEndTypesAllowed(1)
+ self.assertEquals(self.ed.LineCount, 1)
+ self.ed.AddText(4, b"x\x85y")
+ self.assertEquals(self.ed.LineCount, 1)
+ self.ed.SetSel(1,1)
+ self.ed.AddText(1, b"\xc2")
+ self.assertEquals(self.ed.LineCount, 2)
+
+ def testNELBreakApartEOL(self):
+ # Add two lines separated by U+0085 then remove and add back each byte ensuring
+ # only one line after each removal of any byte in line end and 2 lines after reinsertion
+ self.ed.Lexer = self.ed.SCLEX_CPP
+ self.ed.SetCodePage(65001)
+ self.ed.SetLineEndTypesAllowed(1)
+ text = b"x\xc2\x85y";
+ self.ed.AddText(4, text)
+ self.assertEquals(self.ed.LineCount, 2)
+
+ for i in range(len(text)):
+ self.ed.TargetStart = i
+ self.ed.TargetEnd = i + 1
+ self.ed.ReplaceTarget(0, b"")
+ if i in [0, 3]:
+ # Removing text characters does not change number of lines
+ self.assertEquals(self.ed.LineCount, 2)
+ else:
+ # Removing a byte from the line end removes 1 line
+ self.assertEquals(self.ed.LineCount, 1)
+
+ self.ed.TargetEnd = i
+ self.ed.ReplaceTarget(1, text[i:i+1])
+ self.assertEquals(self.ed.LineCount, 2)
+
+ def testNELRemoveEOLFragment(self):
+ # Add UTF-8 NEL then delete each byte causing line end to disappear
+ self.ed.SetCodePage(65001)
+ for i in range(2):
+ self.ed.ClearAll()
+ self.ed.AddText(4, b"x\xc2\x85y")
+ self.assertEquals(self.ed.LineCount, 2)
+ self.ed.TargetStart = i+1
+ self.ed.TargetEnd = i+2
+ self.ed.ReplaceTarget(0, b"")
+ self.assertEquals(self.ed.LineCount, 1)
+
def testGoto(self):
self.ed.AddText(5, b"a\nb\nc")
self.assertEquals(self.ed.CurrentPos, 5)