aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author nyamatongwe <unknown> 2013-01-19 12:33:20 +1100
committer nyamatongwe <unknown> 2013-01-19 12:33:20 +1100
commit 5d17740fdedcea321a23ffd3350aa7adbf4c2329 (patch)
tree 1512465a2bbf066e96eb1ae2d10fdf3fc7dbd42b
parent f46c96ecb682ad736453f78f6709fca6c6911886 (diff)
download scintilla-mirror-5d17740fdedcea321a23ffd3350aa7adbf4c2329.tar.gz
Implement generic support for Unicode line ends and sub styles in lexer support classes.
-rw-r--r-- lexlib/LexAccessor.h 21
-rw-r--r-- lexlib/StyleContext.h 97
-rw-r--r-- lexlib/SubStyles.h 158
3 files changed, 260 insertions, 16 deletions
diff --git a/lexlib/LexAccessor.h b/lexlib/LexAccessor.h
index 6458525cc..59ae11346 100644
--- a/lexlib/LexAccessor.h
+++ b/lexlib/LexAccessor.h
@@ -12,6 +12,8 @@
namespace Scintilla {
#endif
+enum EncodingType { enc8bit, encUnicode, encDBCS }; // Encoding class of a document; named at namespace scope so code outside LexAccessor can compare against LexAccessor::Encoding().
+
class LexAccessor {
private:
IDocument *pAccess;
@@ -25,7 +27,7 @@ private:
int startPos;
int endPos;
int codePage;
- enum { enc8bit, encUnicode, encDBCS } encodingType;
+ enum EncodingType encodingType;
int lenDoc;
int mask;
char styleBuf[bufferSize];
@@ -91,7 +93,9 @@ public:
bool IsLeadByte(char ch) {
return pAccess->IsDBCSLeadByte(ch);
}
-
+ EncodingType Encoding() const { // Reports the EncodingType held in encodingType.
+ return encodingType;
+ }
bool Match(int pos, const char *s) {
for (int i=0; *s; i++) {
if (*s != SafeGetCharAt(pos+i))
@@ -109,6 +113,19 @@ public:
int LineStart(int line) {
return pAccess->LineStart(line);
}
+ int LineEnd(int line) { // Position of the end of line, before its line end characters.
+ if (documentVersion >= dvLineEnd) { // Document offers its own LineEnd: delegate to it.
+ return (static_cast<IDocumentWithLineEnd *>(pAccess))->LineEnd(line);
+ } else {
+ // Old interface means only '\r', '\n' and '\r\n' line ends.
+ int startNext = pAccess->LineStart(line+1);
+ char chLineEnd = SafeGetCharAt(startNext-1); // Last character before the next line starts.
+ if (chLineEnd == '\n' && (SafeGetCharAt(startNext-2) == '\r'))
+ return startNext - 2; // Two character line end.
+ else
+ return startNext - 1; // Treated as a one character line end. NOTE(review): for a final line with no terminator this looks one too small - confirm what LineStart(line+1) returns there.
+ }
+ }
int LevelAt(int line) {
return pAccess->GetLevel(line);
}
diff --git a/lexlib/StyleContext.h b/lexlib/StyleContext.h
index c2d223e3f..9f1818f21 100644
--- a/lexlib/StyleContext.h
+++ b/lexlib/StyleContext.h
@@ -19,6 +19,30 @@ static inline int MakeLowerCase(int ch) {
return ch - 'A' + 'a';
}
+inline int UnicodeCodePoint(const unsigned char *us) { // Decodes the UTF-8 sequence starting at us[0]; trail bytes are masked, not validated.
+ if (us[0] < 0xC2) { // Below the first accepted lead byte: the byte value itself is returned.
+ return us[0];
+ } else if (us[0] < 0xE0) { // 2 byte sequence: reads us[1].
+ return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F);
+ } else if (us[0] < 0xF0) { // 3 byte sequence: reads us[1] and us[2].
+ return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F);
+ } else if (us[0] < 0xF5) { // 4 byte sequence: reads us[1] to us[3], so the caller must supply 4 bytes.
+ return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F);
+ }
+ return us[0]; // Lead bytes from 0xF5 up are returned as is.
+}
+
+inline int BytesInUnicodeCodePoint(int codePoint) { // Length in bytes (1 to 4) of codePoint when written as UTF-8.
+ if (codePoint < 0x80)
+ return 1;
+ else if (codePoint < 0x800)
+ return 2;
+ else if (codePoint < 0x10000)
+ return 3;
+ else // Anything from 0x10000 up is counted as 4 bytes; no upper bound is checked.
+ return 4;
+}
+
// All languages handled so far can treat all characters >= 0x80 as one class
// which just continues the current token or starts an identifier if in default.
// DBCS treated specially as the second character can be < 0x80 and hence
@@ -27,22 +51,40 @@ class StyleContext {
LexAccessor &styler;
unsigned int endPos;
StyleContext &operator=(const StyleContext &);
+
void GetNextChar(unsigned int pos) {
chNext = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1));
- if (styler.IsLeadByte(static_cast<char>(chNext))) {
- chNext = chNext << 8;
- chNext |= static_cast<unsigned char>(styler.SafeGetCharAt(pos+2));
+ if (styler.Encoding() == encUnicode) { // Decode the whole character that starts at pos+1 into chNext.
+ if (chNext >= 0x80) {
+ unsigned char bytes[4] = { static_cast<unsigned char>(chNext), 0, 0, 0 };
+ for (int trail=1; trail<4; trail++) { // Up to 3 trail bytes, so a 4 byte sequence keeps its final byte.
+ bytes[trail] = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1+trail));
+ if (!((bytes[trail] >= 0x80) && (bytes[trail] < 0xc0))) { // Not a trail byte: it belongs to the following character.
+ bytes[trail] = 0;
+ break;
+ }
+ }
+ chNext = UnicodeCodePoint(bytes);
+ }
+ } else if (styler.Encoding() == encDBCS) { // Lead byte and the byte after it are packed into one value of 0x100 or more.
+ if (styler.IsLeadByte(static_cast<char>(chNext))) {
+ chNext = chNext << 8;
+ chNext |= static_cast<unsigned char>(styler.SafeGetCharAt(pos+2));
+ }
}
// End of line?
// Trigger on CR only (Mac style) or either on LF from CR+LF (Dos/Win)
// or on LF alone (Unix). Avoid triggering two times on Dos/Win.
- atLineEnd = (ch == '\r' && chNext != '\n') ||
- (ch == '\n') ||
- (currentPos >= endPos);
+ if (lineStartNext < styler.Length()) // Another line follows: at line end once pos reaches the position just before it starts.
+ atLineEnd = static_cast<int>(pos) >= (lineStartNext-1);
+ else // Last line
+ atLineEnd = static_cast<int>(pos) >= lineStartNext;
}
public:
unsigned int currentPos;
+ int currentLine; // Line of the current position; set from styler.GetLine(startPos) and incremented by Forward() when a line starts.
+ int lineStartNext; // styler.LineStart(currentLine+1); GetNextChar compares positions against it to set atLineEnd.
bool atLineStart;
bool atLineEnd;
int state;
@@ -55,6 +97,8 @@ public:
styler(styler_),
endPos(startPos + length),
currentPos(startPos),
+ currentLine(-1),
+ lineStartNext(-1),
atLineEnd(false),
state(initStyle & chMask), // Mask off all bits which aren't in the chMask.
chPrev(0),
@@ -62,13 +106,22 @@ public:
chNext(0) {
styler.StartAt(startPos, chMask);
styler.StartSegment(startPos);
- atLineStart = static_cast<unsigned int>(styler.LineStart(styler.GetLine(startPos))) == startPos;
+ currentLine = styler.GetLine(startPos);
+ lineStartNext = styler.LineStart(currentLine+1);
+ atLineStart = static_cast<unsigned int>(styler.LineStart(currentLine)) == startPos;
unsigned int pos = currentPos;
ch = static_cast<unsigned char>(styler.SafeGetCharAt(pos));
- if (styler.IsLeadByte(static_cast<char>(ch))) {
- pos++;
- ch = ch << 8;
- ch |= static_cast<unsigned char>(styler.SafeGetCharAt(pos));
+ if (styler.Encoding() == encUnicode) {
+ // Get the current char
+ GetNextChar(pos-1);
+ ch = chNext;
+ pos += BytesInUnicodeCodePoint(ch) - 1;
+ } else if (styler.Encoding() == encDBCS) {
+ if (styler.IsLeadByte(static_cast<char>(ch))) {
+ pos++;
+ ch = ch << 8;
+ ch |= static_cast<unsigned char>(styler.SafeGetCharAt(pos));
+ }
}
GetNextChar(pos);
}
@@ -82,12 +135,28 @@ public:
void Forward() {
if (currentPos < endPos) {
atLineStart = atLineEnd;
+ if (atLineStart) {
+ currentLine++;
+ lineStartNext = styler.LineStart(currentLine+1);
+ }
chPrev = ch;
- currentPos++;
- if (ch >= 0x100)
+ if (styler.Encoding() == encUnicode) {
+ currentPos += BytesInUnicodeCodePoint(ch);
+ } else if (styler.Encoding() == encDBCS) {
+ currentPos++;
+ if (ch >= 0x100)
+ currentPos++;
+ } else {
currentPos++;
+ }
ch = chNext;
- GetNextChar(currentPos + ((ch >= 0x100) ? 1 : 0));
+ if (styler.Encoding() == encUnicode) {
+ GetNextChar(currentPos + BytesInUnicodeCodePoint(ch)-1);
+ } else if (styler.Encoding() == encDBCS) {
+ GetNextChar(currentPos + ((ch >= 0x100) ? 1 : 0));
+ } else {
+ GetNextChar(currentPos);
+ }
} else {
atLineStart = false;
chPrev = ' ';
diff --git a/lexlib/SubStyles.h b/lexlib/SubStyles.h
new file mode 100644
index 000000000..7dc7804ef
--- /dev/null
+++ b/lexlib/SubStyles.h
@@ -0,0 +1,158 @@
+// Scintilla source code edit control
+/** @file SubStyles.h
+ ** Manage substyles for a lexer.
+ **/
+// Copyright 2012 by Neil Hodgson <neilh@scintilla.org>
+// The License.txt file describes the conditions under which this software may be distributed.
+
+#ifndef SUBSTYLES_H
+#define SUBSTYLES_H
+
+#ifdef SCI_NAMESPACE
+namespace Scintilla {
+#endif
+
+class WordClassifier { // Maps identifier words to styles within one allocated range of style numbers.
+ int firstStyle;
+ int lenStyles;
+ std::map<std::string, int> wordToStyle;
+
+public:
+
+ WordClassifier() : firstStyle(0), lenStyles(0) {
+ }
+
+ void Allocate(int firstStyle_, int lenStyles_) { // Take over the range [firstStyle_, firstStyle_+lenStyles_) and forget all words.
+ firstStyle = firstStyle_;
+ lenStyles = lenStyles_;
+ wordToStyle.clear();
+ }
+
+ int Start() const { // First style of the range.
+ return firstStyle;
+ }
+
+ int Length() const { // Number of styles in the range.
+ return lenStyles;
+ }
+
+ void Clear() { // Back to the unallocated state: empty range, no words.
+ firstStyle = 0;
+ lenStyles = 0;
+ wordToStyle.clear();
+ }
+
+ int ValueFor(const std::string &s) const { // Style registered for the word s, or -1 when s is not registered.
+ std::map<std::string, int>::const_iterator it = wordToStyle.find(s);
+ if (it != wordToStyle.end())
+ return it->second;
+ else
+ return -1;
+ }
+
+ bool IncludesStyle(int style) const { // True when style lies inside the allocated range.
+ return (style >= firstStyle) && (style < (firstStyle + lenStyles));
+ }
+
+ void SetIdentifiers(int style, const char *identifiers) { // Register every word of identifiers (separated by space, tab, CR or LF) under style.
+ while (*identifiers) {
+ const char *cpSpace = identifiers;
+ while (*cpSpace && !(*cpSpace == ' ' || *cpSpace == '\t' || *cpSpace == '\r' || *cpSpace == '\n'))
+ cpSpace++;
+ if (cpSpace > identifiers) // Leading or repeated separators must not register the empty word.
+ wordToStyle[std::string(identifiers, cpSpace - identifiers)] = style;
+ identifiers = cpSpace;
+ if (*identifiers)
+ identifiers++;
+ }
+ }
+};
+
+class SubStyles { // Shares a pool of style numbers between several base styles, with one WordClassifier per base style.
+ int classifications; // Number of base styles, counted from baseStyles by the constructor.
+ const char *baseStyles; // NUL terminated, one base style number per char; the pointer is kept, not copied.
+ int styleFirst; // First style number of the pool.
+ int stylesAvailable; // Size of the pool.
+ int secondaryDistance; // Stored by the constructor; not read by any method of this class.
+ int allocated; // How many styles of the pool have been handed out so far.
+ std::vector<WordClassifier> classifiers; // classifiers[b] belongs to baseStyles[b].
+
+ int BlockFromBaseStyle(int baseStyle) const { // Index of baseStyle within baseStyles, or -1 when it is not listed.
+ for (int b=0; b < classifications; b++) {
+ if (baseStyle == baseStyles[b])
+ return b;
+ }
+ return -1;
+ }
+
+ int BlockFromStyle(int style) const { // Index of the classifier whose range contains style, or -1 when none does.
+ int b = 0;
+ for (std::vector<WordClassifier>::const_iterator it=classifiers.begin(); it != classifiers.end(); ++it) {
+ if (it->IncludesStyle(style))
+ return b;
+ b++;
+ }
+ return -1;
+ }
+
+public:
+
+ SubStyles(const char *baseStyles_, int styleFirst_, int stylesAvailable_, int secondaryDistance_) :
+ classifications(0),
+ baseStyles(baseStyles_),
+ styleFirst(styleFirst_),
+ stylesAvailable(stylesAvailable_),
+ secondaryDistance(secondaryDistance_),
+ allocated(0) {
+ while (baseStyles[classifications]) { // One empty classifier per listed base style.
+ classifications++;
+ classifiers.push_back(WordClassifier());
+ }
+ }
+
+ int Allocate(int styleBase, int numberStyles) { // Reserve numberStyles consecutive styles for styleBase; returns the first one, or -1 when styleBase is not listed or the pool is too small.
+ int block = BlockFromBaseStyle(styleBase);
+ if (block >= 0) {
+ if ((allocated + numberStyles) > stylesAvailable)
+ return -1;
+ int startBlock = styleFirst + allocated;
+ allocated += numberStyles;
+ classifiers[block].Allocate(startBlock, numberStyles);
+ return startBlock;
+ } else {
+ return -1;
+ }
+ }
+
+ int Start(int styleBase) { // First style allocated to styleBase, or -1 when styleBase is not listed.
+ int block = BlockFromBaseStyle(styleBase);
+ return (block >= 0) ? classifiers[block].Start() : -1;
+ }
+
+ int Length(int styleBase) { // Number of styles allocated to styleBase, or 0 when styleBase is not listed.
+ int block = BlockFromBaseStyle(styleBase);
+ return (block >= 0) ? classifiers[block].Length() : 0;
+ }
+
+ void SetIdentifiers(int style, const char *identifiers) { // Forwarded to the classifier whose range contains style; ignored when no range contains it.
+ int block = BlockFromStyle(style);
+ if (block >= 0)
+ classifiers[block].SetIdentifiers(style, identifiers);
+ }
+
+ void Free() { // Return every style to the pool and clear all classifiers.
+ allocated = 0;
+ for (std::vector<WordClassifier>::iterator it=classifiers.begin(); it != classifiers.end(); ++it)
+ it->Clear();
+ }
+
+ const WordClassifier &Classifier(int baseStyle) const { // Classifier for baseStyle; requires that at least one base style was listed.
+ const int block = BlockFromBaseStyle(baseStyle);
+ return classifiers[block >= 0 ? block : 0]; // Base style not listed: fall back to the first classifier instead of indexing with -1.
+ }
+};
+
+#ifdef SCI_NAMESPACE
+}
+#endif
+
+#endif