diff options
author | nyamatongwe <unknown> | 2013-01-19 12:33:20 +1100 |
---|---|---|
committer | nyamatongwe <unknown> | 2013-01-19 12:33:20 +1100 |
commit | 5d17740fdedcea321a23ffd3350aa7adbf4c2329 (patch) | |
tree | 1512465a2bbf066e96eb1ae2d10fdf3fc7dbd42b | |
parent | f46c96ecb682ad736453f78f6709fca6c6911886 (diff) | |
download | scintilla-mirror-5d17740fdedcea321a23ffd3350aa7adbf4c2329.tar.gz |
Implement generic support for Unicode line ends and sub styles in lexer support classes.
-rw-r--r-- | lexlib/LexAccessor.h | 21 | ||||
-rw-r--r-- | lexlib/StyleContext.h | 97 | ||||
-rw-r--r-- | lexlib/SubStyles.h | 158 |
3 files changed, 260 insertions, 16 deletions
diff --git a/lexlib/LexAccessor.h b/lexlib/LexAccessor.h index 6458525cc..59ae11346 100644 --- a/lexlib/LexAccessor.h +++ b/lexlib/LexAccessor.h @@ -12,6 +12,8 @@ namespace Scintilla { #endif +enum EncodingType { enc8bit, encUnicode, encDBCS }; + class LexAccessor { private: IDocument *pAccess; @@ -25,7 +27,7 @@ private: int startPos; int endPos; int codePage; - enum { enc8bit, encUnicode, encDBCS } encodingType; + enum EncodingType encodingType; int lenDoc; int mask; char styleBuf[bufferSize]; @@ -91,7 +93,9 @@ public: bool IsLeadByte(char ch) { return pAccess->IsDBCSLeadByte(ch); } - + EncodingType Encoding() const { + return encodingType; + } bool Match(int pos, const char *s) { for (int i=0; *s; i++) { if (*s != SafeGetCharAt(pos+i)) @@ -109,6 +113,19 @@ public: int LineStart(int line) { return pAccess->LineStart(line); } + int LineEnd(int line) { + if (documentVersion >= dvLineEnd) { + return (static_cast<IDocumentWithLineEnd *>(pAccess))->LineEnd(line); + } else { + // Old interface means only '\r', '\n' and '\r\n' line ends. + int startNext = pAccess->LineStart(line+1); + char chLineEnd = SafeGetCharAt(startNext-1); + if (chLineEnd == '\n' && (SafeGetCharAt(startNext-2) == '\r')) + return startNext - 2; + else + return startNext - 1; + } + } int LevelAt(int line) { return pAccess->GetLevel(line); } diff --git a/lexlib/StyleContext.h b/lexlib/StyleContext.h index c2d223e3f..9f1818f21 100644 --- a/lexlib/StyleContext.h +++ b/lexlib/StyleContext.h @@ -19,6 +19,30 @@ static inline int MakeLowerCase(int ch) { return ch - 'A' + 'a'; } +inline int UnicodeCodePoint(const unsigned char *us) { + if (us[0] < 0xC2) { + return us[0]; + } else if (us[0] < 0xE0) { + return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F); + } else if (us[0] < 0xF0) { + return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F); + } else if (us[0] < 0xF5) { + return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F); + } + return us[0]; +} + +inline int BytesInUnicodeCodePoint(int codePoint) { + if (codePoint < 0x80) + return 1; + else if (codePoint < 0x800) + return 2; + else if (codePoint < 0x10000) + return 3; + else + return 4; +} + // All languages handled so far can treat all characters >= 0x80 as one class // which just continues the current token or starts an identifier if in default. // DBCS treated specially as the second character can be < 0x80 and hence @@ -27,22 +51,40 @@ class StyleContext { LexAccessor &styler; unsigned int endPos; StyleContext &operator=(const StyleContext &); + void GetNextChar(unsigned int pos) { chNext = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1)); - if (styler.IsLeadByte(static_cast<char>(chNext))) { - chNext = chNext << 8; - chNext |= static_cast<unsigned char>(styler.SafeGetCharAt(pos+2)); + if (styler.Encoding() == encUnicode) { + if (chNext >= 0x80) { + unsigned char bytes[4] = { static_cast<unsigned char>(chNext), 0, 0, 0 }; + for (int trail=1; trail<3; trail++) { + bytes[trail] = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1+trail)); + if (!((bytes[trail] >= 0x80) && (bytes[trail] < 0xc0))) { + bytes[trail] = 0; + break; + } + } + chNext = UnicodeCodePoint(bytes); + } + } else if (styler.Encoding() == encDBCS) { + if (styler.IsLeadByte(static_cast<char>(chNext))) { + chNext = chNext << 8; + chNext |= static_cast<unsigned char>(styler.SafeGetCharAt(pos+2)); + } } // End of line? // Trigger on CR only (Mac style) or either on LF from CR+LF (Dos/Win) // or on LF alone (Unix). Avoid triggering two times on Dos/Win. - atLineEnd = (ch == '\r' && chNext != '\n') || - (ch == '\n') || - (currentPos >= endPos); + if (lineStartNext < styler.Length()) + atLineEnd = static_cast<int>(pos) >= (lineStartNext-1); + else // Last line + atLineEnd = static_cast<int>(pos) >= lineStartNext; } public: unsigned int currentPos; + int currentLine; + int lineStartNext; bool atLineStart; bool atLineEnd; int state; @@ -55,6 +97,8 @@ public: styler(styler_), endPos(startPos + length), currentPos(startPos), + currentLine(-1), + lineStartNext(-1), atLineEnd(false), state(initStyle & chMask), // Mask off all bits which aren't in the chMask. chPrev(0), @@ -62,13 +106,22 @@ public: chNext(0) { styler.StartAt(startPos, chMask); styler.StartSegment(startPos); - atLineStart = static_cast<unsigned int>(styler.LineStart(styler.GetLine(startPos))) == startPos; + currentLine = styler.GetLine(startPos); + lineStartNext = styler.LineStart(currentLine+1); + atLineStart = static_cast<unsigned int>(styler.LineStart(currentLine)) == startPos; unsigned int pos = currentPos; ch = static_cast<unsigned char>(styler.SafeGetCharAt(pos)); - if (styler.IsLeadByte(static_cast<char>(ch))) { - pos++; - ch = ch << 8; - ch |= static_cast<unsigned char>(styler.SafeGetCharAt(pos)); + if (styler.Encoding() == encUnicode) { + // Get the current char + GetNextChar(pos-1); + ch = chNext; + pos += BytesInUnicodeCodePoint(ch) - 1; + } else if (styler.Encoding() == encDBCS) { + if (styler.IsLeadByte(static_cast<char>(ch))) { + pos++; + ch = ch << 8; + ch |= static_cast<unsigned char>(styler.SafeGetCharAt(pos)); + } } GetNextChar(pos); } @@ -82,12 +135,28 @@ public: void Forward() { if (currentPos < endPos) { atLineStart = atLineEnd; + if (atLineStart) { + currentLine++; + lineStartNext = styler.LineStart(currentLine+1); + } chPrev = ch; - currentPos++; - if (ch >= 0x100) + if (styler.Encoding() == encUnicode) { + currentPos += BytesInUnicodeCodePoint(ch); + } else if (styler.Encoding() == encDBCS) { + currentPos++; + if (ch >= 0x100) + currentPos++; + } else { currentPos++; + } ch = chNext; - GetNextChar(currentPos + ((ch >= 0x100) ? 1 : 0)); + if (styler.Encoding() == encUnicode) { + GetNextChar(currentPos + BytesInUnicodeCodePoint(ch)-1); + } else if (styler.Encoding() == encDBCS) { + GetNextChar(currentPos + ((ch >= 0x100) ? 1 : 0)); + } else { + GetNextChar(currentPos); + } } else { atLineStart = false; chPrev = ' '; diff --git a/lexlib/SubStyles.h b/lexlib/SubStyles.h new file mode 100644 index 000000000..7dc7804ef --- /dev/null +++ b/lexlib/SubStyles.h @@ -0,0 +1,158 @@ +// Scintilla source code edit control +/** @file SubStyles.h + ** Manage substyles for a lexer. + **/ +// Copyright 2012 by Neil Hodgson <neilh@scintilla.org> +// The License.txt file describes the conditions under which this software may be distributed. + +#ifndef SUBSTYLES_H +#define SUBSTYLES_H + +#ifdef SCI_NAMESPACE +namespace Scintilla { +#endif + +class WordClassifier { + int firstStyle; + int lenStyles; + std::map<std::string, int> wordToStyle; + +public: + + WordClassifier() : firstStyle(0), lenStyles(0) { + } + + void Allocate(int firstStyle_, int lenStyles_) { + firstStyle = firstStyle_; + lenStyles = lenStyles_; + wordToStyle.clear(); + } + + int Start() const { + return firstStyle; + } + + int Length() const { + return lenStyles; + } + + void Clear() { + firstStyle = 0; + lenStyles = 0; + wordToStyle.clear(); + } + + int ValueFor(const std::string &s) const { + std::map<std::string, int>::const_iterator it = wordToStyle.find(s); + if (it != wordToStyle.end()) + return it->second; + else + return -1; + } + + bool IncludesStyle(int style) const { + return (style >= firstStyle) && (style < (firstStyle + lenStyles)); + } + + void SetIdentifiers(int style, const char *identifiers) { + while (*identifiers) { + const char *cpSpace = identifiers; + while (*cpSpace && *cpSpace != ' ') + cpSpace++; + std::string word(identifiers, cpSpace - identifiers); + wordToStyle[word] = style; + identifiers = cpSpace; + if (*identifiers) + identifiers++; + } + } +}; + +class SubStyles { + int classifications; + const char *baseStyles; + int styleFirst; + int stylesAvailable; + int secondaryDistance; + int allocated; + std::vector<WordClassifier> classifiers; + + int BlockFromBaseStyle(int baseStyle) const { + for (int b=0; b < classifications; b++) { + if (baseStyle == baseStyles[b]) + return b; + } + return -1; + } + + int BlockFromStyle(int style) const { + int b = 0; + for (std::vector<WordClassifier>::const_iterator it=classifiers.begin(); it != classifiers.end(); ++it) { + if (it->IncludesStyle(style)) + return b; + b++; + } + return -1; + } + +public: + + SubStyles(const char *baseStyles_, int styleFirst_, int stylesAvailable_, int secondaryDistance_) : + classifications(0), + baseStyles(baseStyles_), + styleFirst(styleFirst_), + stylesAvailable(stylesAvailable_), + secondaryDistance(secondaryDistance_), + allocated(0) { + while (baseStyles[classifications]) { + classifications++; + classifiers.push_back(WordClassifier()); + } + } + + int Allocate(int styleBase, int numberStyles) { + int block = BlockFromBaseStyle(styleBase); + if (block >= 0) { + if ((allocated + numberStyles) > stylesAvailable) + return -1; + int startBlock = styleFirst + allocated; + allocated += numberStyles; + classifiers[block].Allocate(startBlock, numberStyles); + return startBlock; + } else { + return -1; + } + } + + int Start(int styleBase) { + int block = BlockFromBaseStyle(styleBase); + return (block >= 0) ? classifiers[block].Start() : -1; + } + + int Length(int styleBase) { + int block = BlockFromBaseStyle(styleBase); + return (block >= 0) ? classifiers[block].Length() : 0; + } + + void SetIdentifiers(int style, const char *identifiers) { + int block = BlockFromStyle(style); + if (block >= 0) + classifiers[block].SetIdentifiers(style, identifiers); + } + + void Free() { + allocated = 0; + for (std::vector<WordClassifier>::iterator it=classifiers.begin(); it != classifiers.end(); ++it) + it->Clear(); + } + + const WordClassifier &Classifier(int baseStyle) const { + return classifiers[BlockFromBaseStyle(baseStyle)]; + } +}; + +#ifdef SCI_NAMESPACE +} +#endif + +#endif |