Implement generic support for Unicode line ends and sub styles in lexer support classes.

author: nyamatongwe <unknown> 2013-01-19 12:33:20 +1100
committer: nyamatongwe <unknown> 2013-01-19 12:33:20 +1100
commit: 5d17740fdedcea321a23ffd3350aa7adbf4c2329 (patch)
tree: 1512465a2bbf066e96eb1ae2d10fdf3fc7dbd42b
parent: f46c96ecb682ad736453f78f6709fca6c6911886 (diff)
download: scintilla-mirror-5d17740fdedcea321a23ffd3350aa7adbf4c2329.tar.gz
3 files changed, 260 insertions, 16 deletions
diff --git a/lexlib/LexAccessor.h b/lexlib/LexAccessor.h
index 6458525cc..59ae11346 100644
--- a/lexlib/LexAccessor.h
+++ b/lexlib/LexAccessor.h
@@ -12,6 +12,8 @@
 namespace Scintilla {
 #endif
 
+enum EncodingType { enc8bit, encUnicode, encDBCS };	// Encoding class held by LexAccessor and reported by its Encoding() method.
+
 class LexAccessor {
 private:
 	IDocument *pAccess;
@@ -25,7 +27,7 @@ private:
 	int startPos;
 	int endPos;
 	int codePage;
-	enum { enc8bit, encUnicode, encDBCS } encodingType;
+	enum EncodingType encodingType;
 	int lenDoc;
 	int mask;
 	char styleBuf[bufferSize];
@@ -91,7 +93,9 @@ public:
 	bool IsLeadByte(char ch) {
 		return pAccess->IsDBCSLeadByte(ch);
 	}
-
+	EncodingType Encoding() const {	// Read-only access to encodingType (enc8bit, encUnicode or encDBCS).
+		return encodingType;
+	}
 	bool Match(int pos, const char *s) {
 		for (int i=0; *s; i++) {
 			if (*s != SafeGetCharAt(pos+i))
@@ -109,6 +113,19 @@ public:
 	int LineStart(int line) {
 		return pAccess->LineStart(line);
 	}
+	// Position where the text of line ends, before its line end characters if it has any.
+	int LineEnd(int line) {
+		if (documentVersion >= dvLineEnd)
+			return (static_cast<IDocumentWithLineEnd *>(pAccess))->LineEnd(line);
+		// Old interface means only '\r', '\n' and '\r\n' line ends.
+		const int startNext = pAccess->LineStart(line+1);
+		const char chLineEnd = SafeGetCharAt(startNext-1);
+		if (chLineEnd == '\n' && (SafeGetCharAt(startNext-2) == '\r'))
+			return startNext - 2;
+		if (chLineEnd == '\n' || chLineEnd == '\r')
+			return startNext - 1;
+		return startNext;	// No terminator, presumably an unterminated last line (TODO confirm): keep the final character.
+	}
 	int LevelAt(int line) {
 		return pAccess->GetLevel(line);
 	}
diff --git a/lexlib/StyleContext.h b/lexlib/StyleContext.h
index c2d223e3f..9f1818f21 100644
--- a/lexlib/StyleContext.h
+++ b/lexlib/StyleContext.h
@@ -19,6 +19,30 @@ static inline int MakeLowerCase(int ch) {
 		return ch - 'A' + 'a';
 }
 
+// Decode the UTF-8 sequence starting at us into a code point; trail bytes are masked, not validated.
+// A first byte below 0xC2 or at/above 0xF5 is not a usable lead byte and is returned unchanged.
+inline int UnicodeCodePoint(const unsigned char *us) {
+	const int lead = us[0];
+	if ((lead < 0xC2) || (lead >= 0xF5))
+		return lead;
+	if (lead < 0xE0)	// Two byte sequence
+		return ((lead & 0x1F) << 6) | (us[1] & 0x3F);
+	if (lead < 0xF0)	// Three byte sequence
+		return ((lead & 0xF) << 12) | ((us[1] & 0x3F) << 6) | (us[2] & 0x3F);
+	return ((lead & 0x7) << 18) | ((us[1] & 0x3F) << 12) | ((us[2] & 0x3F) << 6) | (us[3] & 0x3F);
+}
+
+// Number of bytes (1 to 4) in the UTF-8 encoding of codePoint; values below 0x80 give 1.
+inline int BytesInUnicodeCodePoint(int codePoint) {
+	if (codePoint >= 0x10000)
+		return 4;
+	if (codePoint >= 0x800)
+		return 3;
+	if (codePoint >= 0x80)
+		return 2;
+	return 1;
+}
+
 // All languages handled so far can treat all characters >= 0x80 as one class
 // which just continues the current token or starts an identifier if in default.
 // DBCS treated specially as the second character can be < 0x80 and hence
@@ -27,22 +51,40 @@ class StyleContext {
 	LexAccessor &styler;
 	unsigned int endPos;
 	StyleContext &operator=(const StyleContext &);
+	// Load the character that follows pos into chNext (decoded for Unicode and DBCS) and refresh atLineEnd.
 	void GetNextChar(unsigned int pos) {
 		chNext = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1));
-		if (styler.IsLeadByte(static_cast<char>(chNext))) {
-			chNext = chNext << 8;
-			chNext |= static_cast<unsigned char>(styler.SafeGetCharAt(pos+2));
+		if (styler.Encoding() == encUnicode) {
+			if (chNext >= 0x80) {
+				unsigned char bytes[4] = { static_cast<unsigned char>(chNext), 0, 0, 0 };
+				for (int trail=1; trail<4; trail++) {	// Up to 3 trail bytes so 4 byte sequences decode fully
+					bytes[trail] = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1+trail));
+					if (!((bytes[trail] >= 0x80) && (bytes[trail] < 0xc0))) {	// Not a trail byte: sequence is over
+						bytes[trail] = 0;
+						break;
+					}
+				}
+				chNext = UnicodeCodePoint(bytes);
+			}
+		} else if (styler.Encoding() == encDBCS) {
+			if (styler.IsLeadByte(static_cast<char>(chNext))) {	// Pack lead and following byte into one value
+				chNext = chNext << 8;
+				chNext |= static_cast<unsigned char>(styler.SafeGetCharAt(pos+2));
+			}
 		}
 		// End of line?
 		// Trigger on CR only (Mac style) or either on LF from CR+LF (Dos/Win)
 		// or on LF alone (Unix). Avoid triggering two times on Dos/Win.
-		atLineEnd = (ch == '\r' && chNext != '\n') ||
-					(ch == '\n') ||
-					(currentPos >= endPos);
+		if (lineStartNext < styler.Length())	// Decided from the next line's start position, not from ch
+			atLineEnd = static_cast<int>(pos) >= (lineStartNext-1);
+		else // Last line
+			atLineEnd = static_cast<int>(pos) >= lineStartNext;
 	}
 
 public:
 	unsigned int currentPos;
+	int currentLine;
+	int lineStartNext;
 	bool atLineStart;
 	bool atLineEnd;
 	int state;
@@ -55,6 +97,8 @@ public:
 		styler(styler_),
 		endPos(startPos + length),
 		currentPos(startPos),
+		currentLine(-1),
+		lineStartNext(-1),
 		atLineEnd(false),
 		state(initStyle & chMask), // Mask off all bits which aren't in the chMask.
 		chPrev(0),
@@ -62,13 +106,22 @@ public:
 		chNext(0) {
 		styler.StartAt(startPos, chMask);
 		styler.StartSegment(startPos);
-		atLineStart = static_cast<unsigned int>(styler.LineStart(styler.GetLine(startPos))) == startPos;
+		currentLine = styler.GetLine(startPos);
+		lineStartNext = styler.LineStart(currentLine+1);
+		atLineStart = static_cast<unsigned int>(styler.LineStart(currentLine)) == startPos;
 		unsigned int pos = currentPos;
 		ch = static_cast<unsigned char>(styler.SafeGetCharAt(pos));
-		if (styler.IsLeadByte(static_cast<char>(ch))) {
-			pos++;
-			ch = ch << 8;
-			ch |= static_cast<unsigned char>(styler.SafeGetCharAt(pos));
+		if (styler.Encoding() == encUnicode) {
+			// Get the current char
+			GetNextChar(pos-1);
+			ch = chNext;
+			pos += BytesInUnicodeCodePoint(ch) - 1;
+		} else if (styler.Encoding() == encDBCS) {
+			if (styler.IsLeadByte(static_cast<char>(ch))) {
+				pos++;
+				ch = ch << 8;
+				ch |= static_cast<unsigned char>(styler.SafeGetCharAt(pos));
+			}
 		}
 		GetNextChar(pos);
 	}
@@ -82,12 +135,28 @@ public:
 	void Forward() {
 		if (currentPos < endPos) {
 			atLineStart = atLineEnd;
+			if (atLineStart) {
+				currentLine++;
+				lineStartNext = styler.LineStart(currentLine+1);
+			}
 			chPrev = ch;
-			currentPos++;
-			if (ch >= 0x100)
+			if (styler.Encoding() == encUnicode) {
+				currentPos += BytesInUnicodeCodePoint(ch);
+			} else if (styler.Encoding() == encDBCS) {
+				currentPos++;
+				if (ch >= 0x100)
+					currentPos++;
+			} else {
 				currentPos++;
+			}
 			ch = chNext;
-			GetNextChar(currentPos + ((ch >= 0x100) ? 1 : 0));
+			if (styler.Encoding() == encUnicode) {
+				GetNextChar(currentPos + BytesInUnicodeCodePoint(ch)-1);
+			} else if (styler.Encoding() == encDBCS) {
+				GetNextChar(currentPos + ((ch >= 0x100) ? 1 : 0));
+			} else {
+				GetNextChar(currentPos);
+			}
 		} else {
 			atLineStart = false;
 			chPrev = ' ';
diff --git a/lexlib/SubStyles.h b/lexlib/SubStyles.h
new file mode 100644
index 000000000..7dc7804ef
--- /dev/null
+++ b/lexlib/SubStyles.h
@@ -0,0 +1,158 @@
+// Scintilla source code edit control
+/** @file SubStyles.h
+ ** Manage substyles for a lexer.
+ **/
+// Copyright 2012 by Neil Hodgson <neilh@scintilla.org>
+// The License.txt file describes the conditions under which this software may be distributed.
+
+#ifndef SUBSTYLES_H
+#define SUBSTYLES_H
+
+#ifdef SCI_NAMESPACE
+namespace Scintilla {
+#endif
+
+// Maps identifier words to sub style numbers and records one range [firstStyle, firstStyle+lenStyles).
+class WordClassifier {
+	int firstStyle;	// First style number of the range
+	int lenStyles;	// Number of styles in the range
+	std::map<std::string, int> wordToStyle;	// Word -> style, filled by SetIdentifiers
+
+public:
+	WordClassifier() : firstStyle(0), lenStyles(0) {	// Starts with an empty range and no words
+	}
+	// Adopt the range of lenStyles_ styles starting at firstStyle_ and forget every word.
+	void Allocate(int firstStyle_, int lenStyles_) {
+		firstStyle = firstStyle_;
+		lenStyles = lenStyles_;
+		wordToStyle.clear();
+	}
+	// First style number of the range.
+	int Start() const {
+		return firstStyle;
+	}
+	// Number of styles in the range.
+	int Length() const {
+		return lenStyles;
+	}
+	// Return to the initial state: empty range, no words.
+	void Clear() {
+		firstStyle = 0;
+		lenStyles = 0;
+		wordToStyle.clear();
+	}
+	// Style registered for s, or -1 when s is not a registered word.
+	int ValueFor(const std::string &s) const {
+		std::map<std::string, int>::const_iterator it = wordToStyle.find(s);
+		if (it != wordToStyle.end())
+			return it->second;
+		else
+			return -1;
+	}
+	// True when style lies inside the range.
+	bool IncludesStyle(int style) const {
+		return (style >= firstStyle) && (style < (firstStyle + lenStyles));
+	}
+	// Add each word of identifiers (separated by space, tab, CR or LF) with style; earlier words are kept.
+	void SetIdentifiers(int style, const char *identifiers) {
+		while (*identifiers) {
+			const char *cpSpace = identifiers;
+			while (*cpSpace && !(*cpSpace == ' ' || *cpSpace == '\t' || *cpSpace == '\r' || *cpSpace == '\n'))
+				cpSpace++;
+			if (cpSpace > identifiers)	// Repeated separators must not register an empty word
+				wordToStyle[std::string(identifiers, cpSpace - identifiers)] = style;
+			identifiers = cpSpace;
+			if (*identifiers)
+				identifiers++;
+		}
+	}
+};
+
+// Hands out ranges of sub styles from a pool of stylesAvailable styles starting at styleFirst,
+// keeping one WordClassifier for each base style listed in the NUL terminated baseStyles_.
+class SubStyles {
+	int classifications;	// Number of base styles, equal to classifiers.size()
+	const char *baseStyles;	// NUL terminated list of base styles; the pointer is stored, not copied
+	int styleFirst;	// First style number of the pool
+	int stylesAvailable;	// Size of the pool
+	int secondaryDistance;	// Stored by the constructor; not read by any method of this class
+	int allocated;	// Number of pool styles handed out so far
+	std::vector<WordClassifier> classifiers;	// classifiers[b] serves baseStyles[b]
+	// Index of baseStyle within baseStyles, or -1 when it is not listed.
+	int BlockFromBaseStyle(int baseStyle) const {
+		for (int b=0; b < classifications; b++) {
+			if (baseStyle == baseStyles[b])
+				return b;
+		}
+		return -1;
+	}
+	// Index of the classifier whose range contains style, or -1 when none does.
+	int BlockFromStyle(int style) const {
+		int b = 0;
+		for (std::vector<WordClassifier>::const_iterator it=classifiers.begin(); it != classifiers.end(); ++it) {
+			if (it->IncludesStyle(style))
+				return b;
+			b++;
+		}
+		return -1;
+	}
+
+public:
+	// Creates one empty classifier per character of baseStyles_; nothing is allocated yet.
+	SubStyles(const char *baseStyles_, int styleFirst_, int stylesAvailable_, int secondaryDistance_) :
+		classifications(0),
+		baseStyles(baseStyles_),
+		styleFirst(styleFirst_),
+		stylesAvailable(stylesAvailable_),
+		secondaryDistance(secondaryDistance_),
+		allocated(0) {
+		while (baseStyles[classifications]) {
+			classifications++;
+			classifiers.push_back(WordClassifier());
+		}
+	}
+	// Give styleBase the next numberStyles pool styles; returns their first number, or -1 if unknown base or pool too small.
+	int Allocate(int styleBase, int numberStyles) {
+		const int block = BlockFromBaseStyle(styleBase);
+		if ((block < 0) || ((allocated + numberStyles) > stylesAvailable))
+			return -1;
+		const int startBlock = styleFirst + allocated;
+		allocated += numberStyles;
+		classifiers[block].Allocate(startBlock, numberStyles);
+		return startBlock;
+	}
+	// First sub style of styleBase, or -1 when styleBase is not a listed base style.
+	int Start(int styleBase) {
+		int block = BlockFromBaseStyle(styleBase);
+		return (block >= 0) ? classifiers[block].Start() : -1;
+	}
+	// Number of sub styles of styleBase, or 0 when styleBase is not a listed base style.
+	int Length(int styleBase) {
+		int block = BlockFromBaseStyle(styleBase);
+		return (block >= 0) ? classifiers[block].Length() : 0;
+	}
+	// Register the words of identifiers for sub style style; ignored when no range contains style.
+	void SetIdentifiers(int style, const char *identifiers) {
+		int block = BlockFromStyle(style);
+		if (block >= 0)
+			classifiers[block].SetIdentifiers(style, identifiers);
+	}
+	// Return every pool style and clear all classifiers.
+	void Free() {
+		allocated = 0;
+		for (std::vector<WordClassifier>::iterator it=classifiers.begin(); it != classifiers.end(); ++it)
+			it->Clear();
+	}
+	// Classifier of baseStyle; an unlisted base style gets an empty classifier instead of indexing classifiers[-1].
+	const WordClassifier &Classifier(int baseStyle) const {
+		static const WordClassifier classifierNone;
+		const int block = BlockFromBaseStyle(baseStyle);
+		return (block >= 0) ? classifiers[block] : classifierNone;
+	}
+};
+
+#ifdef SCI_NAMESPACE
+}
+#endif
+
+#endif
author	nyamatongwe <unknown>	2013-01-19 12:33:20 +1100
committer	nyamatongwe <unknown>	2013-01-19 12:33:20 +1100
commit	5d17740fdedcea321a23ffd3350aa7adbf4c2329 (patch)
tree	1512465a2bbf066e96eb1ae2d10fdf3fc7dbd42b
parent	f46c96ecb682ad736453f78f6709fca6c6911886 (diff)
download	scintilla-mirror-5d17740fdedcea321a23ffd3350aa7adbf4c2329.tar.gz