Implement generic support for Unicode line ends and sub styles in lexer support classes.

author: nyamatongwe <unknown> 2013-01-19 12:33:20 +1100
committer: nyamatongwe <unknown> 2013-01-19 12:33:20 +1100
commit: 5d17740fdedcea321a23ffd3350aa7adbf4c2329 (patch)
tree: 1512465a2bbf066e96eb1ae2d10fdf3fc7dbd42b /lexlib/StyleContext.h
parent: f46c96ecb682ad736453f78f6709fca6c6911886 (diff)
download: scintilla-mirror-5d17740fdedcea321a23ffd3350aa7adbf4c2329.tar.gz
1 files changed, 83 insertions, 14 deletions
diff --git a/lexlib/StyleContext.h b/lexlib/StyleContext.h
index c2d223e3f..9f1818f21 100644
--- a/lexlib/StyleContext.h
+++ b/lexlib/StyleContext.h
@@ -19,6 +19,30 @@ static inline int MakeLowerCase(int ch) {
 		return ch - 'A' + 'a';
 }
 
+// Decode the UTF-8 sequence starting at us; the lead byte alone selects the form and trail bytes are not validated.
+inline int UnicodeCodePoint(const unsigned char *us) {
+	const int lead = us[0];
+	// Lead values below 0xC2 or from 0xF5 upwards are returned unchanged as a single byte value.
+	if (lead < 0xC2 || lead >= 0xF5)
+		return lead;
+	if (lead < 0xE0)	// Two byte form: reads us[1]
+		return ((lead & 0x1F) << 6) + (us[1] & 0x3F);
+	if (lead < 0xF0)	// Three byte form: reads us[1] and us[2]
+		return ((lead & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F);
+	return ((lead & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F);
+}
+
+// Length in bytes of the UTF-8 form of codePoint: 1 below 0x80, 2 below 0x800, 3 below 0x10000, otherwise 4.
+inline int BytesInUnicodeCodePoint(int codePoint) {
+	// Start from one byte and add one for each threshold the value reaches.
+	int bytes = 1;
+	if (codePoint >= 0x80)
+		bytes++;
+	if (codePoint >= 0x800)
+		bytes++;
+	return (codePoint >= 0x10000) ? bytes + 1 : bytes;
+}
+
 // All languages handled so far can treat all characters >= 0x80 as one class
 // which just continues the current token or starts an identifier if in default.
 // DBCS treated specially as the second character can be < 0x80 and hence
@@ -27,22 +51,40 @@ class StyleContext {
 	LexAccessor &styler;
 	unsigned int endPos;
 	StyleContext &operator=(const StyleContext &);
+
 	void GetNextChar(unsigned int pos) {
 		chNext = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1));
-		if (styler.IsLeadByte(static_cast<char>(chNext))) {
-			chNext = chNext << 8;
-			chNext |= static_cast<unsigned char>(styler.SafeGetCharAt(pos+2));
+		if (styler.Encoding() == encUnicode) {	// Widen chNext from the first byte at pos+1 to a decoded code point
+			if (chNext >= 0x80) {
+				unsigned char bytes[4] = { static_cast<unsigned char>(chNext), 0, 0, 0 };
+				for (int trail=1; trail<4; trail++) {	// Fetch up to 3 trail bytes so 4-byte sequences keep their last byte
+					bytes[trail] = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1+trail));
+					if (!((bytes[trail] >= 0x80) && (bytes[trail] < 0xc0))) {	// Not in 0x80..0xBF: stop collecting
+						bytes[trail] = 0;
+						break;
+					}
+				}
+				chNext = UnicodeCodePoint(bytes);
+			}
+		} else if (styler.Encoding() == encDBCS) {	// Combine a lead byte and the byte after it into one value
+			if (styler.IsLeadByte(static_cast<char>(chNext))) {
+				chNext = chNext << 8;
+				chNext |= static_cast<unsigned char>(styler.SafeGetCharAt(pos+2));
+			}
 		}
 		// End of line?
 		// Trigger on CR only (Mac style) or either on LF from CR+LF (Dos/Win)
 		// or on LF alone (Unix). Avoid triggering two times on Dos/Win.
-		atLineEnd = (ch == '\r' && chNext != '\n') ||
-					(ch == '\n') ||
-					(currentPos >= endPos);
+		if (lineStartNext < styler.Length())	// Line end is judged from pos and lineStartNext, not from ch
+			atLineEnd = static_cast<int>(pos) >= (lineStartNext-1);
+		else // Last line
+			atLineEnd = static_cast<int>(pos) >= lineStartNext;
 	}
 
 public:
 	unsigned int currentPos;
+	int currentLine;
+	int lineStartNext;
 	bool atLineStart;
 	bool atLineEnd;
 	int state;
@@ -55,6 +97,8 @@ public:
 		styler(styler_),
 		endPos(startPos + length),
 		currentPos(startPos),
+		currentLine(-1),
+		lineStartNext(-1),
 		atLineEnd(false),
 		state(initStyle & chMask), // Mask off all bits which aren't in the chMask.
 		chPrev(0),
@@ -62,13 +106,22 @@ public:
 		chNext(0) {
 		styler.StartAt(startPos, chMask);
 		styler.StartSegment(startPos);
-		atLineStart = static_cast<unsigned int>(styler.LineStart(styler.GetLine(startPos))) == startPos;
+		currentLine = styler.GetLine(startPos);
+		lineStartNext = styler.LineStart(currentLine+1);
+		atLineStart = static_cast<unsigned int>(styler.LineStart(currentLine)) == startPos;
 		unsigned int pos = currentPos;
 		ch = static_cast<unsigned char>(styler.SafeGetCharAt(pos));
-		if (styler.IsLeadByte(static_cast<char>(ch))) {
-			pos++;
-			ch = ch << 8;
-			ch |= static_cast<unsigned char>(styler.SafeGetCharAt(pos));
+		if (styler.Encoding() == encUnicode) {
+			// Get the current char
+			GetNextChar(pos-1);
+			ch = chNext;
+			pos += BytesInUnicodeCodePoint(ch) - 1;
+		} else if (styler.Encoding() == encDBCS) {
+			if (styler.IsLeadByte(static_cast<char>(ch))) {
+				pos++;
+				ch = ch << 8;
+				ch |= static_cast<unsigned char>(styler.SafeGetCharAt(pos));
+			}
 		}
 		GetNextChar(pos);
 	}
@@ -82,12 +135,28 @@ public:
 	void Forward() {
 		if (currentPos < endPos) {
 			atLineStart = atLineEnd;
+			if (atLineStart) {
+				currentLine++;
+				lineStartNext = styler.LineStart(currentLine+1);
+			}
 			chPrev = ch;
-			currentPos++;
-			if (ch >= 0x100)
+			if (styler.Encoding() == encUnicode) {
+				currentPos += BytesInUnicodeCodePoint(ch);
+			} else if (styler.Encoding() == encDBCS) {
+				currentPos++;
+				if (ch >= 0x100)
+					currentPos++;
+			} else {
 				currentPos++;
+			}
 			ch = chNext;
-			GetNextChar(currentPos + ((ch >= 0x100) ? 1 : 0));
+			if (styler.Encoding() == encUnicode) {
+				GetNextChar(currentPos + BytesInUnicodeCodePoint(ch)-1);
+			} else if (styler.Encoding() == encDBCS) {
+				GetNextChar(currentPos + ((ch >= 0x100) ? 1 : 0));
+			} else {
+				GetNextChar(currentPos);
+			}
 		} else {
 			atLineStart = false;
 			chPrev = ' ';
author	nyamatongwe <unknown>	2013-01-19 12:33:20 +1100
committer	nyamatongwe <unknown>	2013-01-19 12:33:20 +1100
commit	5d17740fdedcea321a23ffd3350aa7adbf4c2329 (patch)
tree	1512465a2bbf066e96eb1ae2d10fdf3fc7dbd42b /lexlib/StyleContext.h
parent	f46c96ecb682ad736453f78f6709fca6c6911886 (diff)
download	scintilla-mirror-5d17740fdedcea321a23ffd3350aa7adbf4c2329.tar.gz