Use standardised and more stringent UTF8Classify for determining validity and width of UTF-8 characters.

Optimize to make up for cost of UTF8Classify. Drop functions now provided by UniConversion. Use UTF8IsAscii function instead of test against literal.
author: nyamatongwe <devnull@localhost> 2012-05-26 13:36:25 +1000
committer: nyamatongwe <devnull@localhost> 2012-05-26 13:36:25 +1000
commit: 645d0f2b7c5ebad6d757e35cd257bcbf4a118f68 (patch)
tree: 2dc8da26482b7830d196aa0de0b92e329950efe4
parent: e1370f834348a12fea75ad883a0d801dfd1b9d8d (diff)
download: scintilla-mirror-645d0f2b7c5ebad6d757e35cd257bcbf4a118f68.tar.gz
1 files changed, 61 insertions, 58 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 4e9366064..244e96e4f 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -465,51 +465,29 @@ int Document::LenChar(int pos) {
 	}
 }
 
-static inline bool IsTrailByte(int ch) {
-	return (ch >= 0x80) && (ch < 0xc0);
-}
-
-static int BytesFromLead(int leadByte) {
-	if (leadByte > 0xF4) {
-		// Characters longer than 4 bytes not possible in current UTF-8
-		return 0;
-	} else if (leadByte >= 0xF0) {
-		return 4;
-	} else if (leadByte >= 0xE0) {
-		return 3;
-	} else if (leadByte >= 0xC2) {
-		return 2;
-	}
-	return 0;
-}
-
 bool Document::InGoodUTF8(int pos, int &start, int &end) const {
-	int lead = pos;
-	while ((lead>0) && (pos-lead < 4) && IsTrailByte(static_cast<unsigned char>(cb.CharAt(lead-1))))
-		lead--;
-	start = 0;
-	if (lead > 0) {
-		start = lead-1;
-	}
-	int leadByte = static_cast<unsigned char>(cb.CharAt(start));
-	int bytes = BytesFromLead(leadByte);
-	if (bytes == 0) {
+	int trail = pos;
+	while ((trail>0) && (pos-trail < UTF8MaxBytes) && UTF8IsTrailByte(static_cast<unsigned char>(cb.CharAt(trail-1))))
+		trail--;
+	start = (trail > 0) ? trail-1 : trail;
+
+	const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(start));
+	const int widthCharBytes = UTF8BytesOfLead[leadByte];
+	if (widthCharBytes == 1) {
 		return false;
 	} else {
-		int trailBytes = bytes - 1;
-		int len = pos - lead + 1;
+		int trailBytes = widthCharBytes - 1;
+		int len = pos - start;
 		if (len > trailBytes)
 			// pos too far from lead
 			return false;
-		// Check that there are enough trails for this lead
-		int trail = pos + 1;
-		while ((trail-lead<trailBytes) && (trail < Length())) {
-			if (!IsTrailByte(static_cast<unsigned char>(cb.CharAt(trail)))) {
-				return false;
-			}
-			trail++;
-		}
-		end = start + bytes;
+		char charBytes[UTF8MaxBytes] = {static_cast<char>(leadByte),0,0,0};
+		for (int b=1; b<widthCharBytes && ((start+b) < Length()); b++)
+			charBytes[b] = cb.CharAt(static_cast<int>(start+b));
+		int utf8status = UTF8Classify(reinterpret_cast<const unsigned char *>(charBytes), widthCharBytes);
+		if (utf8status & UTF8MaskInvalid)
+			return false;
+		end = start + widthCharBytes;
 		return true;
 	}
 }
@@ -538,14 +516,18 @@ int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) {
 	if (dbcsCodePage) {
 		if (SC_CP_UTF8 == dbcsCodePage) {
 			unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos));
-			int startUTF = pos;
-			int endUTF = pos;
-			if (IsTrailByte(ch) && InGoodUTF8(pos, startUTF, endUTF)) {
-				// ch is a trail byte within a UTF-8 character
-				if (moveDir > 0)
-					pos = endUTF;
-				else
-					pos = startUTF;
+			// If ch is not a trail byte then pos is valid intercharacter position
+			if (UTF8IsTrailByte(ch)) {
+				int startUTF = pos;
+				int endUTF = pos;
+				if (InGoodUTF8(pos, startUTF, endUTF)) {
+					// ch is a trail byte within a UTF-8 character
+					if (moveDir > 0)
+						pos = endUTF;
+					else
+						pos = startUTF;
+				}
+				// Else invalid UTF-8 so return position of isolated trail byte
 			}
 		} else {
 			// Anchor DBCS calculations at start of line because start of line can
@@ -592,16 +574,37 @@ int Document::NextPosition(int pos, int moveDir) const {
 
 	if (dbcsCodePage) {
 		if (SC_CP_UTF8 == dbcsCodePage) {
-			pos += increment;
-			unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos));
-			int startUTF = pos;
-			int endUTF = pos;
-			if (IsTrailByte(ch) && InGoodUTF8(pos, startUTF, endUTF)) {
-				// ch is a trail byte within a UTF-8 character
-				if (moveDir > 0)
-					pos = endUTF;
-				else
-					pos = startUTF;
+			if (increment == 1) {
+				// Simple forward movement case so can avoid some checks
+				const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(pos));
+				if (UTF8IsAscii(leadByte)) {
+					// Single byte character or invalid
+					pos++;
+				} else {
+					const int widthCharBytes = UTF8BytesOfLead[leadByte];
+					char charBytes[UTF8MaxBytes] = {static_cast<char>(leadByte),0,0,0};
+					for (int b=1; b<widthCharBytes; b++)
+						charBytes[b] = cb.CharAt(static_cast<int>(pos+b));
+					int utf8status = UTF8Classify(reinterpret_cast<const unsigned char *>(charBytes), widthCharBytes);
+					if (utf8status & UTF8MaskInvalid)
+						pos++;
+					else
+						pos += utf8status & UTF8MaskWidth;
+				}
+			} else {
+				// Examine byte before position
+				pos--;
+				unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos));
+				// If ch is not a trail byte then pos is valid intercharacter position
+				if (UTF8IsTrailByte(ch)) {
+					// If ch is a trail byte in a valid UTF-8 character then return start of character
+					int startUTF = pos;
+					int endUTF = pos;
+					if (InGoodUTF8(pos, startUTF, endUTF)) {
+						pos = startUTF;
+					}
+					// Else invalid UTF-8 so return position of isolated trail byte
+				}
 			}
 		} else {
 			if (moveDir > 0) {
@@ -1246,7 +1249,7 @@ int Document::ParaDown(int pos) {
 }
 
 CharClassify::cc Document::WordCharClass(unsigned char ch) {
-	if ((SC_CP_UTF8 == dbcsCodePage) && (ch >= 0x80))
+	if ((SC_CP_UTF8 == dbcsCodePage) && (!UTF8IsAscii(ch)))
 		return CharClassify::ccWord;
 	return charClass.GetClass(ch);
 }
author	nyamatongwe <devnull@localhost>	2012-05-26 13:36:25 +1000
committer	nyamatongwe <devnull@localhost>	2012-05-26 13:36:25 +1000
commit	645d0f2b7c5ebad6d757e35cd257bcbf4a118f68 (patch)
tree	2dc8da26482b7830d196aa0de0b92e329950efe4
parent	e1370f834348a12fea75ad883a0d801dfd1b9d8d (diff)
download	scintilla-mirror-645d0f2b7c5ebad6d757e35cd257bcbf4a118f68.tar.gz