Move classification of UTF-8 byte sequences into UniConversion module.

author: nyamatongwe <devnull@localhost> 2012-05-26 12:08:06 +1000
committer: nyamatongwe <devnull@localhost> 2012-05-26 12:08:06 +1000
commit: a6598d01d00e24f8c0ee2f4e9cc9036dd447c15f (patch)
tree: 31d920448268b4398b70b5e3adb54b5fa5900a35
parent: 9be362c7c8a4d2bac76892bc48dd0c54522406fe (diff)
download: scintilla-mirror-a6598d01d00e24f8c0ee2f4e9cc9036dd447c15f.tar.gz
3 files changed, 102 insertions, 86 deletions
diff --git a/src/Editor.cxx b/src/Editor.cxx
index b46f8a6b7..3b3e1a99e 100644
--- a/src/Editor.cxx
+++ b/src/Editor.cxx
@@ -36,6 +36,7 @@
 #include "CharClassify.h"
 #include "Decoration.h"
 #include "Document.h"
+#include "UniConversion.h"
 #include "Selection.h"
 #include "PositionCache.h"
 #include "Editor.h"
@@ -2051,100 +2052,18 @@ LineLayout *Editor::RetrieveLineLayout(int lineNumber) {
 	        LinesOnScreen() + 1, pdoc->LinesTotal());
 }
 
-static bool GoodTrailByte(int v) {
-	return (v >= 0x80) && (v < 0xc0);
-}
-
 bool BadUTF(const char *s, int len, int &trailBytes) {
 	// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 	if (trailBytes) {
 		trailBytes--;
 		return false;
 	}
-	const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
-	if (*us < 0x80) {
-		// Single bytes easy
-		return false;
-	} else if (*us > 0xF4) {
-		// Characters longer than 4 bytes not possible in current UTF-8
-		return true;
-	} else if (*us >= 0xF0) {
-		// 4 bytes
-		if (len < 4)
-			return true;
-		if (GoodTrailByte(us[1]) && GoodTrailByte(us[2]) && GoodTrailByte(us[3])) {
-			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
-				// *FFFE or *FFFF non-character
-				return true;
-			}
-			if (*us == 0xf4) {
-				// Check if encoding a value beyond the last Unicode character 10FFFF
-				if (us[1] > 0x8f) {
-					return true;
-				} else if (us[1] == 0x8f) {
-					if (us[2] > 0xbf) {
-						return true;
-					} else if (us[2] == 0xbf) {
-						if (us[3] > 0xbf) {
-							return true;
-						}
-					}
-				}
-			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
-				// Overlong
-				return true;
-			}
-			trailBytes = 3;
-			return false;
-		} else {
-			return true;
-		}
-	} else if (*us >= 0xE0) {
-		// 3 bytes
-		if (len < 3)
-			return true;
-		if (GoodTrailByte(us[1]) && GoodTrailByte(us[2])) {
-			if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
-				// Overlong
-				return true;
-			}
-			if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
-				// Surrogate
-				return true;
-			}
-			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
-				// U+FFFE
-				return true;
-			}
-			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
-				// U+FFFF
-				return true;
-			}
-			if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
-				// U+FDD0 .. U+FDEF
-				return true;
-			}
-			trailBytes = 2;
-			return false;
-		} else {
-			return true;
-		}
-	} else if (*us >= 0xC2) {
-		// 2 bytes
-		if (len < 2)
-			return true;
-		if (GoodTrailByte(us[1])) {
-			trailBytes = 1;
-			return false;
-		} else {
-			return true;
-		}
-	} else if (*us >= 0xC0) {
-		// Overlong encoding
+	int utf8status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), len);
+	if (utf8status & UTF8MaskInvalid) {
 		return true;
 	} else {
-		// Trail byte
-		return true;
+		trailBytes = (utf8status & UTF8MaskWidth) - 1;
+		return false;
 	}
 }
 
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index 2ef75840e..e1ad99563 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -129,3 +129,94 @@ unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsig
 	}
 	return ui;
 }
+
+// Classify the UTF-8 sequence starting at us; len is the number of bytes available (*us is read before len is checked).
+// The result holds the width in bytes in the UTF8MaskWidth bits, ORed with UTF8MaskInvalid when the sequence is invalid.
+// Most invalid sequences return a width of 1 so are treated as isolated bytes but
+// the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
+// reasonably treated as code points in some circumstances. They will, however,
+// not have associated glyphs.
+int UTF8Classify(const unsigned char *us, int len) {
+	// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
+	if (*us < 0x80) {
+		// Single bytes easy: ASCII is always valid with width 1
+		return 1;
+	} else if (*us > 0xf4) {
+		// Lead bytes f5 .. ff: characters longer than 4 bytes (or beyond 10FFFF) not possible in current UTF-8
+		return UTF8MaskInvalid | 1;
+	} else if (*us >= 0xf0) {
+		// 4 bytes: lead byte f0 .. f4
+		if (len < 4) // Truncated sequence: not enough bytes left for the trail bytes
+			return UTF8MaskInvalid | 1;
+		if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
+			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
+				// *FFFE or *FFFF non-character. NOTE(review): tested before the range and overlong checks below, so f4 9f bf bf and f0 8f bf bf also return width 4 here
+				return UTF8MaskInvalid | 4;
+			}
+			if (*us == 0xf4) {
+				// Check if encoding a value beyond the last Unicode character 10FFFF
+				if (us[1] > 0x8f) {
+					return UTF8MaskInvalid | 1;
+				} else if (us[1] == 0x8f) {
+					if (us[2] > 0xbf) { // Never true here: us[2] passed UTF8IsTrailByte so is < 0xc0
+						return UTF8MaskInvalid | 1;
+					} else if (us[2] == 0xbf) {
+						if (us[3] > 0xbf) { // Never true here: us[3] passed UTF8IsTrailByte so is < 0xc0
+							return UTF8MaskInvalid | 1;
+						}
+					}
+				}
+			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
+				// Overlong: f0 followed by 80 .. 8f encodes a value below 0x10000
+				return UTF8MaskInvalid | 1;
+			}
+			return 4;
+		} else {
+			return UTF8MaskInvalid | 1; // At least one of the three following bytes is not a trail byte
+		}
+	} else if (*us >= 0xe0) {
+		// 3 bytes: lead byte e0 .. ef
+		if (len < 3) // Truncated sequence: not enough bytes left for the trail bytes
+			return UTF8MaskInvalid | 1;
+		if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
+			if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
+				// Overlong: e0 followed by 80 .. 9f encodes a value below 0x800
+				return UTF8MaskInvalid | 1;
+			}
+			if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
+				// Surrogate: ed followed by a0 .. bf encodes U+D800 .. U+DFFF
+				return UTF8MaskInvalid | 1;
+			}
+			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
+				// U+FFFE non-character - 3 bytes long
+				return UTF8MaskInvalid | 3;
+			}
+			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
+				// U+FFFF non-character - 3 bytes long
+				return UTF8MaskInvalid | 3;
+			}
+			if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
+				// U+FDD0 .. U+FDEF non-characters - 3 bytes long
+				return UTF8MaskInvalid | 3;
+			}
+			return 3;
+		} else {
+			return UTF8MaskInvalid | 1; // At least one of the two following bytes is not a trail byte
+		}
+	} else if (*us >= 0xc2) {
+		// 2 bytes: lead byte c2 .. df
+		if (len < 2) // Truncated sequence: no byte left for the trail byte
+			return UTF8MaskInvalid | 1;
+		if (UTF8IsTrailByte(us[1])) {
+			return 2;
+		} else {
+			return UTF8MaskInvalid | 1;
+		}
+	} else if (*us >= 0xc0) {
+		// Overlong encoding: lead bytes c0 and c1 are always rejected
+		return UTF8MaskInvalid | 1;
+	} else {
+		// Trail byte (80 .. bf) appearing where a lead byte is expected
+		return UTF8MaskInvalid | 1;
+	}
+}
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 2de2ef3fe..6793221cf 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -11,3 +11,9 @@ unsigned int UTF8CharLength(unsigned char ch);
 unsigned int UTF16Length(const char *s, unsigned int len);
 unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen);
 
+inline bool UTF8IsTrailByte(int ch) {
+	return (ch >= 0x80) && (ch < 0xc0); // True for 80 .. bf, the bytes of the form 10xxxxxx
+}
+
+enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 }; // Fields of the UTF8Classify result: width in bytes, and the invalid flag
+int UTF8Classify(const unsigned char *us, int len); // us points at the first byte to classify; len is the number of bytes available
author	nyamatongwe <devnull@localhost>	2012-05-26 12:08:06 +1000
committer	nyamatongwe <devnull@localhost>	2012-05-26 12:08:06 +1000
commit	a6598d01d00e24f8c0ee2f4e9cc9036dd447c15f (patch)
tree	31d920448268b4398b70b5e3adb54b5fa5900a35
parent	9be362c7c8a4d2bac76892bc48dd0c54522406fe (diff)
download	scintilla-mirror-a6598d01d00e24f8c0ee2f4e9cc9036dd447c15f.tar.gz