diff options
| -rw-r--r-- | src/Editor.cxx | 91 | ||||
| -rw-r--r-- | src/UniConversion.cxx | 91 | ||||
| -rw-r--r-- | src/UniConversion.h | 6 | 
3 files changed, 102 insertions, 86 deletions
| diff --git a/src/Editor.cxx b/src/Editor.cxx index b46f8a6b7..3b3e1a99e 100644 --- a/src/Editor.cxx +++ b/src/Editor.cxx @@ -36,6 +36,7 @@  #include "CharClassify.h"  #include "Decoration.h"  #include "Document.h" +#include "UniConversion.h"  #include "Selection.h"  #include "PositionCache.h"  #include "Editor.h" @@ -2051,100 +2052,18 @@ LineLayout *Editor::RetrieveLineLayout(int lineNumber) {  	        LinesOnScreen() + 1, pdoc->LinesTotal());  } -static bool GoodTrailByte(int v) { -	return (v >= 0x80) && (v < 0xc0); -} -  bool BadUTF(const char *s, int len, int &trailBytes) {  	// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8  	if (trailBytes) {  		trailBytes--;  		return false;  	} -	const unsigned char *us = reinterpret_cast<const unsigned char *>(s); -	if (*us < 0x80) { -		// Single bytes easy -		return false; -	} else if (*us > 0xF4) { -		// Characters longer than 4 bytes not possible in current UTF-8 -		return true; -	} else if (*us >= 0xF0) { -		// 4 bytes -		if (len < 4) -			return true; -		if (GoodTrailByte(us[1]) && GoodTrailByte(us[2]) && GoodTrailByte(us[3])) { -			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) { -				// *FFFE or *FFFF non-character -				return true; -			} -			if (*us == 0xf4) { -				// Check if encoding a value beyond the last Unicode character 10FFFF -				if (us[1] > 0x8f) { -					return true; -				} else if (us[1] == 0x8f) { -					if (us[2] > 0xbf) { -						return true; -					} else if (us[2] == 0xbf) { -						if (us[3] > 0xbf) { -							return true; -						} -					} -				} -			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) { -				// Overlong -				return true; -			} -			trailBytes = 3; -			return false; -		} else { -			return true; -		} -	} else if (*us >= 0xE0) { -		// 3 bytes -		if (len < 3) -			return true; -		if (GoodTrailByte(us[1]) && GoodTrailByte(us[2])) { -			if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) { -				// Overlong -				return true; -			} -			if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) { -				// Surrogate -				return true; -			} -			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) { -				// U+FFFE -				return true; -			} -			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) { -				// U+FFFF -				return true; -			} -			if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) { -				// U+FDD0 .. U+FDEF -				return true; -			} -			trailBytes = 2; -			return false; -		} else { -			return true; -		} -	} else if (*us >= 0xC2) { -		// 2 bytes -		if (len < 2) -			return true; -		if (GoodTrailByte(us[1])) { -			trailBytes = 1; -			return false; -		} else { -			return true; -		} -	} else if (*us >= 0xC0) { -		// Overlong encoding +	int utf8status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), len); +	if (utf8status & UTF8MaskInvalid) {  		return true;  	} else { -		// Trail byte -		return true; +		trailBytes = (utf8status & UTF8MaskWidth) - 1; +		return false;  	}  } diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 2ef75840e..e1ad99563 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -129,3 +129,94 @@ unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsig  	}  	return ui;  } + +// Return both the width of the first character in the string and a status +// saying whether it is valid or invalid. +// Most invalid sequences return a width of 1 so are treated as isolated bytes but +// the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be +// reasonably treated as code points in some circumstances. They will, however, +// not have associated glyphs. +int UTF8Classify(const unsigned char *us, int len) { +	// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 +	if (*us < 0x80) { +		// Single bytes easy +		return 1; +	} else if (*us > 0xf4) { +		// Characters longer than 4 bytes not possible in current UTF-8 +		return UTF8MaskInvalid | 1; +	} else if (*us >= 0xf0) { +		// 4 bytes +		if (len < 4) +			return UTF8MaskInvalid | 1; +		if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) { +			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) { +				// *FFFE or *FFFF non-character +				return UTF8MaskInvalid | 4; +			} +			if (*us == 0xf4) { +				// Check if encoding a value beyond the last Unicode character 10FFFF +				if (us[1] > 0x8f) { +					return UTF8MaskInvalid | 1; +				} else if (us[1] == 0x8f) { +					if (us[2] > 0xbf) { +						return UTF8MaskInvalid | 1; +					} else if (us[2] == 0xbf) { +						if (us[3] > 0xbf) { +							return UTF8MaskInvalid | 1; +						} +					} +				} +			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) { +				// Overlong +				return UTF8MaskInvalid | 1; +			} +			return 4; +		} else { +			return UTF8MaskInvalid | 1; +		} +	} else if (*us >= 0xe0) { +		// 3 bytes +		if (len < 3) +			return UTF8MaskInvalid | 1; +		if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) { +			if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) { +				// Overlong +				return UTF8MaskInvalid | 1; +			} +			if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) { +				// Surrogate +				return UTF8MaskInvalid | 1; +			} +			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) { +				// U+FFFE non-character - 3 bytes long +				return UTF8MaskInvalid | 3; +			} +			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) { +				// U+FFFF non-character - 3 bytes long +				return UTF8MaskInvalid | 3; +			} +			if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) { +				// U+FDD0 .. U+FDEF +				return UTF8MaskInvalid | 3; +			} +			return 3; +		} else { +			return UTF8MaskInvalid | 1; +		} +	} else if (*us >= 0xc2) { +		// 2 bytes +		if (len < 2) +			return UTF8MaskInvalid | 1; +		if (UTF8IsTrailByte(us[1])) { +			return 2; +		} else { +			return UTF8MaskInvalid | 1; +		} +	} else if (*us >= 0xc0) { +		// Overlong encoding +		return UTF8MaskInvalid | 1; +	} else { +		// Trail byte +		return UTF8MaskInvalid | 1; +	} +} diff --git a/src/UniConversion.h b/src/UniConversion.h index 2de2ef3fe..6793221cf 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -11,3 +11,9 @@ unsigned int UTF8CharLength(unsigned char ch);  unsigned int UTF16Length(const char *s, unsigned int len);  unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen); +inline bool UTF8IsTrailByte(int ch) { +	return (ch >= 0x80) && (ch < 0xc0); +} + +enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 }; +int UTF8Classify(const unsigned char *us, int len); | 
