diff options
author | nyamatongwe <devnull@localhost> | 2012-05-26 12:08:06 +1000 |
---|---|---|
committer | nyamatongwe <devnull@localhost> | 2012-05-26 12:08:06 +1000 |
commit | a6598d01d00e24f8c0ee2f4e9cc9036dd447c15f (patch) | |
tree | 31d920448268b4398b70b5e3adb54b5fa5900a35 /src | |
parent | 9be362c7c8a4d2bac76892bc48dd0c54522406fe (diff) | |
download | scintilla-mirror-a6598d01d00e24f8c0ee2f4e9cc9036dd447c15f.tar.gz |
Move classification of UTF-8 byte sequences into UniConversion module.
Diffstat (limited to 'src')
-rw-r--r-- | src/Editor.cxx | 91 | ||||
-rw-r--r-- | src/UniConversion.cxx | 91 | ||||
-rw-r--r-- | src/UniConversion.h | 6 |
3 files changed, 102 insertions, 86 deletions
diff --git a/src/Editor.cxx b/src/Editor.cxx index b46f8a6b7..3b3e1a99e 100644 --- a/src/Editor.cxx +++ b/src/Editor.cxx @@ -36,6 +36,7 @@ #include "CharClassify.h" #include "Decoration.h" #include "Document.h" +#include "UniConversion.h" #include "Selection.h" #include "PositionCache.h" #include "Editor.h" @@ -2051,100 +2052,18 @@ LineLayout *Editor::RetrieveLineLayout(int lineNumber) { LinesOnScreen() + 1, pdoc->LinesTotal()); } -static bool GoodTrailByte(int v) { - return (v >= 0x80) && (v < 0xc0); -} - bool BadUTF(const char *s, int len, int &trailBytes) { // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 if (trailBytes) { trailBytes--; return false; } - const unsigned char *us = reinterpret_cast<const unsigned char *>(s); - if (*us < 0x80) { - // Single bytes easy - return false; - } else if (*us > 0xF4) { - // Characters longer than 4 bytes not possible in current UTF-8 - return true; - } else if (*us >= 0xF0) { - // 4 bytes - if (len < 4) - return true; - if (GoodTrailByte(us[1]) && GoodTrailByte(us[2]) && GoodTrailByte(us[3])) { - if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) { - // *FFFE or *FFFF non-character - return true; - } - if (*us == 0xf4) { - // Check if encoding a value beyond the last Unicode character 10FFFF - if (us[1] > 0x8f) { - return true; - } else if (us[1] == 0x8f) { - if (us[2] > 0xbf) { - return true; - } else if (us[2] == 0xbf) { - if (us[3] > 0xbf) { - return true; - } - } - } - } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) { - // Overlong - return true; - } - trailBytes = 3; - return false; - } else { - return true; - } - } else if (*us >= 0xE0) { - // 3 bytes - if (len < 3) - return true; - if (GoodTrailByte(us[1]) && GoodTrailByte(us[2])) { - if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) { - // Overlong - return true; - } - if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) { - // Surrogate - return true; - } - if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) { - // U+FFFE - return true; - } - if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) { - // U+FFFF - return true; - } - if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) { - // U+FDD0 .. U+FDEF - return true; - } - trailBytes = 2; - return false; - } else { - return true; - } - } else if (*us >= 0xC2) { - // 2 bytes - if (len < 2) - return true; - if (GoodTrailByte(us[1])) { - trailBytes = 1; - return false; - } else { - return true; - } - } else if (*us >= 0xC0) { - // Overlong encoding + int utf8status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), len); + if (utf8status & UTF8MaskInvalid) { return true; } else { - // Trail byte - return true; + trailBytes = (utf8status & UTF8MaskWidth) - 1; + return false; } } diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 2ef75840e..e1ad99563 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -129,3 +129,94 @@ unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsig } return ui; } + +// Return both the width of the first character in the string and a status +// saying whether it is valid or invalid. +// Most invalid sequences return a width of 1 so are treated as isolated bytes but +// the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be +// reasonably treated as code points in some circumstances. They will, however, +// not have associated glyphs. +int UTF8Classify(const unsigned char *us, int len) { + // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 + if (*us < 0x80) { + // Single bytes easy + return 1; + } else if (*us > 0xf4) { + // Characters longer than 4 bytes not possible in current UTF-8 + return UTF8MaskInvalid | 1; + } else if (*us >= 0xf0) { + // 4 bytes + if (len < 4) + return UTF8MaskInvalid | 1; + if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) { + if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) { + // *FFFE or *FFFF non-character + return UTF8MaskInvalid | 4; + } + if (*us == 0xf4) { + // Check if encoding a value beyond the last Unicode character 10FFFF + if (us[1] > 0x8f) { + return UTF8MaskInvalid | 1; + } else if (us[1] == 0x8f) { + if (us[2] > 0xbf) { + return UTF8MaskInvalid | 1; + } else if (us[2] == 0xbf) { + if (us[3] > 0xbf) { + return UTF8MaskInvalid | 1; + } + } + } + } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) { + // Overlong + return UTF8MaskInvalid | 1; + } + return 4; + } else { + return UTF8MaskInvalid | 1; + } + } else if (*us >= 0xe0) { + // 3 bytes + if (len < 3) + return UTF8MaskInvalid | 1; + if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) { + if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) { + // Overlong + return UTF8MaskInvalid | 1; + } + if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) { + // Surrogate + return UTF8MaskInvalid | 1; + } + if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) { + // U+FFFE non-character - 3 bytes long + return UTF8MaskInvalid | 3; + } + if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) { + // U+FFFF non-character - 3 bytes long + return UTF8MaskInvalid | 3; + } + if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) { + // U+FDD0 .. U+FDEF + return UTF8MaskInvalid | 3; + } + return 3; + } else { + return UTF8MaskInvalid | 1; + } + } else if (*us >= 0xc2) { + // 2 bytes + if (len < 2) + return UTF8MaskInvalid | 1; + if (UTF8IsTrailByte(us[1])) { + return 2; + } else { + return UTF8MaskInvalid | 1; + } + } else if (*us >= 0xc0) { + // Overlong encoding + return UTF8MaskInvalid | 1; + } else { + // Trail byte + return UTF8MaskInvalid | 1; + } +} diff --git a/src/UniConversion.h b/src/UniConversion.h index 2de2ef3fe..6793221cf 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -11,3 +11,9 @@ unsigned int UTF8CharLength(unsigned char ch); unsigned int UTF16Length(const char *s, unsigned int len); unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen); +inline bool UTF8IsTrailByte(int ch) { + return (ch >= 0x80) && (ch < 0xc0); +} + +enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 }; +int UTF8Classify(const unsigned char *us, int len); |