about summary refs log tree commit diff homepage
path: root/src/Editor.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'src/Editor.cxx')
-rw-r--r-- src/Editor.cxx 91
1 files changed, 5 insertions, 86 deletions
diff --git a/src/Editor.cxx b/src/Editor.cxx
index b46f8a6b7..3b3e1a99e 100644
--- a/src/Editor.cxx
+++ b/src/Editor.cxx
@@ -36,6 +36,7 @@
#include "CharClassify.h"
#include "Decoration.h"
#include "Document.h"
+#include "UniConversion.h"
#include "Selection.h"
#include "PositionCache.h"
#include "Editor.h"
@@ -2051,100 +2052,18 @@ LineLayout *Editor::RetrieveLineLayout(int lineNumber) {
LinesOnScreen() + 1, pdoc->LinesTotal());
}
-static bool GoodTrailByte(int v) {
- return (v >= 0x80) && (v < 0xc0);
-}
-
bool BadUTF(const char *s, int len, int &trailBytes) {
// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
if (trailBytes) {
trailBytes--;
return false;
}
- const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
- if (*us < 0x80) {
- // Single bytes easy
- return false;
- } else if (*us > 0xF4) {
- // Characters longer than 4 bytes not possible in current UTF-8
- return true;
- } else if (*us >= 0xF0) {
- // 4 bytes
- if (len < 4)
- return true;
- if (GoodTrailByte(us[1]) && GoodTrailByte(us[2]) && GoodTrailByte(us[3])) {
- if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
- // *FFFE or *FFFF non-character
- return true;
- }
- if (*us == 0xf4) {
- // Check if encoding a value beyond the last Unicode character 10FFFF
- if (us[1] > 0x8f) {
- return true;
- } else if (us[1] == 0x8f) {
- if (us[2] > 0xbf) {
- return true;
- } else if (us[2] == 0xbf) {
- if (us[3] > 0xbf) {
- return true;
- }
- }
- }
- } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
- // Overlong
- return true;
- }
- trailBytes = 3;
- return false;
- } else {
- return true;
- }
- } else if (*us >= 0xE0) {
- // 3 bytes
- if (len < 3)
- return true;
- if (GoodTrailByte(us[1]) && GoodTrailByte(us[2])) {
- if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
- // Overlong
- return true;
- }
- if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
- // Surrogate
- return true;
- }
- if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
- // U+FFFE
- return true;
- }
- if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
- // U+FFFF
- return true;
- }
- if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
- // U+FDD0 .. U+FDEF
- return true;
- }
- trailBytes = 2;
- return false;
- } else {
- return true;
- }
- } else if (*us >= 0xC2) {
- // 2 bytes
- if (len < 2)
- return true;
- if (GoodTrailByte(us[1])) {
- trailBytes = 1;
- return false;
- } else {
- return true;
- }
- } else if (*us >= 0xC0) {
- // Overlong encoding
+ int utf8status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), len);
+ if (utf8status & UTF8MaskInvalid) {
return true;
} else {
- // Trail byte
- return true;
+ trailBytes = (utf8status & UTF8MaskWidth) - 1;
+ return false;
}
}