aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/UniConversion.cxx
diff options
context:
space:
mode:
author Zufu Liu <unknown> 2018-03-25 10:51:16 +1100
committer Zufu Liu <unknown> 2018-03-25 10:51:16 +1100
commit aa5481a2fbd394e059ffdd7a8cd677abd55227d5 (patch)
tree 759febaf4312d5f210196b26feb250751fd3c220 /src/UniConversion.cxx
parent 4692de654d0e973f6ceed4b00d7859ef9b6af254 (diff)
download scintilla-mirror-aa5481a2fbd394e059ffdd7a8cd677abd55227d5.tar.gz
Feature [feature-requests:#1213]. Clarify UTF8Classify.
Use UTF8BytesOfLead to determine the expected length early in the function, so that an argument that is not long enough, an invalid single byte, or an invalid first trail byte is detected quickly; then branch on the length for the more detailed checks.
Diffstat (limited to 'src/UniConversion.cxx')
-rw-r--r-- src/UniConversion.cxx 87
1 files changed, 42 insertions, 45 deletions
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index 16a9cef8a..39f553bc2 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -229,6 +229,7 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
}
}
+// generated by scripts/GenerateCharTable.py
const unsigned char UTF8BytesOfLead[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
@@ -256,39 +257,28 @@ const unsigned char UTF8BytesOfLead[256] = {
// not have associated glyphs.
int UTF8Classify(const unsigned char *us, int len) {
// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
- if (*us < 0x80) {
- // Single bytes easy
+ if (us[0] < 0x80) {
+ // ASCII
return 1;
- } else if (*us > 0xf4) {
- // Characters longer than 4 bytes not possible in current UTF-8
+ }
+
+ const int byteCount = UTF8BytesOfLead[us[0]];
+ if (byteCount == 1 || byteCount > len) {
+ // Invalid lead byte
return UTF8MaskInvalid | 1;
- } else if (*us >= 0xf0) {
- // 4 bytes
- if (len < 4)
- return UTF8MaskInvalid | 1;
- if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
- if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
- // *FFFE or *FFFF non-character
- return UTF8MaskInvalid | 4;
- }
- if (*us == 0xf4) {
- // Check if encoding a value beyond the last Unicode character 10FFFF
- if (us[1] > 0x8f) {
- return UTF8MaskInvalid | 1;
- }
- } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
- // Overlong
- return UTF8MaskInvalid | 1;
- }
- return 4;
- } else {
- return UTF8MaskInvalid | 1;
- }
- } else if (*us >= 0xe0) {
- // 3 bytes
- if (len < 3)
- return UTF8MaskInvalid | 1;
- if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
+ }
+
+ if (!UTF8IsTrailByte(us[1])) {
+ // Invalid trail byte
+ return UTF8MaskInvalid | 1;
+ }
+
+ switch (byteCount) {
+ case 2:
+ return 2;
+
+ case 3:
+ if (UTF8IsTrailByte(us[2])) {
if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
// Overlong
return UTF8MaskInvalid | 1;
@@ -310,23 +300,30 @@ int UTF8Classify(const unsigned char *us, int len) {
return UTF8MaskInvalid | 3;
}
return 3;
- } else {
- return UTF8MaskInvalid | 1;
}
- } else if (*us >= 0xc2) {
- // 2 bytes
- if (len < 2)
- return UTF8MaskInvalid | 1;
- if (UTF8IsTrailByte(us[1])) {
- return 2;
- } else {
- return UTF8MaskInvalid | 1;
+ break;
+
+ default:
+ if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
+ if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
+ // *FFFE or *FFFF non-character
+ return UTF8MaskInvalid | 4;
+ }
+ if (*us == 0xf4) {
+ // Check if encoding a value beyond the last Unicode character 10FFFF
+ if (us[1] > 0x8f) {
+ return UTF8MaskInvalid | 1;
+ }
+ } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
+ // Overlong
+ return UTF8MaskInvalid | 1;
+ }
+ return 4;
}
- } else {
- // 0xc0 .. 0xc1 is overlong encoding
- // 0x80 .. 0xbf is trail byte
- return UTF8MaskInvalid | 1;
+ break;
}
+
+ return UTF8MaskInvalid | 1;
}
int UTF8DrawBytes(const unsigned char *us, int len) {