about | summary | refs | log | tree | commit | diff | homepage
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/UniConversion.cxx 87
1 file changed, 42 insertions, 45 deletions
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index 16a9cef8a..39f553bc2 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -229,6 +229,7 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
}
}
+// generated by scripts/GenerateCharTable.py
const unsigned char UTF8BytesOfLead[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
@@ -256,39 +257,28 @@ const unsigned char UTF8BytesOfLead[256] = {
// not have associated glyphs.
int UTF8Classify(const unsigned char *us, int len) {
// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
- if (*us < 0x80) {
- // Single bytes easy
+ if (us[0] < 0x80) {
+ // ASCII
return 1;
- } else if (*us > 0xf4) {
- // Characters longer than 4 bytes not possible in current UTF-8
+ }
+
+ const int byteCount = UTF8BytesOfLead[us[0]];
+ if (byteCount == 1 || byteCount > len) {
+ // Invalid lead byte
return UTF8MaskInvalid | 1;
- } else if (*us >= 0xf0) {
- // 4 bytes
- if (len < 4)
- return UTF8MaskInvalid | 1;
- if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
- if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
- // *FFFE or *FFFF non-character
- return UTF8MaskInvalid | 4;
- }
- if (*us == 0xf4) {
- // Check if encoding a value beyond the last Unicode character 10FFFF
- if (us[1] > 0x8f) {
- return UTF8MaskInvalid | 1;
- }
- } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
- // Overlong
- return UTF8MaskInvalid | 1;
- }
- return 4;
- } else {
- return UTF8MaskInvalid | 1;
- }
- } else if (*us >= 0xe0) {
- // 3 bytes
- if (len < 3)
- return UTF8MaskInvalid | 1;
- if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
+ }
+
+ if (!UTF8IsTrailByte(us[1])) {
+ // Invalid trail byte
+ return UTF8MaskInvalid | 1;
+ }
+
+ switch (byteCount) {
+ case 2:
+ return 2;
+
+ case 3:
+ if (UTF8IsTrailByte(us[2])) {
if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
// Overlong
return UTF8MaskInvalid | 1;
@@ -310,23 +300,30 @@ int UTF8Classify(const unsigned char *us, int len) {
return UTF8MaskInvalid | 3;
}
return 3;
- } else {
- return UTF8MaskInvalid | 1;
}
- } else if (*us >= 0xc2) {
- // 2 bytes
- if (len < 2)
- return UTF8MaskInvalid | 1;
- if (UTF8IsTrailByte(us[1])) {
- return 2;
- } else {
- return UTF8MaskInvalid | 1;
+ break;
+
+ default:
+ if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
+ if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
+ // *FFFE or *FFFF non-character
+ return UTF8MaskInvalid | 4;
+ }
+ if (*us == 0xf4) {
+ // Check if encoding a value beyond the last Unicode character 10FFFF
+ if (us[1] > 0x8f) {
+ return UTF8MaskInvalid | 1;
+ }
+ } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
+ // Overlong
+ return UTF8MaskInvalid | 1;
+ }
+ return 4;
}
- } else {
- // 0xc0 .. 0xc1 is overlong encoding
- // 0x80 .. 0xbf is trail byte
- return UTF8MaskInvalid | 1;
+ break;
}
+
+ return UTF8MaskInvalid | 1;
}
int UTF8DrawBytes(const unsigned char *us, int len) {