about summary refs log tree commit diff homepage
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/Document.cxx 4
-rw-r--r-- src/UniConversion.cxx 67
-rw-r--r-- src/UniConversion.h 6
3 files changed, 61 insertions, 16 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index a25e3070d..3061bbc37 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -266,7 +266,9 @@ int Document::LenChar(int pos) {
if (ch < 0x80)
return 1;
int len = 2;
- if (ch >= (0x80 + 0x40 + 0x20))
+ if (ch >= (0x80 + 0x40 + 0x20 + 0x10))
+ len = 4;
+ else if (ch >= (0x80 + 0x40 + 0x20))
len = 3;
int lengthDoc = Length();
if ((pos + len) > lengthDoc)
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index 363db90f4..863eb82cd 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -9,49 +9,80 @@
#include "UniConversion.h"
+enum { SURROGATE_LEAD_FIRST = 0xD800 };
+enum { SURROGATE_TRAIL_FIRST = 0xDC00 };
+enum { SURROGATE_TRAIL_LAST = 0xDFFF };
+
unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {
unsigned int len = 0;
- for (unsigned int i = 0; i < tlen && uptr[i]; i++) {
+ for (unsigned int i = 0; i < tlen && uptr[i];) {
unsigned int uch = uptr[i];
- if (uch < 0x80)
+ if (uch < 0x80) {
len++;
- else if (uch < 0x800)
+ } else if (uch < 0x800) {
len += 2;
- else
- len +=3;
+ } else if ((uch >= SURROGATE_LEAD_FIRST) &&
+ (uch <= SURROGATE_TRAIL_LAST)) {
+ len += 4;
+ i++;
+ } else {
+ len += 3;
+ }
+ i++;
}
return len;
}
-void UTF8FromUCS2(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) {
+void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) {
int k = 0;
- for (unsigned int i = 0; i < tlen && uptr[i]; i++) {
+ for (unsigned int i = 0; i < tlen && uptr[i];) {
unsigned int uch = uptr[i];
if (uch < 0x80) {
putf[k++] = static_cast<char>(uch);
} else if (uch < 0x800) {
putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
+ } else if ((uch >= SURROGATE_LEAD_FIRST) &&
+ (uch <= SURROGATE_TRAIL_LAST)) {
+ // Half a surrogate pair
+ i++;
+ unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
+ putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
+ putf[k++] = static_cast<char>(0x80 | (xch >> 12) & 0x3f);
+ putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
+ putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
} else {
putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
}
+ i++;
}
putf[len] = '\0';
}
-unsigned int UCS2Length(const char *s, unsigned int len) {
+unsigned int UTF16Length(const char *s, unsigned int len) {
unsigned int ulen = 0;
- for (unsigned int i=0;i<len;i++) {
+ unsigned int charLen;
+ for (unsigned int i=0;i<len;) {
unsigned char ch = static_cast<unsigned char>(s[i]);
- if ((ch < 0x80) || (ch > (0x80 + 0x40)))
+ if (ch < 0x80) {
+ charLen = 1;
+ } else if (ch < 0x80 + 0x40 + 0x20) {
+ charLen = 2;
+ } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
+ charLen = 3;
+ } else {
+ charLen = 4;
ulen++;
+ }
+ i += charLen;
+ ulen++;
}
return ulen;
}
-unsigned int UCS2FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen) {
+unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen) {
unsigned int ui=0;
const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
unsigned int i=0;
@@ -63,12 +94,24 @@ unsigned int UCS2FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsign
tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
ch = us[i++];
tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
- } else {
+ } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
ch = us[i++];
tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + ((ch & 0x7F) << 6));
ch = us[i++];
tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
+ } else {
+ // Outside the BMP so need two surrogates
+ int val = (ch & 0x7) << 18;
+ ch = us[i++];
+ val += (ch & 0x3F) << 12;
+ ch = us[i++];
+ val += (ch & 0x3F) << 6;
+ ch = us[i++];
+ val += (ch & 0x3F);
+ tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
+ ui++;
+ tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
}
ui++;
}
diff --git a/src/UniConversion.h b/src/UniConversion.h
index bd1d7754d..fd420a688 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -6,7 +6,7 @@
// The License.txt file describes the conditions under which this software may be distributed.
unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen);
-void UTF8FromUCS2(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len);
-unsigned int UCS2Length(const char *s, unsigned int len);
-unsigned int UCS2FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen);
+void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len);
+unsigned int UTF16Length(const char *s, unsigned int len);
+unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen);