From 476e533e7277cfd122f3ca3472783831c9e47ca5 Mon Sep 17 00:00:00 2001 From: nyamatongwe Date: Thu, 19 Apr 2007 04:38:53 +0000 Subject: All Unicode planes supported, not just the Basic Multilingual Plane. --- gtk/PlatGTK.cxx | 6 ++--- src/Document.cxx | 4 ++- src/UniConversion.cxx | 67 +++++++++++++++++++++++++++++++++++++++++--------- src/UniConversion.h | 6 ++--- win32/PlatWin.cxx | 29 ++++++++++++---------- win32/ScintillaWin.cxx | 24 +++++++++--------- 6 files changed, 92 insertions(+), 44 deletions(-) diff --git a/gtk/PlatGTK.cxx b/gtk/PlatGTK.cxx index 59e4d9dca..e361b3e17 100644 --- a/gtk/PlatGTK.cxx +++ b/gtk/PlatGTK.cxx @@ -1293,7 +1293,7 @@ void SurfaceImpl::DrawTextBase(PRectangle rc, Font &font_, int ybase, const char len = maxLengthTextRun-1; int wclen; if (et == UTF8) { - wclen = UCS2FromUTF8(s, len, + wclen = UTF16FromUTF8(s, len, static_cast(static_cast(wctext)), maxLengthTextRun - 1); } else { // dbcs, so convert using current locale char sMeasure[maxLengthTextRun]; @@ -1468,7 +1468,7 @@ void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, int *positi len = maxLengthTextRun-1; int wclen; if (et == UTF8) { - wclen = UCS2FromUTF8(s, len, + wclen = UTF16FromUTF8(s, len, static_cast(static_cast(wctext)), maxLengthTextRun - 1); } else { // dbcsMode, so convert using current locale char sDraw[maxLengthTextRun]; @@ -1554,7 +1554,7 @@ int SurfaceImpl::WidthText(Font &font_, const char *s, int len) { #endif if (et == UTF8) { GdkWChar wctext[maxLengthTextRun]; - size_t wclen = UCS2FromUTF8(s, len, static_cast(static_cast(wctext)), + size_t wclen = UTF16FromUTF8(s, len, static_cast(static_cast(wctext)), sizeof(wctext) / sizeof(GdkWChar) - 1); wctext[wclen] = L'\0'; return gdk_text_width_wc(PFont(font_)->pfont, wctext, wclen); diff --git a/src/Document.cxx b/src/Document.cxx index a25e3070d..3061bbc37 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -266,7 +266,9 @@ int Document::LenChar(int pos) { if (ch < 0x80) return 1; int len = 2; - if (ch >= (0x80 + 0x40 + 0x20)) + if (ch >= (0x80 + 0x40 + 0x20 + 0x10)) + len = 4; + else if (ch >= (0x80 + 0x40 + 0x20)) len = 3; int lengthDoc = Length(); if ((pos + len) > lengthDoc) diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 363db90f4..863eb82cd 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -9,49 +9,80 @@ #include "UniConversion.h" +enum { SURROGATE_LEAD_FIRST = 0xD800 }; +enum { SURROGATE_TRAIL_FIRST = 0xDC00 }; +enum { SURROGATE_TRAIL_LAST = 0xDFFF }; + unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) { unsigned int len = 0; - for (unsigned int i = 0; i < tlen && uptr[i]; i++) { + for (unsigned int i = 0; i < tlen && uptr[i];) { unsigned int uch = uptr[i]; - if (uch < 0x80) + if (uch < 0x80) { len++; - else if (uch < 0x800) + } else if (uch < 0x800) { len += 2; - else - len +=3; + } else if ((uch >= SURROGATE_LEAD_FIRST) && + (uch <= SURROGATE_TRAIL_LAST)) { + len += 4; + i++; + } else { + len += 3; + } + i++; } return len; } -void UTF8FromUCS2(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) { +void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) { int k = 0; - for (unsigned int i = 0; i < tlen && uptr[i]; i++) { + for (unsigned int i = 0; i < tlen && uptr[i];) { unsigned int uch = uptr[i]; if (uch < 0x80) { putf[k++] = static_cast(uch); } else if (uch < 0x800) { putf[k++] = static_cast(0xC0 | (uch >> 6)); putf[k++] = static_cast(0x80 | (uch & 0x3f)); + } else if ((uch >= SURROGATE_LEAD_FIRST) && + (uch <= SURROGATE_TRAIL_LAST)) { + // Half a surrogate pair + i++; + unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff); + putf[k++] = static_cast(0xF0 | (xch >> 18)); + putf[k++] = static_cast(0x80 | (xch >> 12) & 0x3f); + putf[k++] = static_cast(0x80 | ((xch >> 6) & 0x3f)); + putf[k++] = static_cast(0x80 | (xch & 0x3f)); } else { putf[k++] = static_cast(0xE0 | (uch >> 12)); putf[k++] = static_cast(0x80 | ((uch >> 6) & 0x3f)); putf[k++] = static_cast(0x80 | (uch & 0x3f)); } + i++; } putf[len] = '\0'; } -unsigned int UCS2Length(const char *s, unsigned int len) { +unsigned int UTF16Length(const char *s, unsigned int len) { unsigned int ulen = 0; - for (unsigned int i=0;i(s[i]); - if ((ch < 0x80) || (ch > (0x80 + 0x40))) + if (ch < 0x80) { + charLen = 1; + } else if (ch < 0x80 + 0x40 + 0x20) { + charLen = 2; + } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { + charLen = 3; + } else { + charLen = 4; ulen++; + } + i += charLen; + ulen++; } return ulen; } -unsigned int UCS2FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen) { +unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen) { unsigned int ui=0; const unsigned char *us = reinterpret_cast(s); unsigned int i=0; @@ -63,12 +94,24 @@ unsigned int UCS2FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsign tbuf[ui] = static_cast((ch & 0x1F) << 6); ch = us[i++]; tbuf[ui] = static_cast(tbuf[ui] + (ch & 0x7F)); - } else { + } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { tbuf[ui] = static_cast((ch & 0xF) << 12); ch = us[i++]; tbuf[ui] = static_cast(tbuf[ui] + ((ch & 0x7F) << 6)); ch = us[i++]; tbuf[ui] = static_cast(tbuf[ui] + (ch & 0x7F)); + } else { + // Outside the BMP so need two surrogates + int val = (ch & 0x7) << 18; + ch = us[i++]; + val += (ch & 0x3F) << 12; + ch = us[i++]; + val += (ch & 0x3F) << 6; + ch = us[i++]; + val += (ch & 0x3F); + tbuf[ui] = static_cast(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST); + ui++; + tbuf[ui] = static_cast((val & 0x3ff) + SURROGATE_TRAIL_FIRST); } ui++; } diff --git a/src/UniConversion.h b/src/UniConversion.h index bd1d7754d..fd420a688 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -6,7 +6,7 @@ // The License.txt file describes the conditions under which this software may be distributed. unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen); -void UTF8FromUCS2(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len); -unsigned int UCS2Length(const char *s, unsigned int len); -unsigned int UCS2FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen); +void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len); +unsigned int UTF16Length(const char *s, unsigned int len); +unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen); diff --git a/win32/PlatWin.cxx b/win32/PlatWin.cxx index b2cc554bb..dd3213bf9 100644 --- a/win32/PlatWin.cxx +++ b/win32/PlatWin.cxx @@ -684,7 +684,7 @@ void SurfaceImpl::DrawTextCommon(PRectangle rc, Font &font_, int ybase, const ch wchar_t tbuf[MAX_US_LEN]; int tlen; if (unicodeMode) { - tlen = UCS2FromUTF8(s, len, tbuf, MAX_US_LEN); + tlen = UTF16FromUTF8(s, len, tbuf, MAX_US_LEN); } else { // Support Asian string display in 9x English tlen = ::MultiByteToWideChar(codePage, 0, s, len, NULL, 0); @@ -740,7 +740,7 @@ int SurfaceImpl::WidthText(Font &font_, const char *s, int len) { SIZE sz={0,0}; if (unicodeMode) { wchar_t tbuf[MAX_US_LEN]; - int tlen = UCS2FromUTF8(s, len, tbuf, MAX_US_LEN); + int tlen = UTF16FromUTF8(s, len, tbuf, MAX_US_LEN); ::GetTextExtentPoint32W(hdc, tbuf, tlen, &sz); } else if (IsNT() || (codePage==0) || win9xACPSame) { ::GetTextExtentPoint32A(hdc, s, Platform::Minimum(len, maxLenText), &sz); @@ -760,7 +760,7 @@ void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, int *positi int fit = 0; if (unicodeMode) { wchar_t tbuf[MAX_US_LEN]; - int tlen = UCS2FromUTF8(s, len, tbuf, MAX_US_LEN); + int tlen = UTF16FromUTF8(s, len, tbuf, MAX_US_LEN); int poses[MAX_US_LEN]; fit = tlen; if (!::GetTextExtentExPointW(hdc, tbuf, tlen, maxWidthMeasure, &fit, poses, &sz)) { @@ -778,14 +778,17 @@ void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, int *positi int i=0; while (ui= 0x80) { - if (uch < (0x80 + 0x40 + 0x20)) { - positions[i++] = poses[ui]; - } else { - positions[i++] = poses[ui]; - positions[i++] = poses[ui]; - } + unsigned int lenChar = 1; + if (uch >= (0x80 + 0x40 + 0x20 + 0x10)) { + lenChar = 4; + ui++; + } else if (uch >= (0x80 + 0x40 + 0x20)) { + lenChar = 3; + } else if (uch >= (0x80)) { + lenChar = 2; + } + for (unsigned int bytePos=0; bytePoshDC, tbuf, tlen, &rcText, DT_NOPREFIX|DT_END_ELLIPSIS|DT_SINGLELINE|DT_NOCLIP); } else { diff --git a/win32/ScintillaWin.cxx b/win32/ScintillaWin.cxx index 046da18a6..acca53d6a 100644 --- a/win32/ScintillaWin.cxx +++ b/win32/ScintillaWin.cxx @@ -487,7 +487,7 @@ sptr_t ScintillaWin::HandleComposition(uptr_t wParam, sptr_t lParam) { if (IsUnicodeMode()) { char utfval[maxLenInputIME * 3]; unsigned int len = UTF8Length(wcs, wides); - UTF8FromUCS2(wcs, wides, utfval, len); + UTF8FromUTF16(wcs, wides, utfval, len); utfval[len] = '\0'; AddCharUTF(utfval, len); } else { @@ -725,7 +725,7 @@ sptr_t ScintillaWin::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam //char utfval[4]; //wchar_t wcs[2] = {wParam, 0}; //unsigned int len = UTF8Length(wcs, 1); - //UTF8FromUCS2(wcs, 1, utfval, len); + //UTF8FromUTF16(wcs, 1, utfval, len); //AddCharUTF(utfval, len); AddCharBytes('\0', LOBYTE(wParam)); } else { @@ -744,7 +744,7 @@ sptr_t ScintillaWin::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam char utfval[4]; wchar_t wcs[2] = {static_cast(wParam), 0}; unsigned int len = UTF8Length(wcs, 1); - UTF8FromUCS2(wcs, 1, utfval, len); + UTF8FromUTF16(wcs, 1, utfval, len); AddCharUTF(utfval, len); return 1; } else { @@ -1301,7 +1301,7 @@ void ScintillaWin::Paste() { len = UTF8Length(uptr, bytes / 2); putf = new char[len + 1]; if (putf) { - UTF8FromUCS2(uptr, bytes / 2, putf, len); + UTF8FromUTF16(uptr, bytes / 2, putf, len); } } else { // CF_UNICODETEXT available, but not in Unicode mode @@ -1346,8 +1346,8 @@ void ScintillaWin::Paste() { unsigned int mlen = UTF8Length(uptr, ulen); char *putf = new char[mlen + 1]; if (putf) { - // CP_UTF8 not available on Windows 95, so use UTF8FromUCS2() - UTF8FromUCS2(uptr, ulen, putf, mlen); + // CP_UTF8 not available on Windows 95, so use UTF8FromUTF16() + UTF8FromUTF16(uptr, ulen, putf, mlen); } delete []uptr; @@ -1775,7 +1775,7 @@ void ScintillaWin::AddCharBytes(char b0, char b1) { ::MultiByteToWideChar(inputCodePage, 0, ansiChars, 1, wcs, 1); } unsigned int len = UTF8Length(wcs, 1); - UTF8FromUCS2(wcs, 1, utfval, len); + UTF8FromUTF16(wcs, 1, utfval, len); utfval[len] = '\0'; AddCharUTF(utfval, len ? len : 1); } else if (b0) { @@ -1803,10 +1803,10 @@ void ScintillaWin::CopyToClipboard(const SelectionText &selectedText) { // Default Scintilla behaviour in Unicode mode if (IsUnicodeMode()) { - int uchars = UCS2Length(selectedText.s, selectedText.len); + int uchars = UTF16Length(selectedText.s, selectedText.len); uniText.Allocate(2 * uchars); if (uniText) { - UCS2FromUTF8(selectedText.s, selectedText.len, static_cast(uniText.ptr), uchars); + UTF16FromUTF8(selectedText.s, selectedText.len, static_cast(uniText.ptr), uchars); } } else { // Not Unicode mode @@ -2093,7 +2093,7 @@ STDMETHODIMP ScintillaWin::Drop(LPDATAOBJECT pIDataSource, DWORD grfKeyState, int dataLen = UTF8Length(udata, tlen/2); data = new char[dataLen+1]; if (data) { - UTF8FromUCS2(udata, tlen/2, data, dataLen); + UTF8FromUTF16(udata, tlen/2, data, dataLen); dataAllocated = true; } } @@ -2153,10 +2153,10 @@ STDMETHODIMP ScintillaWin::GetData(FORMATETC *pFEIn, STGMEDIUM *pSTM) { GlobalMemory text; if (pFEIn->cfFormat == CF_UNICODETEXT) { - int uchars = UCS2Length(drag.s, drag.len); + int uchars = UTF16Length(drag.s, drag.len); text.Allocate(2 * uchars); if (text) { - UCS2FromUTF8(drag.s, drag.len, static_cast(text.ptr), uchars); + UTF16FromUTF8(drag.s, drag.len, static_cast(text.ptr), uchars); } } else { text.Allocate(drag.len); -- cgit v1.2.3