diff options
| -rw-r--r-- | gtk/PlatGTK.cxx | 6 | ||||
| -rw-r--r-- | src/Document.cxx | 4 | ||||
| -rw-r--r-- | src/UniConversion.cxx | 67 | ||||
| -rw-r--r-- | src/UniConversion.h | 6 | ||||
| -rw-r--r-- | win32/PlatWin.cxx | 29 | ||||
| -rw-r--r-- | win32/ScintillaWin.cxx | 24 | 
6 files changed, 92 insertions, 44 deletions
| diff --git a/gtk/PlatGTK.cxx b/gtk/PlatGTK.cxx index 59e4d9dca..e361b3e17 100644 --- a/gtk/PlatGTK.cxx +++ b/gtk/PlatGTK.cxx @@ -1293,7 +1293,7 @@ void SurfaceImpl::DrawTextBase(PRectangle rc, Font &font_, int ybase, const char  				len = maxLengthTextRun-1;  			int wclen;  			if (et == UTF8) { -				wclen = UCS2FromUTF8(s, len, +				wclen = UTF16FromUTF8(s, len,  					static_cast<wchar_t *>(static_cast<void *>(wctext)), maxLengthTextRun - 1);  			} else {	// dbcs, so convert using current locale  				char sMeasure[maxLengthTextRun]; @@ -1468,7 +1468,7 @@ void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, int *positi  				len = maxLengthTextRun-1;  			int wclen;  			if (et == UTF8) { -				wclen = UCS2FromUTF8(s, len, +				wclen = UTF16FromUTF8(s, len,  					static_cast<wchar_t *>(static_cast<void *>(wctext)), maxLengthTextRun - 1);  			} else {	// dbcsMode, so convert using current locale  				char sDraw[maxLengthTextRun]; @@ -1554,7 +1554,7 @@ int SurfaceImpl::WidthText(Font &font_, const char *s, int len) {  #endif  		if (et == UTF8) {  			GdkWChar wctext[maxLengthTextRun]; -			size_t wclen = UCS2FromUTF8(s, len, static_cast<wchar_t *>(static_cast<void *>(wctext)), +			size_t wclen = UTF16FromUTF8(s, len, static_cast<wchar_t *>(static_cast<void *>(wctext)),  				sizeof(wctext) / sizeof(GdkWChar) - 1);  			wctext[wclen] = L'\0';  			return gdk_text_width_wc(PFont(font_)->pfont, wctext, wclen); diff --git a/src/Document.cxx b/src/Document.cxx index a25e3070d..3061bbc37 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -266,7 +266,9 @@ int Document::LenChar(int pos) {  		if (ch < 0x80)  			return 1;  		int len = 2; -		if (ch >= (0x80 + 0x40 + 0x20)) +		if (ch >= (0x80 + 0x40 + 0x20 + 0x10)) +			len = 4; +		else if (ch >= (0x80 + 0x40 + 0x20))  			len = 3;  		int lengthDoc = Length();  		if ((pos + len) > lengthDoc) diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 363db90f4..863eb82cd 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -9,49 +9,80 @@  #include "UniConversion.h" +enum { SURROGATE_LEAD_FIRST = 0xD800 }; +enum { SURROGATE_TRAIL_FIRST = 0xDC00 }; +enum { SURROGATE_TRAIL_LAST = 0xDFFF }; +  unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {  	unsigned int len = 0; -	for (unsigned int i = 0; i < tlen && uptr[i]; i++) { +	for (unsigned int i = 0; i < tlen && uptr[i];) {  		unsigned int uch = uptr[i]; -		if (uch < 0x80) +		if (uch < 0x80) {  			len++; -		else if (uch < 0x800) +		} else if (uch < 0x800) {  			len += 2; -		else -			len +=3; +		} else if ((uch >= SURROGATE_LEAD_FIRST) && +			(uch <= SURROGATE_TRAIL_LAST)) { +			len += 4; +			i++; +		} else { +			len += 3; +		} +		i++;  	}  	return len;  } -void UTF8FromUCS2(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) { +void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) {  	int k = 0; -	for (unsigned int i = 0; i < tlen && uptr[i]; i++) { +	for (unsigned int i = 0; i < tlen && uptr[i];) {  		unsigned int uch = uptr[i];  		if (uch < 0x80) {  			putf[k++] = static_cast<char>(uch);  		} else if (uch < 0x800) {  			putf[k++] = static_cast<char>(0xC0 | (uch >> 6));  			putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); +		} else if ((uch >= SURROGATE_LEAD_FIRST) && +			(uch <= SURROGATE_TRAIL_LAST)) { +			// Half a surrogate pair +			i++; +			unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff); +			putf[k++] = static_cast<char>(0xF0 | (xch >> 18)); +			putf[k++] = static_cast<char>(0x80 | (xch >> 12) & 0x3f); +			putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f)); +			putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));  		} else {  			putf[k++] = static_cast<char>(0xE0 | (uch >> 12));  			putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));  			putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));  		} +		i++;  	}  	putf[len] = '\0';  } -unsigned int UCS2Length(const char *s, unsigned int len) { +unsigned int UTF16Length(const char *s, unsigned int len) {  	unsigned int ulen = 0; -	for (unsigned int i=0;i<len;i++) { +	unsigned int charLen; +	for (unsigned int i=0;i<len;) {  		unsigned char ch = static_cast<unsigned char>(s[i]); -		if ((ch < 0x80) || (ch > (0x80 + 0x40))) +		if (ch < 0x80) { +			charLen = 1; +		} else if (ch < 0x80 + 0x40 + 0x20) { +			charLen = 2; +		} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { +			charLen = 3; +		} else { +			charLen = 4;  			ulen++; +		} +		i += charLen; +		ulen++;  	}  	return ulen;  } -unsigned int UCS2FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen) { +unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen) {  	unsigned int ui=0;  	const unsigned char *us = reinterpret_cast<const unsigned char *>(s);  	unsigned int i=0; @@ -63,12 +94,24 @@ unsigned int UCS2FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsign  			tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);  			ch = us[i++];  			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F)); -		} else { +		} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {  			tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);  			ch = us[i++];  			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + ((ch & 0x7F) << 6));  			ch = us[i++];  			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F)); +		} else { +			// Outside the BMP so need two surrogates +			int val = (ch & 0x7) << 18; +			ch = us[i++]; +			val += (ch & 0x3F) << 12; +			ch = us[i++]; +			val += (ch & 0x3F) << 6; +			ch = us[i++]; +			val += (ch & 0x3F); +			tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST); +			ui++; +			tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);  		}  		ui++;  	} diff --git a/src/UniConversion.h b/src/UniConversion.h index bd1d7754d..fd420a688 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -6,7 +6,7 @@  // The License.txt file describes the conditions under which this software may be distributed.  unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen); -void UTF8FromUCS2(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len); -unsigned int UCS2Length(const char *s, unsigned int len); -unsigned int UCS2FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen); +void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len); +unsigned int UTF16Length(const char *s, unsigned int len); +unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen); diff --git a/win32/PlatWin.cxx b/win32/PlatWin.cxx index b2cc554bb..dd3213bf9 100644 --- a/win32/PlatWin.cxx +++ b/win32/PlatWin.cxx @@ -684,7 +684,7 @@ void SurfaceImpl::DrawTextCommon(PRectangle rc, Font &font_, int ybase, const ch  		wchar_t tbuf[MAX_US_LEN];  		int tlen;  		if (unicodeMode) { -			tlen = UCS2FromUTF8(s, len, tbuf, MAX_US_LEN); +			tlen = UTF16FromUTF8(s, len, tbuf, MAX_US_LEN);  		} else {  			// Support Asian string display in 9x English  			tlen = ::MultiByteToWideChar(codePage, 0, s, len, NULL, 0); @@ -740,7 +740,7 @@ int SurfaceImpl::WidthText(Font &font_, const char *s, int len) {  	SIZE sz={0,0};  	if (unicodeMode) {  		wchar_t tbuf[MAX_US_LEN]; -		int tlen = UCS2FromUTF8(s, len, tbuf, MAX_US_LEN); +		int tlen = UTF16FromUTF8(s, len, tbuf, MAX_US_LEN);  		::GetTextExtentPoint32W(hdc, tbuf, tlen, &sz);  	} else if (IsNT() || (codePage==0) || win9xACPSame) {  		::GetTextExtentPoint32A(hdc, s, Platform::Minimum(len, maxLenText), &sz); @@ -760,7 +760,7 @@ void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, int *positi  	int fit = 0;  	if (unicodeMode) {  		wchar_t tbuf[MAX_US_LEN]; -		int tlen = UCS2FromUTF8(s, len, tbuf, MAX_US_LEN); +		int tlen = UTF16FromUTF8(s, len, tbuf, MAX_US_LEN);  		int poses[MAX_US_LEN];  		fit = tlen;  		if (!::GetTextExtentExPointW(hdc, tbuf, tlen, maxWidthMeasure, &fit, poses, &sz)) { @@ -778,14 +778,17 @@ void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, int *positi  		int i=0;  		while (ui<fit) {  			unsigned char uch = us[i]; -			positions[i++] = poses[ui]; -			if (uch >= 0x80) { -				if (uch < (0x80 + 0x40 + 0x20)) { -					positions[i++] = poses[ui]; -				} else { -					positions[i++] = poses[ui]; -					positions[i++] = poses[ui]; -				} +			unsigned int lenChar = 1; +			if (uch >= (0x80 + 0x40 + 0x20 + 0x10)) { +				lenChar = 4; +				ui++; +			} else if (uch >= (0x80 + 0x40 + 0x20)) { +				lenChar = 3; +			} else if (uch >= (0x80)) { +				lenChar = 2; +			} +			for (unsigned int bytePos=0; bytePos<lenChar; bytePos++) { +				positions[i++] = poses[ui];  			}  			ui++;  		} @@ -1312,7 +1315,7 @@ PRectangle ListBoxX::GetDesiredRect() {  	int len = widestItem ? strlen(widestItem) : 0;  	if (unicodeMode) {  		wchar_t tbuf[MAX_US_LEN]; -		len = UCS2FromUTF8(widestItem, len, tbuf, sizeof(tbuf)/sizeof(wchar_t)-1); +		len = UTF16FromUTF8(widestItem, len, tbuf, sizeof(tbuf)/sizeof(wchar_t)-1);  		tbuf[len] = L'\0';  		::GetTextExtentPoint32W(hdc, tbuf, len, &textSize);  	} else { @@ -1431,7 +1434,7 @@ void ListBoxX::Draw(DRAWITEMSTRUCT *pDrawItem) {  		if (unicodeMode) {  			wchar_t tbuf[MAX_US_LEN]; -			int tlen = UCS2FromUTF8(text, len, tbuf, sizeof(tbuf)/sizeof(wchar_t)-1); +			int tlen = UTF16FromUTF8(text, len, tbuf, sizeof(tbuf)/sizeof(wchar_t)-1);  			tbuf[tlen] = L'\0';  			::DrawTextW(pDrawItem->hDC, tbuf, tlen, &rcText, DT_NOPREFIX|DT_END_ELLIPSIS|DT_SINGLELINE|DT_NOCLIP);  		} else { diff --git a/win32/ScintillaWin.cxx b/win32/ScintillaWin.cxx index 046da18a6..acca53d6a 100644 --- a/win32/ScintillaWin.cxx +++ b/win32/ScintillaWin.cxx @@ -487,7 +487,7 @@ sptr_t ScintillaWin::HandleComposition(uptr_t wParam, sptr_t lParam) {  			if (IsUnicodeMode()) {  				char utfval[maxLenInputIME * 3];  				unsigned int len = UTF8Length(wcs, wides); -				UTF8FromUCS2(wcs, wides, utfval, len); +				UTF8FromUTF16(wcs, wides, utfval, len);  				utfval[len] = '\0';  				AddCharUTF(utfval, len);  			} else { @@ -725,7 +725,7 @@ sptr_t ScintillaWin::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam  				//char utfval[4];  				//wchar_t wcs[2] = {wParam, 0};  				//unsigned int len = UTF8Length(wcs, 1); -				//UTF8FromUCS2(wcs, 1, utfval, len); +				//UTF8FromUTF16(wcs, 1, utfval, len);  				//AddCharUTF(utfval, len);  				AddCharBytes('\0', LOBYTE(wParam));  			} else { @@ -744,7 +744,7 @@ sptr_t ScintillaWin::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam  				char utfval[4];  				wchar_t wcs[2] = {static_cast<wchar_t>(wParam), 0};  				unsigned int len = UTF8Length(wcs, 1); -				UTF8FromUCS2(wcs, 1, utfval, len); +				UTF8FromUTF16(wcs, 1, utfval, len);  				AddCharUTF(utfval, len);  				return 1;  			} else { @@ -1301,7 +1301,7 @@ void ScintillaWin::Paste() {  				len = UTF8Length(uptr, bytes / 2);  				putf = new char[len + 1];  				if (putf) { -					UTF8FromUCS2(uptr, bytes / 2, putf, len); +					UTF8FromUTF16(uptr, bytes / 2, putf, len);  				}  			} else {  				// CF_UNICODETEXT available, but not in Unicode mode @@ -1346,8 +1346,8 @@ void ScintillaWin::Paste() {  					unsigned int mlen = UTF8Length(uptr, ulen);  					char *putf = new char[mlen + 1];  					if (putf) { -						// CP_UTF8 not available on Windows 95, so use UTF8FromUCS2() -						UTF8FromUCS2(uptr, ulen, putf, mlen); +						// CP_UTF8 not available on Windows 95, so use UTF8FromUTF16() +						UTF8FromUTF16(uptr, ulen, putf, mlen);  					}  					delete []uptr; @@ -1775,7 +1775,7 @@ void ScintillaWin::AddCharBytes(char b0, char b1) {  			::MultiByteToWideChar(inputCodePage, 0, ansiChars, 1, wcs, 1);  		}  		unsigned int len = UTF8Length(wcs, 1); -		UTF8FromUCS2(wcs, 1, utfval, len); +		UTF8FromUTF16(wcs, 1, utfval, len);  		utfval[len] = '\0';  		AddCharUTF(utfval, len ? len : 1);  	} else if (b0) { @@ -1803,10 +1803,10 @@ void ScintillaWin::CopyToClipboard(const SelectionText &selectedText) {  	// Default Scintilla behaviour in Unicode mode  	if (IsUnicodeMode()) { -		int uchars = UCS2Length(selectedText.s, selectedText.len); +		int uchars = UTF16Length(selectedText.s, selectedText.len);  		uniText.Allocate(2 * uchars);  		if (uniText) { -			UCS2FromUTF8(selectedText.s, selectedText.len, static_cast<wchar_t *>(uniText.ptr), uchars); +			UTF16FromUTF8(selectedText.s, selectedText.len, static_cast<wchar_t *>(uniText.ptr), uchars);  		}  	} else {  		// Not Unicode mode @@ -2093,7 +2093,7 @@ STDMETHODIMP ScintillaWin::Drop(LPDATAOBJECT pIDataSource, DWORD grfKeyState,  			int dataLen = UTF8Length(udata, tlen/2);  			data = new char[dataLen+1];  			if (data) { -				UTF8FromUCS2(udata, tlen/2, data, dataLen); +				UTF8FromUTF16(udata, tlen/2, data, dataLen);  				dataAllocated = true;  			}  		} @@ -2153,10 +2153,10 @@ STDMETHODIMP ScintillaWin::GetData(FORMATETC *pFEIn, STGMEDIUM *pSTM) {  	GlobalMemory text;  	if (pFEIn->cfFormat == CF_UNICODETEXT) { -		int uchars = UCS2Length(drag.s, drag.len); +		int uchars = UTF16Length(drag.s, drag.len);  		text.Allocate(2 * uchars);  		if (text) { -			UCS2FromUTF8(drag.s, drag.len, static_cast<wchar_t *>(text.ptr), uchars); +			UTF16FromUTF8(drag.s, drag.len, static_cast<wchar_t *>(text.ptr), uchars);  		}  	} else {  		text.Allocate(drag.len); | 
