diff options
author | Neil <nyamatongwe@gmail.com> | 2019-03-20 19:18:57 +1100 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2019-03-20 19:18:57 +1100 |
commit | 31a735c8ec0a4f7c62b162cd1c173bfef76737c5 (patch) | |
tree | e7a3ad283f5b7c44d79cfbcdb78d8cdf34b68bf2 /src | |
parent | 4ac748ba2238d3abd5227918d8174507d8fa3ec1 (diff) | |
download | scintilla-mirror-31a735c8ec0a4f7c62b162cd1c173bfef76737c5.tar.gz |
Backport: Implement WStringFromUTF8 to simplify code that creates wstring objects for
regular expressions and for calling the Win32 API.
Backport of changeset 7325:6148329fb2f3, but with std::string_view usage replaced
by const char* and size_t components. Also uses #ifdef instead of C++17
`if constexpr`, at Neil's suggestion.
Diffstat (limited to 'src')
-rw-r--r-- | src/Document.cxx | 11 | ||||
-rw-r--r-- | src/UniConversion.cxx | 25 | ||||
-rw-r--r-- | src/UniConversion.h | 4 |
3 files changed, 31 insertions, 9 deletions
diff --git a/src/Document.cxx b/src/Document.cxx index f1079398d..4aad2e370 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -3061,16 +3061,9 @@ Sci::Position Cxx11RegexFindText(const Document *doc, Sci::Position minPos, Sci: bool matched = false; if (SC_CP_UTF8 == doc->dbcsCodePage) { - const size_t lenS = strlen(s); - std::vector<wchar_t> ws(lenS + 1); -#if WCHAR_T_IS_16 - const size_t outLen = UTF16FromUTF8(s, lenS, &ws[0], lenS); -#else - const size_t outLen = UTF32FromUTF8(s, lenS, reinterpret_cast<unsigned int *>(&ws[0]), lenS); -#endif - ws[outLen] = 0; + const std::wstring ws = WStringFromUTF8(s, strlen(s)); std::wregex regexp; - regexp.assign(&ws[0], flagsRe); + regexp.assign(ws, flagsRe); matched = MatchOnLines<UTF8Iterator>(doc, regexp, resr, search); } else { diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 6cd6a8ba9..8cbb3cdd2 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -162,6 +162,17 @@ size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) { return ui; } +size_t UTF32Length(const char *s, size_t len) noexcept { + size_t ulen = 0; + for (size_t i = 0; i < len;) { + const unsigned char ch = s[i]; + const unsigned int byteCount = UTF8BytesOfLead[ch]; + i += byteCount; + ulen++; + } + return ulen; +} + size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) { size_t ui = 0; for (size_t i = 0; i < len;) { @@ -215,6 +226,20 @@ size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) return ui; } +std::wstring WStringFromUTF8(const char *s, size_t len) { +#ifdef _WIN32 + const size_t len16 = UTF16Length(s, len); + std::wstring ws(len16, 0); + UTF16FromUTF8(s, len, &ws[0], len16); + return ws; +#else + const size_t len32 = UTF32Length(s, len); + std::wstring ws(len32, 0); + UTF32FromUTF8(s, len, reinterpret_cast<unsigned int *>(&ws[0]), len32); + return ws; +#endif +} + unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) 
noexcept { if (val < SUPPLEMENTAL_PLANE_FIRST) { tbuf[0] = static_cast<wchar_t>(val); diff --git a/src/UniConversion.h b/src/UniConversion.h index 4bb8875d0..9f405e1ed 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -19,7 +19,11 @@ void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len); void UTF8FromUTF32Character(int uch, char *putf); size_t UTF16Length(const char *s, size_t len); size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen); +size_t UTF32Length(const char *s, size_t len) noexcept; size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen); +// WStringFromUTF8 does the right thing when wchar_t is 2 or 4 bytes so +// works on both Windows and Unix. +std::wstring WStringFromUTF8(const char *s, size_t len); unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept; bool UTF8IsValid(const char *s, size_t len) noexcept; std::string FixInvalidUTF8(const std::string &text); |