From 0bb4d5456748c8794a943b4716ee089d0590519c Mon Sep 17 00:00:00 2001 From: Zufu Liu Date: Sat, 24 Mar 2018 13:53:22 +1100 Subject: Feature [feature-requests:#1212]. Move Unicode conversions into UniConversion. Move Unicode conversion functions UnicodeFromUTF8 and UTF8FromUTF32Character into UniConversion. --- src/CaseConvert.cxx | 21 --------------------- src/Document.cxx | 1 - src/UniConversion.cxx | 22 +++++++++++++++++++++- src/UniConversion.h | 16 +++++++++++++++- src/UnicodeFromUTF8.h | 28 ---------------------------- 5 files changed, 36 insertions(+), 52 deletions(-) delete mode 100644 src/UnicodeFromUTF8.h (limited to 'src') diff --git a/src/CaseConvert.cxx b/src/CaseConvert.cxx index 76bc0c652..49205cb3e 100644 --- a/src/CaseConvert.cxx +++ b/src/CaseConvert.cxx @@ -18,7 +18,6 @@ #include "StringCopy.h" #include "CaseConvert.h" #include "UniConversion.h" -#include "UnicodeFromUTF8.h" using namespace Scintilla; @@ -665,26 +664,6 @@ CaseConverter caseConvFold; CaseConverter caseConvUp; CaseConverter caseConvLow; -void UTF8FromUTF32Character(int uch, char *putf) { - size_t k = 0; - if (uch < 0x80) { - putf[k++] = static_cast<char>(uch); - } else if (uch < 0x800) { - putf[k++] = static_cast<char>(0xC0 | (uch >> 6)); - putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); - } else if (uch < 0x10000) { - putf[k++] = static_cast<char>(0xE0 | (uch >> 12)); - putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f)); - putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); - } else { - putf[k++] = static_cast<char>(0xF0 | (uch >> 18)); - putf[k++] = static_cast<char>(0x80 | ((uch >> 12) & 0x3f)); - putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f)); - putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); - } - putf[k] = 0; -} - void AddSymmetric(enum CaseConversion conversion, int lower,int upper) { char lowerUTF8[UTF8MaxBytes+1]; UTF8FromUTF32Character(lower, lowerUTF8); diff --git a/src/Document.cxx b/src/Document.cxx index 48913a16c..cb2892c96 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -51,7 +51,6 @@
#include "Document.h" #include "RESearch.h" #include "UniConversion.h" -#include "UnicodeFromUTF8.h" using namespace Scintilla; diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 19b968932..de86b0b76 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -65,6 +65,26 @@ void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) { putf[k] = '\0'; } +void UTF8FromUTF32Character(int uch, char *putf) { + size_t k = 0; + if (uch < 0x80) { + putf[k++] = static_cast<char>(uch); + } else if (uch < 0x800) { + putf[k++] = static_cast<char>(0xC0 | (uch >> 6)); + putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); + } else if (uch < 0x10000) { + putf[k++] = static_cast<char>(0xE0 | (uch >> 12)); + putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f)); + putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); + } else { + putf[k++] = static_cast<char>(0xF0 | (uch >> 18)); + putf[k++] = static_cast<char>(0x80 | ((uch >> 12) & 0x3f)); + putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f)); + putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); + } + putf[k] = '\0'; +} + size_t UTF16Length(const char *s, size_t len) { size_t ulen = 0; const unsigned char *us = reinterpret_cast<const unsigned char *>(s); @@ -101,7 +121,7 @@ size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) { break; } - const size_t outLen = (byteCount < 4) ?
1 : 2; + const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount); if (ui + outLen > tlen) { throw std::runtime_error("UTF16FromUTF8: attempted write beyond end"); } diff --git a/src/UniConversion.h b/src/UniConversion.h index 0f22c06e6..98bcd0329 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -16,6 +16,7 @@ const int unicodeReplacementChar = 0xFFFD; size_t UTF8Length(const wchar_t *uptr, size_t tlen); void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len); +void UTF8FromUTF32Character(int uch, char *putf); size_t UTF16Length(const char *s, size_t len); size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen); size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen); @@ -24,6 +25,19 @@ std::string FixInvalidUTF8(const std::string &text); extern const unsigned char UTF8BytesOfLead[256]; +inline int UnicodeFromUTF8(const unsigned char *us) { + switch (UTF8BytesOfLead[us[0]]) { + case 1: + return us[0]; + case 2: + return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F); + case 3: + return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F); + default: + return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F); + } +} + inline bool UTF8IsTrailByte(unsigned char ch) { return (ch >= 0x80) && (ch < 0xc0); } @@ -63,7 +77,7 @@ inline unsigned int UTF16CharLength(wchar_t uch) { } inline unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount) { - return (byteCount < 4) ? 1 : 2; + return (byteCount < 4) ? 1 : 2; } } diff --git a/src/UnicodeFromUTF8.h b/src/UnicodeFromUTF8.h deleted file mode 100644 index 17999a786..000000000 --- a/src/UnicodeFromUTF8.h +++ /dev/null @@ -1,28 +0,0 @@ -// Scintilla source code edit control -/** @file UnicodeFromUTF8.h - ** Lexer infrastructure. - **/ -// Copyright 2013 by Neil Hodgson -// This file is in the public domain. 
- -#ifndef UNICODEFROMUTF8_H -#define UNICODEFROMUTF8_H - -namespace Scintilla { - -inline int UnicodeFromUTF8(const unsigned char *us) { - if (us[0] < 0xC2) { - return us[0]; - } else if (us[0] < 0xE0) { - return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F); - } else if (us[0] < 0xF0) { - return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F); - } else if (us[0] < 0xF5) { - return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F); - } - return us[0]; -} - -} - -#endif -- cgit v1.2.3