diff options
author | Neil <nyamatongwe@gmail.com> | 2013-07-11 10:43:40 +1000 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2013-07-11 10:43:40 +1000 |
commit | dad0081820141b9823f8a4ad633b28515f055f1f (patch) | |
tree | 823e4cc29a3fa9a524aa308bb824cf5406c7b47f /src | |
parent | 431004e5efda4bddbeb265db3d0e28fda828a808 (diff) | |
download | scintilla-mirror-dad0081820141b9823f8a4ad633b28515f055f1f.tar.gz |
Include case conversion data in Scintilla so that all platforms will perform
case conversion of Unicode text in accordance with Unicode.
Diffstat (limited to 'src')
-rw-r--r-- | src/CaseConvert.cxx | 620 | ||||
-rw-r--r-- | src/CaseConvert.h | 47 | ||||
-rw-r--r-- | src/CaseFolder.cxx | 68 | ||||
-rw-r--r-- | src/CaseFolder.h | 45 | ||||
-rw-r--r-- | src/Document.cxx | 42 | ||||
-rw-r--r-- | src/Document.h | 18 | ||||
-rw-r--r-- | src/Editor.cxx | 1 | ||||
-rw-r--r-- | src/PositionCache.cxx | 1 | ||||
-rw-r--r-- | src/ScintillaBase.cxx | 1 | ||||
-rw-r--r-- | src/UnicodeFromUTF8.h | 19 |
10 files changed, 803 insertions, 59 deletions
diff --git a/src/CaseConvert.cxx b/src/CaseConvert.cxx new file mode 100644 index 000000000..d9ecc3b68 --- /dev/null +++ b/src/CaseConvert.cxx @@ -0,0 +1,620 @@ +// Scintilla source code edit control +// Encoding: UTF-8 +/** @file CaseConvert.cxx + ** Case fold characters and convert them to upper or lower case. + ** Tables automatically regenerated by scripts/GenerateCharacterCategory.py + ** Should only be rarely regenerated for new versions of Unicode. + **/ +// Copyright 2013 by Neil Hodgson <neilh@scintilla.org> +// The License.txt file describes the conditions under which this software may be distributed. + +#include <cstring> + +#include <vector> +#include <algorithm> + +#include "CaseConvert.h" +#include "UniConversion.h" +#include "UnicodeFromUTF8.h" + +#ifdef SCI_NAMESPACE +using namespace Scintilla; +#endif + +namespace { + // Use an unnamed namespace to protect the declarations from name conflicts + +// Unicode code points are ordered by groups and follow patterns. +// Most characters (pitch==1) are in ranges for a particular alphabet and their +// upper case forms are a fixed distance away. +// Another pattern (pitch==2) is where each lower case letter is preceded by +// the upper case form. These are also grouped into ranges. 
+ +int symmetricCaseConversionRanges[] = { +//lower, upper, range length, range pitch +//++Autogenerated -- start of section automatically generated +//**\(\*\n\) +97,65,26,1, +224,192,23,1, +248,216,7,1, +257,256,24,2, +314,313,8,2, +331,330,23,2, +462,461,8,2, +479,478,9,2, +505,504,20,2, +547,546,9,2, +583,582,5,2, +945,913,17,1, +963,931,9,1, +985,984,12,2, +1072,1040,32,1, +1104,1024,16,1, +1121,1120,17,2, +1163,1162,27,2, +1218,1217,7,2, +1233,1232,44,2, +1377,1329,38,1, +7681,7680,75,2, +7841,7840,48,2, +7936,7944,8,1, +7952,7960,6,1, +7968,7976,8,1, +7984,7992,8,1, +8000,8008,6,1, +8032,8040,8,1, +8560,8544,16,1, +9424,9398,26,1, +11312,11264,47,1, +11393,11392,50,2, +11520,4256,38,1, +42561,42560,23,2, +42625,42624,12,2, +42787,42786,7,2, +42803,42802,31,2, +42879,42878,5,2, +42913,42912,5,2, +65345,65313,26,1, +66600,66560,40,1, + +//--Autogenerated -- end of section automatically generated +}; + +// Code points that are symmetric but don't fit into a range of similar characters +// are listed here. 
+ +int symmetricCaseConversions[] = { +//lower, upper +//++Autogenerated -- start of section automatically generated +//**1 \(\*\n\) +255,376, +307,306, +309,308, +311,310, +378,377, +380,379, +382,381, +384,579, +387,386, +389,388, +392,391, +396,395, +402,401, +405,502, +409,408, +410,573, +414,544, +417,416, +419,418, +421,420, +424,423, +429,428, +432,431, +436,435, +438,437, +441,440, +445,444, +447,503, +454,452, +457,455, +460,458, +477,398, +499,497, +501,500, +572,571, +575,11390, +576,11391, +578,577, +592,11375, +593,11373, +594,11376, +595,385, +596,390, +598,393, +599,394, +601,399, +603,400, +608,403, +611,404, +613,42893, +614,42922, +616,407, +617,406, +619,11362, +623,412, +625,11374, +626,413, +629,415, +637,11364, +640,422, +643,425, +648,430, +649,580, +650,433, +651,434, +652,581, +658,439, +881,880, +883,882, +887,886, +891,1021, +892,1022, +893,1023, +940,902, +941,904, +942,905, +943,906, +972,908, +973,910, +974,911, +983,975, +1010,1017, +1016,1015, +1019,1018, +1231,1216, +7545,42877, +7549,11363, +8017,8025, +8019,8027, +8021,8029, +8023,8031, +8048,8122, +8049,8123, +8050,8136, +8051,8137, +8052,8138, +8053,8139, +8054,8154, +8055,8155, +8056,8184, +8057,8185, +8058,8170, +8059,8171, +8060,8186, +8061,8187, +8112,8120, +8113,8121, +8144,8152, +8145,8153, +8160,8168, +8161,8169, +8165,8172, +8526,8498, +8580,8579, +11361,11360, +11365,570, +11366,574, +11368,11367, +11370,11369, +11372,11371, +11379,11378, +11382,11381, +11500,11499, +11502,11501, +11507,11506, +11559,4295, +11565,4301, +42874,42873, +42876,42875, +42892,42891, +42897,42896, +42899,42898, + +//--Autogenerated -- end of section automatically generated +}; + +// Characters that have complex case conversions are listed here. +// This includes cases where more than one character is needed for a conversion, +// folding is different to lowering, or (as appropriate) upper(lower(x)) != x or +// lower(upper(x)) != x. 
+ +const char *complexCaseConversions = +// Original | Folded | Upper | Lower | +//++Autogenerated -- start of section automatically generated +//**2 \(\*\n\) +"µ|μ|Μ||" +"ß|ss|SS||" +"İ|i̇||i̇|" +"ı||I||" +"ʼn|ʼn|ʼN||" +"ſ|s|S||" +"Dž|dž|DŽ|dž|" +"Lj|lj|LJ|lj|" +"Nj|nj|NJ|nj|" +"ǰ|ǰ|J̌||" +"Dz|dz|DZ|dz|" +"ͅ|ι|Ι||" +"ΐ|ΐ|Ϊ́||" +"ΰ|ΰ|Ϋ́||" +"ς|σ|Σ||" +"ϐ|β|Β||" +"ϑ|θ|Θ||" +"ϕ|φ|Φ||" +"ϖ|π|Π||" +"ϰ|κ|Κ||" +"ϱ|ρ|Ρ||" +"ϴ|θ||θ|" +"ϵ|ε|Ε||" +"և|եւ|ԵՒ||" +"ẖ|ẖ|H̱||" +"ẗ|ẗ|T̈||" +"ẘ|ẘ|W̊||" +"ẙ|ẙ|Y̊||" +"ẚ|aʾ|Aʾ||" +"ẛ|ṡ|Ṡ||" +"ẞ|ss||ß|" +"ὐ|ὐ|Υ̓||" +"ὒ|ὒ|Υ̓̀||" +"ὔ|ὔ|Υ̓́||" +"ὖ|ὖ|Υ̓͂||" +"ᾀ|ἀι|ἈΙ||" +"ᾁ|ἁι|ἉΙ||" +"ᾂ|ἂι|ἊΙ||" +"ᾃ|ἃι|ἋΙ||" +"ᾄ|ἄι|ἌΙ||" +"ᾅ|ἅι|ἍΙ||" +"ᾆ|ἆι|ἎΙ||" +"ᾇ|ἇι|ἏΙ||" +"ᾈ|ἀι|ἈΙ|ᾀ|" +"ᾉ|ἁι|ἉΙ|ᾁ|" +"ᾊ|ἂι|ἊΙ|ᾂ|" +"ᾋ|ἃι|ἋΙ|ᾃ|" +"ᾌ|ἄι|ἌΙ|ᾄ|" +"ᾍ|ἅι|ἍΙ|ᾅ|" +"ᾎ|ἆι|ἎΙ|ᾆ|" +"ᾏ|ἇι|ἏΙ|ᾇ|" +"ᾐ|ἠι|ἨΙ||" +"ᾑ|ἡι|ἩΙ||" +"ᾒ|ἢι|ἪΙ||" +"ᾓ|ἣι|ἫΙ||" +"ᾔ|ἤι|ἬΙ||" +"ᾕ|ἥι|ἭΙ||" +"ᾖ|ἦι|ἮΙ||" +"ᾗ|ἧι|ἯΙ||" +"ᾘ|ἠι|ἨΙ|ᾐ|" +"ᾙ|ἡι|ἩΙ|ᾑ|" +"ᾚ|ἢι|ἪΙ|ᾒ|" +"ᾛ|ἣι|ἫΙ|ᾓ|" +"ᾜ|ἤι|ἬΙ|ᾔ|" +"ᾝ|ἥι|ἭΙ|ᾕ|" +"ᾞ|ἦι|ἮΙ|ᾖ|" +"ᾟ|ἧι|ἯΙ|ᾗ|" +"ᾠ|ὠι|ὨΙ||" +"ᾡ|ὡι|ὩΙ||" +"ᾢ|ὢι|ὪΙ||" +"ᾣ|ὣι|ὫΙ||" +"ᾤ|ὤι|ὬΙ||" +"ᾥ|ὥι|ὭΙ||" +"ᾦ|ὦι|ὮΙ||" +"ᾧ|ὧι|ὯΙ||" +"ᾨ|ὠι|ὨΙ|ᾠ|" +"ᾩ|ὡι|ὩΙ|ᾡ|" +"ᾪ|ὢι|ὪΙ|ᾢ|" +"ᾫ|ὣι|ὫΙ|ᾣ|" +"ᾬ|ὤι|ὬΙ|ᾤ|" +"ᾭ|ὥι|ὭΙ|ᾥ|" +"ᾮ|ὦι|ὮΙ|ᾦ|" +"ᾯ|ὧι|ὯΙ|ᾧ|" +"ᾲ|ὰι|ᾺΙ||" +"ᾳ|αι|ΑΙ||" +"ᾴ|άι|ΆΙ||" +"ᾶ|ᾶ|Α͂||" +"ᾷ|ᾶι|Α͂Ι||" +"ᾼ|αι|ΑΙ|ᾳ|" +"ι|ι|Ι||" +"ῂ|ὴι|ῊΙ||" +"ῃ|ηι|ΗΙ||" +"ῄ|ήι|ΉΙ||" +"ῆ|ῆ|Η͂||" +"ῇ|ῆι|Η͂Ι||" +"ῌ|ηι|ΗΙ|ῃ|" +"ῒ|ῒ|Ϊ̀||" +"ΐ|ΐ|Ϊ́||" +"ῖ|ῖ|Ι͂||" +"ῗ|ῗ|Ϊ͂||" +"ῢ|ῢ|Ϋ̀||" +"ΰ|ΰ|Ϋ́||" +"ῤ|ῤ|Ρ̓||" +"ῦ|ῦ|Υ͂||" +"ῧ|ῧ|Ϋ͂||" +"ῲ|ὼι|ῺΙ||" +"ῳ|ωι|ΩΙ||" +"ῴ|ώι|ΏΙ||" +"ῶ|ῶ|Ω͂||" +"ῷ|ῶι|Ω͂Ι||" +"ῼ|ωι|ΩΙ|ῳ|" +"Ω|ω||ω|" +"K|k||k|" +"Å|å||å|" +"ff|ff|FF||" +"fi|fi|FI||" +"fl|fl|FL||" +"ffi|ffi|FFI||" +"ffl|ffl|FFL||" +"ſt|st|ST||" +"st|st|ST||" +"ﬓ|մն|ՄՆ||" +"ﬔ|մե|ՄԵ||" +"ﬕ|մի|ՄԻ||" +"ﬖ|վն|ՎՆ||" +"ﬗ|մխ|ՄԽ||" + +//--Autogenerated -- end of section automatically generated +; + +class CaseConverter : public ICaseConverter { + // Maximum length of a case conversion result is 6 bytes in UTF-8 + enum { 
maxConversionLength=6 }; + struct ConversionString { + char conversion[maxConversionLength+1]; + }; + // Conversions are initially stored in a vector of structs but then decomposed into + // parallel arrays as that is about 10% faster to search. + struct CharacterConversion { + int character; + ConversionString conversion; + CharacterConversion(int character_=0, const char *conversion_="") : character(character_) { + strcpy(conversion.conversion, conversion_); + } + bool operator<(const CharacterConversion &other) const { + return character < other.character; + } + }; + typedef std::vector<CharacterConversion> CharacterToConversion; + CharacterToConversion characterToConversion; + // The parallel arrays + std::vector<int> characters; + std::vector<ConversionString> conversions; + +public: + CaseConverter() { + } + bool Initialised() const { + return characters.size() > 0; + } + void Add(int character, const char *conversion) { + characterToConversion.push_back(CharacterConversion(character, conversion)); + } + const char *Find(int character) { + const std::vector<int>::iterator it = std::lower_bound(characters.begin(), characters.end(), character); + if (it != characters.end() && *it == character) // lower_bound yields end() when character is above every key + return conversions[it - characters.begin()].conversion; + else + return 0; + } + size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) { + size_t lenConverted = 0; + size_t mixedPos = 0; + unsigned char bytes[UTF8MaxBytes + 1]; + while (mixedPos < lenMixed) { + const unsigned char leadByte = static_cast<unsigned char>(mixed[mixedPos]); + const char *caseConverted = 0; + size_t lenMixedChar = 1; + if (UTF8IsAscii(leadByte)) { + caseConverted = Find(leadByte); + } else { + bytes[0] = leadByte; + const int widthCharBytes = UTF8BytesOfLead[leadByte]; + for (int b=1; b<widthCharBytes; b++) { + bytes[b] = (mixedPos+b < lenMixed) ? 
mixed[mixedPos+b] : 0; + } + int classified = UTF8Classify(bytes, widthCharBytes); + if (!(classified & UTF8MaskInvalid)) { + // valid UTF-8 + lenMixedChar = classified & UTF8MaskWidth; + int character = UnicodeFromUTF8(bytes); + caseConverted = Find(character); + } + } + if (caseConverted) { + // Character has a conversion so copy that conversion in + while (*caseConverted) { + converted[lenConverted++] = *caseConverted++; + if (lenConverted >= sizeConverted) + return 0; + } + } else { + // Character has no conversion so copy the input to output + for (size_t i=0; i<lenMixedChar; i++) { + converted[lenConverted++] = mixed[mixedPos+i]; + if (lenConverted >= sizeConverted) + return 0; + } + } + mixedPos += lenMixedChar; + } + return lenConverted; + } + void FinishedAdding() { + std::sort(characterToConversion.begin(), characterToConversion.end()); + characters.reserve(characterToConversion.size()); + conversions.reserve(characterToConversion.size()); + for (CharacterToConversion::iterator it = characterToConversion.begin(); it != characterToConversion.end(); ++it) { + characters.push_back(it->character); + conversions.push_back(it->conversion); + } + // Empty the original calculated data completely + CharacterToConversion().swap(characterToConversion); + } +}; + +CaseConverter caseConvFold; +CaseConverter caseConvUp; +CaseConverter caseConvLow; + +void UTF8FromUTF32Character(int uch, char *putf) { + size_t k = 0; + if (uch < 0x80) { + putf[k++] = static_cast<char>(uch); + } else if (uch < 0x800) { + putf[k++] = static_cast<char>(0xC0 | (uch >> 6)); + putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); + } else if (uch < 0x10000) { + putf[k++] = static_cast<char>(0xE0 | (uch >> 12)); + putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f)); + putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); + } else { + putf[k++] = static_cast<char>(0xF0 | (uch >> 18)); + putf[k++] = static_cast<char>(0x80 | ((uch >> 12) & 0x3f)); + putf[k++] = static_cast<char>(0x80 | ((uch >> 
6) & 0x3f)); + putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); + } + putf[k] = 0; +} + +void AddSymmetric(enum CaseConversion conversion, int lower,int upper) { + char lowerUTF8[UTF8MaxBytes+1]; + UTF8FromUTF32Character(lower, lowerUTF8); + char upperUTF8[UTF8MaxBytes+1]; + UTF8FromUTF32Character(upper, upperUTF8); + + switch (conversion) { + case CaseConversionFold: + caseConvFold.Add(upper, lowerUTF8); + break; + case CaseConversionUpper: + caseConvUp.Add(lower, upperUTF8); + break; + case CaseConversionLower: + caseConvLow.Add(upper, lowerUTF8); + break; + } +} + +void SetupConversions(enum CaseConversion conversion) { + // First initialize for the symmetric ranges + for (size_t i=0; i<sizeof(symmetricCaseConversionRanges)/sizeof(symmetricCaseConversionRanges[0]);) { + int lower = symmetricCaseConversionRanges[i++]; + int upper = symmetricCaseConversionRanges[i++]; + int length = symmetricCaseConversionRanges[i++]; + int pitch = symmetricCaseConversionRanges[i++]; + for (int j=0;j<length*pitch;j+=pitch) { + AddSymmetric(conversion, lower+j, upper+j); + } + } + // Add the symmetric singletons + for (size_t i=0; i<sizeof(symmetricCaseConversions)/sizeof(symmetricCaseConversions[0]);) { + int lower = symmetricCaseConversions[i++]; + int upper = symmetricCaseConversions[i++]; + AddSymmetric(conversion, lower, upper); + } + // Add the complex cases + const char *sComplex = complexCaseConversions; + while (*sComplex) { + // Longest ligature is 3 characters so allow 5 for safety + const size_t lenUTF8 = 5*UTF8MaxBytes+1; + char originUTF8[lenUTF8]; + char foldedUTF8[lenUTF8]; + char lowerUTF8[lenUTF8]; + char upperUTF8[lenUTF8]; + size_t i = 0; + while (*sComplex && *sComplex != '|') { + originUTF8[i++] = *sComplex; + sComplex++; + } + sComplex++; + originUTF8[i] = 0; + i = 0; + while (*sComplex && *sComplex != '|') { + foldedUTF8[i++] = *sComplex; + sComplex++; + } + sComplex++; + foldedUTF8[i] = 0; + i = 0; + while (*sComplex && *sComplex != '|') { + upperUTF8[i++] = 
*sComplex; + sComplex++; + } + sComplex++; + upperUTF8[i] = 0; + i = 0; + while (*sComplex && *sComplex != '|') { + lowerUTF8[i++] = *sComplex; + sComplex++; + } + sComplex++; + lowerUTF8[i] = 0; + + int character = UnicodeFromUTF8(reinterpret_cast<unsigned char *>(originUTF8)); + + if (conversion == CaseConversionFold && foldedUTF8[0]) { + caseConvFold.Add(character, foldedUTF8); + } + + if (conversion == CaseConversionUpper && upperUTF8[0]) { + caseConvUp.Add(character, upperUTF8); + } + + if (conversion == CaseConversionLower && lowerUTF8[0]) { + caseConvLow.Add(character, lowerUTF8); + } + } + + switch (conversion) { + case CaseConversionFold: + caseConvFold.FinishedAdding(); + break; + case CaseConversionUpper: + caseConvUp.FinishedAdding(); + break; + case CaseConversionLower: + caseConvLow.FinishedAdding(); + break; + } +} + +CaseConverter *ConverterForConversion(enum CaseConversion conversion) { + switch (conversion) { + case CaseConversionFold: + return &caseConvFold; + case CaseConversionUpper: + return &caseConvUp; + case CaseConversionLower: + return &caseConvLow; + } + return 0; +} + +} + +ICaseConverter *ConverterFor(enum CaseConversion conversion) { + CaseConverter *pCaseConv = ConverterForConversion(conversion); + if (!pCaseConv->Initialised()) + SetupConversions(conversion); + return pCaseConv; +} + +const char *CaseConvert(int character, enum CaseConversion conversion) { + CaseConverter *pCaseConv = ConverterForConversion(conversion); + if (!pCaseConv->Initialised()) + SetupConversions(conversion); + return pCaseConv->Find(character); +} + +size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion) { + CaseConverter *pCaseConv = ConverterForConversion(conversion); + if (!pCaseConv->Initialised()) + SetupConversions(conversion); + return pCaseConv->CaseConvertString(converted, sizeConverted, mixed, lenMixed); +} diff --git a/src/CaseConvert.h b/src/CaseConvert.h new file mode 
100644 index 000000000..60de22799 --- /dev/null +++ b/src/CaseConvert.h @@ -0,0 +1,47 @@ +// Scintilla source code edit control +// Encoding: UTF-8 +/** @file CaseConvert.h + ** Performs Unicode case conversions. + ** Does not handle locale-sensitive case conversion. + **/ +// Copyright 2013 by Neil Hodgson <neilh@scintilla.org> +// The License.txt file describes the conditions under which this software may be distributed. + +#ifndef CASECONVERT_H +#define CASECONVERT_H + +#ifdef SCI_NAMESPACE +namespace Scintilla { +#endif + +enum CaseConversion { + CaseConversionFold, + CaseConversionUpper, + CaseConversionLower +}; + +class ICaseConverter { +public: + virtual size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) = 0; +}; + +ICaseConverter *ConverterFor(enum CaseConversion conversion); + +// Returns a UTF-8 string. Empty when no conversion +const char *CaseConvert(int character, enum CaseConversion conversion); + +// When performing CaseConvertString, the converted value may be up to 3 times longer than the input. +// Ligatures are often decomposed into multiple characters and long cases include: +// ΐ "\xce\x90" folds to ΐ "\xce\xb9\xcc\x88\xcc\x81" +const int maxExpansionCaseConversion=3; + +// Converts a mixed case string using a particular conversion. +// Result may be a different length to input and the length is the return value. +// If there is not enough space then 0 is returned. +size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion); + +#ifdef SCI_NAMESPACE +} +#endif + +#endif diff --git a/src/CaseFolder.cxx b/src/CaseFolder.cxx new file mode 100644 index 000000000..44a94da6f --- /dev/null +++ b/src/CaseFolder.cxx @@ -0,0 +1,68 @@ +// Scintilla source code edit control +/** @file CaseFolder.cxx + ** Classes for case folding. 
+ **/ +// Copyright 1998-2013 by Neil Hodgson <neilh@scintilla.org> +// The License.txt file describes the conditions under which this software may be distributed. + +#include <vector> +#include <algorithm> + +#include "CaseConvert.h" +#include "UniConversion.h" +#include "CaseFolder.h" + +#ifdef SCI_NAMESPACE +using namespace Scintilla; +#endif + +CaseFolder::~CaseFolder() { +} + +CaseFolderTable::CaseFolderTable() { + for (size_t iChar=0; iChar<sizeof(mapping); iChar++) { + mapping[iChar] = static_cast<char>(iChar); + } +} + +CaseFolderTable::~CaseFolderTable() { +} + +size_t CaseFolderTable::Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) { + if (lenMixed > sizeFolded) { + return 0; + } else { + for (size_t i=0; i<lenMixed; i++) { + folded[i] = mapping[static_cast<unsigned char>(mixed[i])]; + } + return lenMixed; + } +} + +void CaseFolderTable::SetTranslation(char ch, char chTranslation) { + mapping[static_cast<unsigned char>(ch)] = chTranslation; +} + +void CaseFolderTable::StandardASCII() { + for (size_t iChar=0; iChar<sizeof(mapping); iChar++) { + if (iChar >= 'A' && iChar <= 'Z') { + mapping[iChar] = static_cast<char>(iChar - 'A' + 'a'); + } else { + mapping[iChar] = static_cast<char>(iChar); + } + } +} + +CaseFolderUnicode::CaseFolderUnicode() { + StandardASCII(); + converter = ConverterFor(CaseConversionFold); +} + +size_t CaseFolderUnicode::Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) { + if ((lenMixed == 1) && (sizeFolded > 0)) { + folded[0] = mapping[static_cast<unsigned char>(mixed[0])]; + return 1; + } else { + return converter->CaseConvertString(folded, sizeFolded, mixed, lenMixed); + } +} diff --git a/src/CaseFolder.h b/src/CaseFolder.h new file mode 100644 index 000000000..2d754d4f3 --- /dev/null +++ b/src/CaseFolder.h @@ -0,0 +1,45 @@ +// Scintilla source code edit control +/** @file CaseFolder.h + ** Classes for case folding. 
+ **/ +// Copyright 1998-2013 by Neil Hodgson <neilh@scintilla.org> +// The License.txt file describes the conditions under which this software may be distributed. + +#ifndef CASEFOLDER_H +#define CASEFOLDER_H + +#ifdef SCI_NAMESPACE +namespace Scintilla { +#endif + +class CaseFolder { +public: + virtual ~CaseFolder(); + virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) = 0; +}; + +class CaseFolderTable : public CaseFolder { +protected: + char mapping[256]; +public: + CaseFolderTable(); + virtual ~CaseFolderTable(); + virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed); + void SetTranslation(char ch, char chTranslation); + void StandardASCII(); +}; + +class ICaseConverter; + +class CaseFolderUnicode : public CaseFolderTable { + ICaseConverter *converter; +public: + CaseFolderUnicode(); + virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed); +}; + +#ifdef SCI_NAMESPACE +} +#endif + +#endif diff --git a/src/Document.cxx b/src/Document.cxx index a00fc9fc2..0637c8d50 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -28,6 +28,7 @@ #include "CharClassify.h" #include "CharacterSet.h" #include "Decoration.h" +#include "CaseFolder.h" #include "Document.h" #include "RESearch.h" #include "UniConversion.h" @@ -1496,47 +1497,6 @@ bool Document::IsWordAt(int start, int end) const { return IsWordStartAt(start) && IsWordEndAt(end); } -static inline char MakeLowerCase(char ch) { - if (ch < 'A' || ch > 'Z') - return ch; - else - return static_cast<char>(ch - 'A' + 'a'); -} - -CaseFolderTable::CaseFolderTable() { - for (size_t iChar=0; iChar<sizeof(mapping); iChar++) { - mapping[iChar] = static_cast<char>(iChar); - } -} - -CaseFolderTable::~CaseFolderTable() { -} - -size_t CaseFolderTable::Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) { - if (lenMixed > sizeFolded) { - return 0; - } else { - for (size_t i=0; i<lenMixed; i++) { - folded[i] 
= mapping[static_cast<unsigned char>(mixed[i])]; - } - return lenMixed; - } -} - -void CaseFolderTable::SetTranslation(char ch, char chTranslation) { - mapping[static_cast<unsigned char>(ch)] = chTranslation; -} - -void CaseFolderTable::StandardASCII() { - for (size_t iChar=0; iChar<sizeof(mapping); iChar++) { - if (iChar >= 'A' && iChar <= 'Z') { - mapping[iChar] = static_cast<char>(iChar - 'A' + 'a'); - } else { - mapping[iChar] = static_cast<char>(iChar); - } - } -} - bool Document::MatchesWordOptions(bool word, bool wordStart, int pos, int length) const { return (!word && !wordStart) || (word && IsWordAt(pos, pos + length)) || diff --git a/src/Document.h b/src/Document.h index 5c7e8f8a0..5147875b1 100644 --- a/src/Document.h +++ b/src/Document.h @@ -155,24 +155,6 @@ public: bool isEnabled; }; -class CaseFolder { -public: - virtual ~CaseFolder() { - } - virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) = 0; -}; - -class CaseFolderTable : public CaseFolder { -protected: - char mapping[256]; -public: - CaseFolderTable(); - virtual ~CaseFolderTable(); - virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed); - void SetTranslation(char ch, char chTranslation); - void StandardASCII(); -}; - class Document; class LexInterface { diff --git a/src/Editor.cxx b/src/Editor.cxx index 16e3e8b56..acb840fdf 100644 --- a/src/Editor.cxx +++ b/src/Editor.cxx @@ -36,6 +36,7 @@ #include "ViewStyle.h" #include "CharClassify.h" #include "Decoration.h" +#include "CaseFolder.h" #include "Document.h" #include "UniConversion.h" #include "Selection.h" diff --git a/src/PositionCache.cxx b/src/PositionCache.cxx index 31eac8592..742a226b9 100644 --- a/src/PositionCache.cxx +++ b/src/PositionCache.cxx @@ -32,6 +32,7 @@ #include "CharClassify.h" #include "Decoration.h" #include "ILexer.h" +#include "CaseFolder.h" #include "Document.h" #include "Selection.h" #include "PositionCache.h" diff --git a/src/ScintillaBase.cxx 
b/src/ScintillaBase.cxx index 5d886f5a5..05768799d 100644 --- a/src/ScintillaBase.cxx +++ b/src/ScintillaBase.cxx @@ -42,6 +42,7 @@ #include "AutoComplete.h" #include "CharClassify.h" #include "Decoration.h" +#include "CaseFolder.h" #include "Document.h" #include "Selection.h" #include "PositionCache.h" diff --git a/src/UnicodeFromUTF8.h b/src/UnicodeFromUTF8.h new file mode 100644 index 000000000..24517e8a2 --- /dev/null +++ b/src/UnicodeFromUTF8.h @@ -0,0 +1,19 @@ +// Scintilla source code edit control +/** @file UnicodeFromUTF8.h + ** Decode the UTF-8 byte sequence at a pointer into a single Unicode code point. + **/ +// Copyright 2013 by Neil Hodgson <neilh@scintilla.org> +// This file is in the public domain. + +inline int UnicodeFromUTF8(const unsigned char *us) { + if (us[0] < 0xC2) { + return us[0]; + } else if (us[0] < 0xE0) { + return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F); + } else if (us[0] < 0xF0) { + return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F); + } else if (us[0] < 0xF5) { + return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F); + } + return us[0]; +} |