diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/CaseConvert.cxx | 620 | ||||
| -rw-r--r-- | src/CaseConvert.h | 47 | ||||
| -rw-r--r-- | src/CaseFolder.cxx | 68 | ||||
| -rw-r--r-- | src/CaseFolder.h | 45 | ||||
| -rw-r--r-- | src/Document.cxx | 42 | ||||
| -rw-r--r-- | src/Document.h | 18 | ||||
| -rw-r--r-- | src/Editor.cxx | 1 | ||||
| -rw-r--r-- | src/PositionCache.cxx | 1 | ||||
| -rw-r--r-- | src/ScintillaBase.cxx | 1 | ||||
| -rw-r--r-- | src/UnicodeFromUTF8.h | 19 | 
10 files changed, 803 insertions, 59 deletions
| diff --git a/src/CaseConvert.cxx b/src/CaseConvert.cxx new file mode 100644 index 000000000..d9ecc3b68 --- /dev/null +++ b/src/CaseConvert.cxx @@ -0,0 +1,620 @@ +// Scintilla source code edit control +// Encoding: UTF-8 +/** @file CaseConvert.cxx + ** Case fold characters and convert them to upper or lower case. + ** Tables automatically regenerated by scripts/GenerateCharacterCategory.py + ** Should only be rarely regenerated for new versions of Unicode. + **/ +// Copyright 2013 by Neil Hodgson <neilh@scintilla.org> +// The License.txt file describes the conditions under which this software may be distributed. + +#include <cstring> + +#include <vector> +#include <algorithm> + +#include "CaseConvert.h" +#include "UniConversion.h" +#include "UnicodeFromUTF8.h" + +#ifdef SCI_NAMESPACE +using namespace Scintilla; +#endif + +namespace { +	// Use an unnamed namespace to protect the declarations from name conflicts + +// Unicode code points are ordered by groups and follow patterns. +// Most characters (pitch==1) are in ranges for a particular alphabet and their +// upper case forms are a fixed distance away. +// Another pattern (pitch==2) is where each lower case letter is preceded by +// the upper case form. These are also grouped into ranges. 
+
+// Table layout: consecutive groups of four ints. SetupConversions() reads each
+// group as (lower, upper, length, pitch) and registers `length` case pairs
+// (lower+j, upper+j) for j = 0, pitch, 2*pitch, ...
+// The table is only ever read, so it is declared const.
+const int symmetricCaseConversionRanges[] = {
+//lower, upper, range length, range pitch
+//++Autogenerated -- start of section automatically generated
+//**\(\*\n\)
+97,65,26,1, 
+224,192,23,1, 
+248,216,7,1, 
+257,256,24,2, 
+314,313,8,2, 
+331,330,23,2, 
+462,461,8,2, 
+479,478,9,2, 
+505,504,20,2, 
+547,546,9,2, 
+583,582,5,2, 
+945,913,17,1, 
+963,931,9,1, 
+985,984,12,2, 
+1072,1040,32,1, 
+1104,1024,16,1, 
+1121,1120,17,2, 
+1163,1162,27,2, 
+1218,1217,7,2, 
+1233,1232,44,2, 
+1377,1329,38,1, 
+7681,7680,75,2, 
+7841,7840,48,2, 
+7936,7944,8,1, 
+7952,7960,6,1, 
+7968,7976,8,1, 
+7984,7992,8,1, 
+8000,8008,6,1, 
+8032,8040,8,1, 
+8560,8544,16,1, 
+9424,9398,26,1, 
+11312,11264,47,1, 
+11393,11392,50,2, 
+11520,4256,38,1, 
+42561,42560,23,2, 
+42625,42624,12,2, 
+42787,42786,7,2, 
+42803,42802,31,2, 
+42879,42878,5,2, 
+42913,42912,5,2, 
+65345,65313,26,1, 
+66600,66560,40,1, 
+
+//--Autogenerated -- end of section automatically generated
+};
+
+// Code points that are symmetric but don't fit into a range of similar characters
+// are listed here.
+
+// Table layout: consecutive (lower, upper) code point pairs, read two ints at a
+// time by SetupConversions(). The table is only ever read, so it is declared const.
+const int symmetricCaseConversions[] = {
+//lower, upper
+//++Autogenerated -- start of section automatically generated
+//**1 \(\*\n\)
+255,376, 
+307,306, 
+309,308, 
+311,310, 
+378,377, 
+380,379, 
+382,381, 
+384,579, 
+387,386, 
+389,388, 
+392,391, 
+396,395, 
+402,401, 
+405,502, 
+409,408, 
+410,573, 
+414,544, 
+417,416, 
+419,418, 
+421,420, 
+424,423, 
+429,428, 
+432,431, 
+436,435, 
+438,437, 
+441,440, 
+445,444, 
+447,503, 
+454,452, 
+457,455, 
+460,458, 
+477,398, 
+499,497, 
+501,500, 
+572,571, 
+575,11390, 
+576,11391, 
+578,577, 
+592,11375, 
+593,11373, 
+594,11376, 
+595,385, 
+596,390, 
+598,393, 
+599,394, 
+601,399, 
+603,400, 
+608,403, 
+611,404, 
+613,42893, 
+614,42922, 
+616,407, 
+617,406, 
+619,11362, 
+623,412, 
+625,11374, 
+626,413, 
+629,415, 
+637,11364, 
+640,422, 
+643,425, 
+648,430, 
+649,580, 
+650,433, 
+651,434, 
+652,581, 
+658,439, 
+881,880, 
+883,882, 
+887,886, 
+891,1021, 
+892,1022, 
+893,1023, 
+940,902, 
+941,904, 
+942,905, 
+943,906, 
+972,908, 
+973,910, 
+974,911, 
+983,975, 
+1010,1017, 
+1016,1015, 
+1019,1018, 
+1231,1216, 
+7545,42877, 
+7549,11363, 
+8017,8025, 
+8019,8027, 
+8021,8029, 
+8023,8031, 
+8048,8122, 
+8049,8123, 
+8050,8136, 
+8051,8137, 
+8052,8138, 
+8053,8139, 
+8054,8154, 
+8055,8155, 
+8056,8184, 
+8057,8185, 
+8058,8170, 
+8059,8171, 
+8060,8186, 
+8061,8187, 
+8112,8120, 
+8113,8121, 
+8144,8152, 
+8145,8153, 
+8160,8168, 
+8161,8169, 
+8165,8172, 
+8526,8498, 
+8580,8579, 
+11361,11360, 
+11365,570, 
+11366,574, 
+11368,11367, 
+11370,11369, 
+11372,11371, 
+11379,11378, 
+11382,11381, 
+11500,11499, 
+11502,11501, 
+11507,11506, 
+11559,4295, 
+11565,4301, 
+42874,42873, 
+42876,42875, 
+42892,42891, 
+42897,42896, 
+42899,42898, 
+
+//--Autogenerated -- end of section automatically generated
+};
+
+// Characters that have complex case conversions are listed here.
+// This includes cases where more than one character is needed for a conversion, +// folding is different to lowering, or (as appropriate) upper(lower(x)) != x or +// lower(upper(x)) != x. + +const char *complexCaseConversions = +// Original | Folded | Upper | Lower | +//++Autogenerated -- start of section automatically generated +//**2 \(\*\n\) +"µ|μ|Μ||" +"ß|ss|SS||" +"İ|i̇||i̇|" +"ı||I||" +"ʼn|ʼn|ʼN||" +"ſ|s|S||" +"Dž|dž|DŽ|dž|" +"Lj|lj|LJ|lj|" +"Nj|nj|NJ|nj|" +"ǰ|ǰ|J̌||" +"Dz|dz|DZ|dz|" +"ͅ|ι|Ι||" +"ΐ|ΐ|Ϊ́||" +"ΰ|ΰ|Ϋ́||" +"ς|σ|Σ||" +"ϐ|β|Β||" +"ϑ|θ|Θ||" +"ϕ|φ|Φ||" +"ϖ|π|Π||" +"ϰ|κ|Κ||" +"ϱ|ρ|Ρ||" +"ϴ|θ||θ|" +"ϵ|ε|Ε||" +"և|եւ|ԵՒ||" +"ẖ|ẖ|H̱||" +"ẗ|ẗ|T̈||" +"ẘ|ẘ|W̊||" +"ẙ|ẙ|Y̊||" +"ẚ|aʾ|Aʾ||" +"ẛ|ṡ|Ṡ||" +"ẞ|ss||ß|" +"ὐ|ὐ|Υ̓||" +"ὒ|ὒ|Υ̓̀||" +"ὔ|ὔ|Υ̓́||" +"ὖ|ὖ|Υ̓͂||" +"ᾀ|ἀι|ἈΙ||" +"ᾁ|ἁι|ἉΙ||" +"ᾂ|ἂι|ἊΙ||" +"ᾃ|ἃι|ἋΙ||" +"ᾄ|ἄι|ἌΙ||" +"ᾅ|ἅι|ἍΙ||" +"ᾆ|ἆι|ἎΙ||" +"ᾇ|ἇι|ἏΙ||" +"ᾈ|ἀι|ἈΙ|ᾀ|" +"ᾉ|ἁι|ἉΙ|ᾁ|" +"ᾊ|ἂι|ἊΙ|ᾂ|" +"ᾋ|ἃι|ἋΙ|ᾃ|" +"ᾌ|ἄι|ἌΙ|ᾄ|" +"ᾍ|ἅι|ἍΙ|ᾅ|" +"ᾎ|ἆι|ἎΙ|ᾆ|" +"ᾏ|ἇι|ἏΙ|ᾇ|" +"ᾐ|ἠι|ἨΙ||" +"ᾑ|ἡι|ἩΙ||" +"ᾒ|ἢι|ἪΙ||" +"ᾓ|ἣι|ἫΙ||" +"ᾔ|ἤι|ἬΙ||" +"ᾕ|ἥι|ἭΙ||" +"ᾖ|ἦι|ἮΙ||" +"ᾗ|ἧι|ἯΙ||" +"ᾘ|ἠι|ἨΙ|ᾐ|" +"ᾙ|ἡι|ἩΙ|ᾑ|" +"ᾚ|ἢι|ἪΙ|ᾒ|" +"ᾛ|ἣι|ἫΙ|ᾓ|" +"ᾜ|ἤι|ἬΙ|ᾔ|" +"ᾝ|ἥι|ἭΙ|ᾕ|" +"ᾞ|ἦι|ἮΙ|ᾖ|" +"ᾟ|ἧι|ἯΙ|ᾗ|" +"ᾠ|ὠι|ὨΙ||" +"ᾡ|ὡι|ὩΙ||" +"ᾢ|ὢι|ὪΙ||" +"ᾣ|ὣι|ὫΙ||" +"ᾤ|ὤι|ὬΙ||" +"ᾥ|ὥι|ὭΙ||" +"ᾦ|ὦι|ὮΙ||" +"ᾧ|ὧι|ὯΙ||" +"ᾨ|ὠι|ὨΙ|ᾠ|" +"ᾩ|ὡι|ὩΙ|ᾡ|" +"ᾪ|ὢι|ὪΙ|ᾢ|" +"ᾫ|ὣι|ὫΙ|ᾣ|" +"ᾬ|ὤι|ὬΙ|ᾤ|" +"ᾭ|ὥι|ὭΙ|ᾥ|" +"ᾮ|ὦι|ὮΙ|ᾦ|" +"ᾯ|ὧι|ὯΙ|ᾧ|" +"ᾲ|ὰι|ᾺΙ||" +"ᾳ|αι|ΑΙ||" +"ᾴ|άι|ΆΙ||" +"ᾶ|ᾶ|Α͂||" +"ᾷ|ᾶι|Α͂Ι||" +"ᾼ|αι|ΑΙ|ᾳ|" +"ι|ι|Ι||" +"ῂ|ὴι|ῊΙ||" +"ῃ|ηι|ΗΙ||" +"ῄ|ήι|ΉΙ||" +"ῆ|ῆ|Η͂||" +"ῇ|ῆι|Η͂Ι||" +"ῌ|ηι|ΗΙ|ῃ|" +"ῒ|ῒ|Ϊ̀||" +"ΐ|ΐ|Ϊ́||" +"ῖ|ῖ|Ι͂||" +"ῗ|ῗ|Ϊ͂||" +"ῢ|ῢ|Ϋ̀||" +"ΰ|ΰ|Ϋ́||" +"ῤ|ῤ|Ρ̓||" +"ῦ|ῦ|Υ͂||" +"ῧ|ῧ|Ϋ͂||" +"ῲ|ὼι|ῺΙ||" +"ῳ|ωι|ΩΙ||" +"ῴ|ώι|ΏΙ||" +"ῶ|ῶ|Ω͂||" +"ῷ|ῶι|Ω͂Ι||" +"ῼ|ωι|ΩΙ|ῳ|" +"Ω|ω||ω|" +"K|k||k|" +"Å|å||å|" +"ff|ff|FF||" +"fi|fi|FI||" +"fl|fl|FL||" +"ffi|ffi|FFI||" +"ffl|ffl|FFL||" +"ſt|st|ST||" +"st|st|ST||" +"ﬓ|մն|ՄՆ||" +"ﬔ|մե|ՄԵ||" +"ﬕ|մի|ՄԻ||" +"ﬖ|վն|ՎՆ||" +"ﬗ|մխ|ՄԽ||" + 
+//--Autogenerated -- end of section automatically generated
+;
+
+// Lookup table from a Unicode code point to the UTF-8 text of its conversion.
+// Entries are collected with Add(); FinishedAdding() then sorts them and builds
+// the parallel arrays that Find() and CaseConvertString() search.
+class CaseConverter : public ICaseConverter {
+	// Maximum length of a case conversion result is 6 bytes in UTF-8
+	enum { maxConversionLength=6 };
+	struct ConversionString {
+		char conversion[maxConversionLength+1];
+	};
+	// Conversions are initially stored in a vector of structs but then decomposed into
+	// parallel arrays as that is about 10% faster to search.
+	struct CharacterConversion {
+		int character;
+		ConversionString conversion;
+		// conversion_ is copied without a length check, so it must not be longer
+		// than maxConversionLength bytes.
+		CharacterConversion(int character_=0, const char *conversion_="") : character(character_) {
+			strcpy(conversion.conversion, conversion_);
+		}
+		bool operator<(const CharacterConversion &other) const {
+			return character < other.character;
+		}
+	};
+	typedef std::vector<CharacterConversion> CharacterToConversion;
+	CharacterToConversion characterToConversion;
+	// The parallel arrays 
+	std::vector<int> characters;
+	std::vector<ConversionString> conversions;
+
+public:
+	CaseConverter() {
+	}
+	// True once FinishedAdding() has produced a non-empty search table.
+	bool Initialised() const {
+		return characters.size() > 0;
+	}
+	// Queue a conversion; it only becomes visible to Find() after FinishedAdding().
+	void Add(int character, const char *conversion) {
+		characterToConversion.push_back(CharacterConversion(character, conversion));
+	}
+	// Return the UTF-8 conversion of character, or 0 when it has none.
+	const char *Find(int character) {
+		const std::vector<int>::iterator it = std::lower_bound(characters.begin(), characters.end(), character);
+		// lower_bound returns end() when every stored character is smaller than the
+		// one searched for (or the table is empty); end() must not be dereferenced.
+		if (it == characters.end())
+			return 0;
+		else if (*it == character)
+			return conversions[it - characters.begin()].conversion;
+		else
+			return 0;
+	}
+	// Convert lenMixed bytes of UTF-8 in mixed, writing the result to converted.
+	// Returns the number of bytes written, or 0 when the result plus one spare byte
+	// does not fit in sizeConverted bytes. No terminating NUL is written.
+	// Bytes that do not form valid UTF-8 are copied through unchanged, one at a time.
+	size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) {
+		size_t lenConverted = 0;
+		size_t mixedPos = 0;
+		unsigned char bytes[UTF8MaxBytes + 1];
+		while (mixedPos < lenMixed) {
+			const unsigned char leadByte = static_cast<unsigned char>(mixed[mixedPos]);
+			const char *caseConverted = 0;
+			size_t lenMixedChar = 1;
+			if (UTF8IsAscii(leadByte)) {
+				caseConverted = Find(leadByte);
+			} else {
+				bytes[0] = leadByte;
+				const int widthCharBytes = UTF8BytesOfLead[leadByte];
+				// Pad with 0 instead of reading past the end of the input
+				for (int b=1; b<widthCharBytes; b++) {
+					bytes[b] = (mixedPos+b < lenMixed) ? mixed[mixedPos+b] : 0;
+				}
+				int classified = UTF8Classify(bytes, widthCharBytes);
+				if (!(classified & UTF8MaskInvalid)) {
+					// valid UTF-8
+					lenMixedChar = classified & UTF8MaskWidth;
+					int character = UnicodeFromUTF8(bytes);
+					caseConverted = Find(character);
+				}
+			}
+			if (caseConverted) {
+				// Character has a conversion so copy that conversion in
+				while (*caseConverted) {
+					// Check before writing so that a full or zero-sized buffer is
+					// never written past its end. Fails at the same point as before.
+					if (lenConverted + 1 >= sizeConverted)
+						return 0;
+					converted[lenConverted++] = *caseConverted++;
+				}
+			} else {
+				// Character has no conversion so copy the input to output
+				for (size_t i=0; i<lenMixedChar; i++) {
+					if (lenConverted + 1 >= sizeConverted)
+						return 0;
+					converted[lenConverted++] = mixed[mixedPos+i];
+				}
+			}
+			mixedPos += lenMixedChar;
+		}
+		return lenConverted;
+	}
+	// Sort the queued conversions by character and move them into the parallel
+	// arrays, releasing the temporary vector.
+	void FinishedAdding() {
+		std::sort(characterToConversion.begin(), characterToConversion.end());
+		characters.reserve(characterToConversion.size());
+		conversions.reserve(characterToConversion.size());
+		for (CharacterToConversion::iterator it = characterToConversion.begin(); it != characterToConversion.end(); ++it) {
+			characters.push_back(it->character);
+			conversions.push_back(it->conversion);
+		}
+		// Empty the original calculated data completely
+		CharacterToConversion().swap(characterToConversion);
+	}
+};
+
+// One converter per CaseConversion value, each filled on first use by SetupConversions
+CaseConverter caseConvFold;
+CaseConverter caseConvUp;
+CaseConverter caseConvLow;
+
+// Encode code point uch as UTF-8 into putf followed by a NUL, so putf needs room
+// for up to 5 bytes. uch is not range checked.
+void UTF8FromUTF32Character(int uch, char *putf) {
+	size_t k = 0;
+	if (uch < 0x80) {
+		putf[k++] = static_cast<char>(uch);
+	} else if (uch < 0x800) {
+		putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
+		putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
+	} else if (uch < 0x10000) {
+		putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
+		putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
+		putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
+	} else {
+		putf[k++] = static_cast<char>(0xF0 | (uch >> 18));
+		putf[k++] = static_cast<char>(0x80 | ((uch >> 12) & 0x3f));
+		putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
+		putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
+	}
+	putf[k] = 0;
+}
+
+// Register one lower/upper pair with the converter selected by conversion:
+// folding and lowering map upper -> lower, uppering maps lower -> upper.
+void AddSymmetric(enum CaseConversion conversion, int lower,int upper) {
+	char lowerUTF8[UTF8MaxBytes+1];
+	UTF8FromUTF32Character(lower, lowerUTF8);
+	char upperUTF8[UTF8MaxBytes+1];
+	UTF8FromUTF32Character(upper, upperUTF8);
+
+	switch (conversion) {
+	case CaseConversionFold:
+		caseConvFold.Add(upper, lowerUTF8);
+		break;
+	case CaseConversionUpper:
+		caseConvUp.Add(lower, upperUTF8);
+		break;
+	case CaseConversionLower:
+		caseConvLow.Add(upper, lowerUTF8);
+		break;
+	}
+}
+
+// Fill the converter for one conversion kind from the three tables above and
+// then finish it so that it can be searched.
+void SetupConversions(enum CaseConversion conversion) {
+	// First initialize for the symmetric ranges
+	for (size_t i=0; i<sizeof(symmetricCaseConversionRanges)/sizeof(symmetricCaseConversionRanges[0]);) {
+		int lower = symmetricCaseConversionRanges[i++];
+		int upper = symmetricCaseConversionRanges[i++];
+		int length = symmetricCaseConversionRanges[i++];
+		int pitch = symmetricCaseConversionRanges[i++];
+		for (int j=0;j<length*pitch;j+=pitch) {
+			AddSymmetric(conversion, lower+j, upper+j);
+		}
+	}
+	// Add the symmetric singletons
+	for (size_t i=0; i<sizeof(symmetricCaseConversions)/sizeof(symmetricCaseConversions[0]);) {
+		int lower = symmetricCaseConversions[i++];
+		int upper = symmetricCaseConversions[i++];
+		AddSymmetric(conversion, lower, upper);
+	}
+	// Add the complex cases
+	// Each record is four '|' terminated UTF-8 fields: origin, folded, upper, lower.
+	// The field copies below are not bounds checked and rely on the table being well formed.
+	const char *sComplex = complexCaseConversions;
+	while (*sComplex) {
+		// Longest ligature is 3 characters so 5 for safety
+		const size_t lenUTF8 = 5*UTF8MaxBytes+1;
+		char originUTF8[lenUTF8];
+		char foldedUTF8[lenUTF8];
+		char lowerUTF8[lenUTF8];
+		char upperUTF8[lenUTF8];
+		size_t i = 0;
+		while (*sComplex && *sComplex != '|') {
+			originUTF8[i++] = *sComplex;
+			sComplex++;
+		}
+		sComplex++;
+		originUTF8[i] = 0;
+		i = 0;
+		while (*sComplex && *sComplex != '|') {
+			foldedUTF8[i++] = *sComplex;
+			sComplex++;
+		}
+		sComplex++;
+		foldedUTF8[i] = 0;
+		i = 0;
+		while (*sComplex && *sComplex != '|') {
+			upperUTF8[i++] = *sComplex;
+			sComplex++;
+		}
+		sComplex++;
+		upperUTF8[i] = 0;
+		i = 0;
+		while (*sComplex && *sComplex != '|') {
+			lowerUTF8[i++] = *sComplex;
+			sComplex++;
+		}
+		sComplex++;
+		lowerUTF8[i] = 0;
+
+		int character = UnicodeFromUTF8(reinterpret_cast<unsigned char *>(originUTF8));
+
+		// An empty field means the character has no conversion of that kind
+		if (conversion == CaseConversionFold && foldedUTF8[0]) {
+			caseConvFold.Add(character, foldedUTF8);
+		}
+
+		if (conversion == CaseConversionUpper && upperUTF8[0]) {
+			caseConvUp.Add(character, upperUTF8);
+		}
+
+		if (conversion == CaseConversionLower && lowerUTF8[0]) {
+			caseConvLow.Add(character, lowerUTF8);
+		}
+	}
+
+	switch (conversion) {
+	case CaseConversionFold:
+		caseConvFold.FinishedAdding();
+		break;
+	case CaseConversionUpper:
+		caseConvUp.FinishedAdding();
+		break;
+	case CaseConversionLower:
+		caseConvLow.FinishedAdding();
+		break;
+	}
+}
+
+// Return the converter object for conversion, or 0 for a value outside the enumeration.
+CaseConverter *ConverterForConversion(enum CaseConversion conversion) {
+	switch (conversion) {
+	case CaseConversionFold:
+		return &caseConvFold;
+	case CaseConversionUpper:
+		return &caseConvUp;
+	case CaseConversionLower:
+		return &caseConvLow;
+	}
+	return 0;
+}
+
+}
+
+// Return the converter for conversion, building its tables on first use.
+ICaseConverter *ConverterFor(enum CaseConversion conversion) {
+	CaseConverter *pCaseConv = ConverterForConversion(conversion);
+	if (!pCaseConv->Initialised())
+		SetupConversions(conversion);
+	return pCaseConv;
+}
+
+// Return the UTF-8 conversion of a single character, or 0 when it has none.
+const char *CaseConvert(int character, enum CaseConversion conversion) {
+	CaseConverter *pCaseConv = ConverterForConversion(conversion);
+	if (!pCaseConv->Initialised())
+		SetupConversions(conversion);
+	return pCaseConv->Find(character);
+}
+
+// Convert a UTF-8 string with the converter for conversion, building its tables on first use.
+size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion) {
+	CaseConverter *pCaseConv = ConverterForConversion(conversion);
+	if (!pCaseConv->Initialised())
+		SetupConversions(conversion);
+	return pCaseConv->CaseConvertString(converted, sizeConverted, mixed, lenMixed);
+}
diff --git a/src/CaseConvert.h b/src/CaseConvert.h
new file mode 100644
index 000000000..60de22799
--- /dev/null
+++ b/src/CaseConvert.h
@@ -0,0 +1,47 @@
+// Scintilla source code edit control
+// Encoding: UTF-8
+/** @file CaseConvert.h
+ ** Performs Unicode case conversions.
+ ** Does not handle locale-sensitive case conversion.
+ **/
+// Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
+// The License.txt file describes the conditions under which this software may be distributed.
+
+#ifndef CASECONVERT_H
+#define CASECONVERT_H
+
+#ifdef SCI_NAMESPACE
+namespace Scintilla {
+#endif
+
+enum CaseConversion {
+	CaseConversionFold,
+	CaseConversionUpper,
+	CaseConversionLower
+};
+
+class ICaseConverter {
+public:
+	virtual size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) = 0;
+};
+
+ICaseConverter *ConverterFor(enum CaseConversion conversion);
+
+// Returns a UTF-8 string, or a null pointer (0) when the character has no conversion.
+const char *CaseConvert(int character, enum CaseConversion conversion);
+
+// When performing CaseConvertString, the converted value may be up to 3 times longer than the input.
+// Ligatures are often decomposed into multiple characters and long cases include:
+// ΐ "\xce\x90" folds to ΐ "\xce\xb9\xcc\x88\xcc\x81"
+const int maxExpansionCaseConversion=3;
+
+// Converts a mixed case string using a particular conversion.
+// Result may be a different length to input and the length is the return value.
+// If there is not enough space then 0 is returned.
+size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion);
+
+#ifdef SCI_NAMESPACE
+}
+#endif
+
+#endif
diff --git a/src/CaseFolder.cxx b/src/CaseFolder.cxx
new file mode 100644
index 000000000..44a94da6f
--- /dev/null
+++ b/src/CaseFolder.cxx
@@ -0,0 +1,68 @@
+// Scintilla source code edit control
+/** @file CaseFolder.cxx
+ ** Classes for case folding.
+ **/
+// Copyright 1998-2013 by Neil Hodgson <neilh@scintilla.org>
+// The License.txt file describes the conditions under which this software may be distributed.
+
+#include <vector>
+#include <algorithm>
+
+#include "CaseConvert.h"
+#include "UniConversion.h"
+#include "CaseFolder.h"
+
+#ifdef SCI_NAMESPACE
+using namespace Scintilla;
+#endif
+
+CaseFolder::~CaseFolder() {
+}
+
+// Start with the identity mapping: every byte value folds to itself.
+CaseFolderTable::CaseFolderTable() {
+	for (size_t iChar=0; iChar<sizeof(mapping); iChar++) {
+		mapping[iChar] = static_cast<char>(iChar);
+	}
+}
+
+CaseFolderTable::~CaseFolderTable() {
+}
+
+// Fold byte by byte through the mapping table.
+// Returns lenMixed, or 0 when folded (sizeFolded bytes) is too small to hold the result.
+size_t CaseFolderTable::Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) {
+	if (lenMixed > sizeFolded) {
+		return 0;
+	} else {
+		for (size_t i=0; i<lenMixed; i++) {
+			folded[i] = mapping[static_cast<unsigned char>(mixed[i])];
+		}
+		return lenMixed;
+	}
+}
+
+// Make byte ch fold to chTranslation.
+void CaseFolderTable::SetTranslation(char ch, char chTranslation) {
+	mapping[static_cast<unsigned char>(ch)] = chTranslation;
+}
+
+// Reset the whole table: 'A'..'Z' fold to 'a'..'z' and every other byte folds to itself.
+void CaseFolderTable::StandardASCII() {
+	for (size_t iChar=0; iChar<sizeof(mapping); iChar++) {
+		if (iChar >= 'A' && iChar <= 'Z') {
+			mapping[iChar] = static_cast<char>(iChar - 'A' + 'a');
+		} else {
+			mapping[iChar] = static_cast<char>(iChar);
+		}
+	}
+}
+
+// Set up the ASCII byte table and fetch the shared case folding converter.
+CaseFolderUnicode::CaseFolderUnicode() {
+	StandardASCII();
+	converter = ConverterFor(CaseConversionFold);
+}
+
+// A single byte is folded through the byte table; anything else is passed to the converter.
+size_t CaseFolderUnicode::Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) {
+	if ((lenMixed == 1) && (sizeFolded > 0)) {
+		folded[0] = mapping[static_cast<unsigned char>(mixed[0])];
+		return 1;
+	} else {
+		return converter->CaseConvertString(folded, sizeFolded, mixed, lenMixed);
+	}
+}
diff --git a/src/CaseFolder.h b/src/CaseFolder.h
new file mode 100644
index 000000000..2d754d4f3
--- /dev/null
+++ b/src/CaseFolder.h
@@ -0,0 +1,45 @@
+// Scintilla source code edit control
+/** @file CaseFolder.h
+ ** Classes for case folding.
+ **/
+// Copyright 1998-2013 by Neil Hodgson <neilh@scintilla.org>
+// The License.txt file describes the conditions under which this software may be distributed.
+
+#ifndef CASEFOLDER_H
+#define CASEFOLDER_H
+
+#ifdef SCI_NAMESPACE
+namespace Scintilla {
+#endif
+
+// Interface for folding lenMixed bytes of mixed into folded, which has room for sizeFolded bytes.
+class CaseFolder {
+public:
+	virtual ~CaseFolder();
+	virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) = 0;
+};
+
+// Folds each byte independently through a 256 entry translation table.
+class CaseFolderTable : public CaseFolder {
+protected:
+	char mapping[256];
+public:
+	CaseFolderTable();
+	virtual ~CaseFolderTable();
+	virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed);
+	void SetTranslation(char ch, char chTranslation);
+	void StandardASCII();
+};
+
+class ICaseConverter;
+
+// Uses the byte table for single byte input and an ICaseConverter for longer input.
+class CaseFolderUnicode : public CaseFolderTable {
+	ICaseConverter *converter;
+public:
+	CaseFolderUnicode();
+	virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed);
+};
+
+#ifdef SCI_NAMESPACE
+}
+#endif
+
+#endif
diff --git a/src/Document.cxx b/src/Document.cxx
index a00fc9fc2..0637c8d50 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -28,6 +28,7 @@
 #include "CharClassify.h"
 #include "CharacterSet.h"
 #include "Decoration.h"
+#include "CaseFolder.h"
 #include "Document.h"
 #include "RESearch.h"
 #include "UniConversion.h"
@@ -1496,47 +1497,6 @@ bool Document::IsWordAt(int start, int end) const {
 	return IsWordStartAt(start) && IsWordEndAt(end);
 }
 
-static inline char MakeLowerCase(char ch) {
-	if (ch < 'A' || ch > 'Z')
-		return ch;
-	else
-		return static_cast<char>(ch - 'A' + 'a');
-}
-
-CaseFolderTable::CaseFolderTable() {
-	for (size_t iChar=0; iChar<sizeof(mapping); iChar++) {
-		mapping[iChar] = static_cast<char>(iChar);
-	}
-}
-
-CaseFolderTable::~CaseFolderTable() {
-}
-
-size_t CaseFolderTable::Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) {
-	if (lenMixed > sizeFolded) {
-		return 0;
-	} else {
-		for (size_t i=0; i<lenMixed; i++) {
-			folded[i] = mapping[static_cast<unsigned char>(mixed[i])];
-		}
-		return lenMixed;
-	}
-}
-
-void CaseFolderTable::SetTranslation(char ch, char chTranslation) {
-	mapping[static_cast<unsigned char>(ch)] = chTranslation;
-}
-
-void CaseFolderTable::StandardASCII() {
-	for (size_t iChar=0; iChar<sizeof(mapping); iChar++) {
-		if (iChar >= 'A' && iChar <= 'Z') {
-			mapping[iChar] = static_cast<char>(iChar - 'A' + 'a');
-		} else {
-			mapping[iChar] = static_cast<char>(iChar);
-		}
-	}
-}
-
 bool Document::MatchesWordOptions(bool word, bool wordStart, int pos, int length) const {
 	return (!word && !wordStart) ||
 			(word && IsWordAt(pos, pos + length)) ||
diff --git a/src/Document.h b/src/Document.h
index 5c7e8f8a0..5147875b1 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -155,24 +155,6 @@ public:
 	bool isEnabled;
 };
 
-class CaseFolder {
-public:
-	virtual ~CaseFolder() {
-	}
-	virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) = 0;
-};
-
-class CaseFolderTable : public CaseFolder {
-protected:
-	char mapping[256];
-public:
-	CaseFolderTable();
-	virtual ~CaseFolderTable();
-	virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed);
-	void SetTranslation(char ch, char chTranslation);
-	void StandardASCII();
-};
-
 class Document;
 
 class LexInterface {
diff --git a/src/Editor.cxx b/src/Editor.cxx
index 16e3e8b56..acb840fdf 100644
--- a/src/Editor.cxx
+++ b/src/Editor.cxx
@@ -36,6 +36,7 @@
 #include "ViewStyle.h"
 #include "CharClassify.h"
 #include "Decoration.h"
+#include "CaseFolder.h"  
#include "Document.h"
 #include "UniConversion.h"
 #include "Selection.h"
diff --git a/src/PositionCache.cxx b/src/PositionCache.cxx
index 31eac8592..742a226b9 100644
--- a/src/PositionCache.cxx
+++ b/src/PositionCache.cxx
@@ -32,6 +32,7 @@
 #include "CharClassify.h"
 #include "Decoration.h"
 #include "ILexer.h"
+#include "CaseFolder.h"
 #include "Document.h"
 #include "Selection.h"
 #include "PositionCache.h"
diff --git a/src/ScintillaBase.cxx b/src/ScintillaBase.cxx
index 5d886f5a5..05768799d 100644
--- a/src/ScintillaBase.cxx
+++ b/src/ScintillaBase.cxx
@@ -42,6 +42,7 @@
 #include "AutoComplete.h"
 #include "CharClassify.h"
 #include "Decoration.h"
+#include "CaseFolder.h"
 #include "Document.h"
 #include "Selection.h"
 #include "PositionCache.h"
diff --git a/src/UnicodeFromUTF8.h b/src/UnicodeFromUTF8.h
new file mode 100644
index 000000000..24517e8a2
--- /dev/null
+++ b/src/UnicodeFromUTF8.h
@@ -0,0 +1,19 @@
+// Scintilla source code edit control
+/** @file UnicodeFromUTF8.h
+ ** Decode a single UTF-8 sequence into a Unicode code point.
+ **/
+// Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
+// This file is in the public domain.
+
+// Guard against multiple inclusion: the header defines a function body.
+#ifndef UNICODEFROMUTF8_H
+#define UNICODEFROMUTF8_H
+
+// Return the code point encoded by the UTF-8 sequence starting at us.
+// The sequence length is taken from the lead byte alone: 2 bytes for 0xC2..0xDF,
+// 3 bytes for 0xE0..0xEF and 4 bytes for 0xF0..0xF4. Trail bytes are read
+// without validation or bounds checks, so us must point to at least that many
+// readable bytes. Any other lead byte (ASCII, a stray trail byte, 0xC0, 0xC1,
+// or 0xF5 and above) is returned unchanged as its own value.
+inline int UnicodeFromUTF8(const unsigned char *us) {
+	if (us[0] < 0xC2) {
+		return us[0];
+	} else if (us[0] < 0xE0) {
+		return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F);
+	} else if (us[0] < 0xF0) {
+		return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F);
+	} else if (us[0] < 0xF5) {
+		return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F);
+	}
+	return us[0];
+}
+
+#endif
|
