For case-insensitive UTF-8 searching, use UTF8Classify to find the valid character width so that the search is compatible with other similar code. Optimize the treatment of single-byte ASCII characters and also optimize the loop conditions. These optimizations mostly make up for the performance decrease from calling UTF8Classify. Add the support definitions UTF8MaxBytes and UTF8IsAscii in UniConversion. Remove ExtractChar as it is no longer needed.
author: nyamatongwe <devnull@localhost> 2012-05-26 13:26:11 +1000
committer: nyamatongwe <devnull@localhost> 2012-05-26 13:26:11 +1000
commit: e1370f834348a12fea75ad883a0d801dfd1b9d8d (patch)
tree: d5bbf357b84a4794326f3cf3572b2b9d39f96f7e
parent: 44241ccc28b561efcdbda77350bb5435b11b3d47 (diff)
download: scintilla-mirror-e1370f834348a12fea75ad883a0d801dfd1b9d8d.tar.gz
3 files changed, 29 insertions, 28 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index dca0ccc51..4e9366064 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1373,19 +1373,6 @@ static inline char MakeLowerCase(char ch) {
 		return static_cast<char>(ch - 'A' + 'a');
 }
 
-size_t Document::ExtractChar(int pos, char *bytes) {
-	unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos));
-	size_t widthChar = UTF8CharLength(ch);
-	bytes[0] = ch;
-	for (size_t i=1; i<widthChar; i++) {
-		bytes[i] = cb.CharAt(static_cast<int>(pos+i));
-		if (!IsTrailByte(static_cast<unsigned char>(bytes[i]))) { // Bad byte
-			widthChar = 1;
-		}
-	}
-	return widthChar;
-}
-
 CaseFolderTable::CaseFolderTable() {
 	for (size_t iChar=0; iChar<sizeof(mapping); iChar++) {
 		mapping[iChar] = static_cast<char>(iChar);
@@ -1476,37 +1463,46 @@ long Document::FindText(int minPos, int maxPos, const char *search,
 					break;
 			}
 		} else if (SC_CP_UTF8 == dbcsCodePage) {
-			const size_t maxBytesCharacter = 4;
 			const size_t maxFoldingExpansion = 4;
-			std::vector<char> searchThing(lengthFind * maxBytesCharacter * maxFoldingExpansion + 1);
+			std::vector<char> searchThing(lengthFind * UTF8MaxBytes * maxFoldingExpansion + 1);
 			const int lenSearch = static_cast<int>(
 				pcf->Fold(&searchThing[0], searchThing.size(), search, lengthFind));
+			char bytes[UTF8MaxBytes + 1];
+			char folded[UTF8MaxBytes * maxFoldingExpansion + 1];
 			while (forward ? (pos < endPos) : (pos >= endPos)) {
 				int widthFirstCharacter = 0;
-				int indexDocument = 0;
+				int posIndexDocument = pos;
 				int indexSearch = 0;
 				bool characterMatches = true;
-				while (characterMatches &&
-					((pos + indexDocument) < limitPos) &&
-					(indexSearch < lenSearch)) {
-					char bytes[maxBytesCharacter + 1];
-					bytes[maxBytesCharacter] = 0;
-					const int widthChar = static_cast<int>(ExtractChar(pos + indexDocument, bytes));
+				for (;;) {
+					const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(posIndexDocument));
+					bytes[0] = leadByte;
+					int widthChar = 1;
+					if (!UTF8IsAscii(leadByte)) {
+						const int widthCharBytes = UTF8BytesOfLead[leadByte];
+						for (int b=1; b<widthCharBytes; b++) {
+							bytes[b] = cb.CharAt(posIndexDocument+b);
+						}
+						widthChar = UTF8Classify(reinterpret_cast<const unsigned char *>(bytes), widthCharBytes) & UTF8MaskWidth;
+					}
 					if (!widthFirstCharacter)
 						widthFirstCharacter = widthChar;
-					if ((pos + indexDocument + widthChar) > limitPos)
+					if ((posIndexDocument + widthChar) > limitPos)
 						break;
-					char folded[maxBytesCharacter * maxFoldingExpansion + 1];
 					const int lenFlat = static_cast<int>(pcf->Fold(folded, sizeof(folded), bytes, widthChar));
 					folded[lenFlat] = 0;
 					// Does folded match the buffer
 					characterMatches = 0 == memcmp(folded, &searchThing[0] + indexSearch, lenFlat);
-					indexDocument += widthChar;
+					if (!characterMatches)
+						break;
+					posIndexDocument += widthChar;
 					indexSearch += lenFlat;
+					if (indexSearch >= lenSearch)
+						break;
 				}
 				if (characterMatches && (indexSearch == static_cast<int>(lenSearch))) {
-					if (MatchesWordOptions(word, wordStart, pos, indexDocument)) {
-						*length = indexDocument;
+					if (MatchesWordOptions(word, wordStart, pos, posIndexDocument - pos)) {
+						*length = posIndexDocument - pos;
 						return pos;
 					}
 				}
diff --git a/src/Document.h b/src/Document.h
index ec41603eb..18bf00a3d 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -352,7 +352,6 @@ public:
 	int NextWordEnd(int pos, int delta);
 	int SCI_METHOD Length() const { return cb.Length(); }
 	void Allocate(int newSize) { cb.Allocate(newSize); }
-	size_t ExtractChar(int pos, char *bytes);
 	bool MatchesWordOptions(bool word, bool wordStart, int pos, int length);
 	long FindText(int minPos, int maxPos, const char *search, bool caseSensitive, bool word,
 		bool wordStart, bool regExp, int flags, int *length, CaseFolder *pcf);
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 87cc43f77..704f16239 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -5,6 +5,8 @@
 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
 // The License.txt file describes the conditions under which this software may be distributed.
 
+const int UTF8MaxBytes = 4;
+
 unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen);
 void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len);
 unsigned int UTF8CharLength(unsigned char ch);
@@ -18,5 +20,9 @@ inline bool UTF8IsTrailByte(int ch) {
 	return (ch >= 0x80) && (ch < 0xc0);
 }
 
+inline bool UTF8IsAscii(int ch) {
+	return ch < 0x80;
+}
+
 enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 };
 int UTF8Classify(const unsigned char *us, int len);
author	nyamatongwe <devnull@localhost>	2012-05-26 13:26:11 +1000
committer	nyamatongwe <devnull@localhost>	2012-05-26 13:26:11 +1000
commit	e1370f834348a12fea75ad883a0d801dfd1b9d8d (patch)
tree	d5bbf357b84a4794326f3cf3572b2b9d39f96f7e
parent	44241ccc28b561efcdbda77350bb5435b11b3d47 (diff)
download	scintilla-mirror-e1370f834348a12fea75ad883a0d801dfd1b9d8d.tar.gz