Case insensitive search in DBCS and faster processing of DBCS.

DBCS case folder implemented on Windows.
author: nyamatongwe <unknown> 2010-08-05 13:47:25 +1000
committer: nyamatongwe <unknown> 2010-08-05 13:47:25 +1000
commit: 3a51b94f2cb34ea717cb7dc5f49d40eb21bf76bb (patch)
tree: 6d2d953c441f3777d8b3198865fc873d8284c617
parent: 63a2f56fa532bcbac8eb65119c3ebbb5c942328d (diff)
download: scintilla-mirror-3a51b94f2cb34ea717cb7dc5f49d40eb21bf76bb.tar.gz
3 files changed, 117 insertions, 21 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index a5907f97f..fa8ec0857 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -485,7 +485,16 @@ int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) {
 		} else {
 			// Anchor DBCS calculations at start of line because start of line can
 			// not be a DBCS trail byte.
-			int posCheck = LineStart(LineFromPosition(pos));
+			int posStartLine = LineStart(LineFromPosition(pos));
+			if (pos == posStartLine)
+				return pos;
+
+			// Step back until a non-lead-byte is found.
+			int posCheck = pos;
+			while ((posCheck > posStartLine) && IsDBCSLeadByte(cb.CharAt(posCheck-1)))
+				posCheck--;
+
+			// Check from known start of character.
 			while (posCheck < pos) {
 				int mbsize = IsDBCSLeadByte(cb.CharAt(posCheck)) ? 2 : 1;
 				if (posCheck + mbsize == pos) {
@@ -575,6 +584,17 @@ int Document::NextPosition(int pos, int moveDir) {
 	return pos;
 }
 
+// Move pos to the position NextPosition reports for direction moveDir.
+// pos is updated in place. Returns true if pos changed and false if
+// NextPosition handed back the same position, which the search loops
+// in FindText use as their signal to stop iterating.
+bool Document::NextCharacter(int &pos, int moveDir) {
+	const int posBefore = pos;
+	pos = NextPosition(posBefore, moveDir);
+	const bool moved = (pos != posBefore);
+	return moved;
+}
+
 int SCI_METHOD Document::CodePage() const {
 	return dbcsCodePage;
 }
@@ -598,7 +618,7 @@ bool SCI_METHOD Document::IsDBCSLeadByte(char ch) const {
 			return (uch >= 0x81) && (uch <= 0xFE);
 		case 1361:
 			// Korean Johab KS C-5601-1992
-			return 
+			return
 				((uch >= 0x84) && (uch <= 0xD3)) ||
 				((uch >= 0xD8) && (uch <= 0xDE)) ||
 				((uch >= 0xE0) && (uch <= 0xF9));
@@ -1316,13 +1336,8 @@ long Document::FindText(int minPos, int maxPos, const char *search,
 				if (found && MatchesWordOptions(word, wordStart, pos, lengthFind)) {
 					return pos;
 				}
-				pos += increment;
-				if (dbcsCodePage && (pos >= 0)) {
-					// Have to use >= 0 as otherwise next statement would change
-					// -1 to 0 and make loop infinite.
-					// Ensure trying to match from start of character
-					pos = MovePositionOutsideChar(pos, increment, false);
-				}
+				if (!NextCharacter(pos, increment))
+					break;
 			}
 		} else if (SC_CP_UTF8 == dbcsCodePage) {
 			const size_t maxBytesCharacter = 4;
@@ -1359,12 +1374,43 @@ long Document::FindText(int minPos, int maxPos, const char *search,
 				if (forward) {
 					pos += widthFirstCharacter;
 				} else {
-					pos--;
-					if (pos > 0) {
-						// Ensure trying to match from start of character
-						pos = MovePositionOutsideChar(pos, increment, false);
+					if (!NextCharacter(pos, increment))
+						break;
+				}
+			}
+		} else if (dbcsCodePage) {
+			const size_t maxBytesCharacter = 2;
+			const size_t maxFoldingExpansion = 4;
+			std::vector<char> searchThing(lengthFind * maxBytesCharacter * maxFoldingExpansion + 1);
+			const int lenSearch = pcf->Fold(&searchThing[0], searchThing.size(), search, lengthFind);
+			while (forward ? (pos < endSearch) : (pos >= endSearch)) {
+				int indexDocument = 0;
+				int indexSearch = 0;
+				bool characterMatches = true;
+				while (characterMatches &&
+					((pos + indexDocument) < limitPos) &&
+					(indexSearch < lenSearch)) {
+					char bytes[maxBytesCharacter + 1];
+					bytes[0] = cb.CharAt(pos + indexDocument);
+					const int widthChar = IsDBCSLeadByte(bytes[0]) ? 2 : 1;
+					if (widthChar == 2) 
+						bytes[1] = cb.CharAt(pos + indexDocument + 1);
+					char folded[maxBytesCharacter * maxFoldingExpansion + 1];
+					const int lenFlat = pcf->Fold(folded, sizeof(folded), bytes, widthChar);
+					folded[lenFlat] = 0;
+					// Does folded match the buffer
+					characterMatches = 0 == memcmp(folded, &searchThing[0] + indexSearch, lenFlat);
+					indexDocument += widthChar;
+					indexSearch += lenFlat;
+				}
+				if (characterMatches && (indexSearch == static_cast<int>(lenSearch))) {
+					if (MatchesWordOptions(word, wordStart, pos, indexDocument)) {
+						*length = indexDocument;
+						return pos;
 					}
 				}
+				if (!NextCharacter(pos, increment))
+					break;
 			}
 		} else {
 			CaseFolderTable caseFolder;
@@ -1381,11 +1427,8 @@ long Document::FindText(int minPos, int maxPos, const char *search,
 				if (found && MatchesWordOptions(word, wordStart, pos, lengthFind)) {
 					return pos;
 				}
-				pos += increment;
-				if (dbcsCodePage && (pos >= 0)) {
-					// Ensure trying to match from start of character
-					pos = MovePositionOutsideChar(pos, increment, false);
-				}
+				if (!NextCharacter(pos, increment))
+					break;
 			}
 		}
 	}
diff --git a/src/Document.h b/src/Document.h
index d87840872..6d2c2d0bb 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -231,6 +231,7 @@ public:
 	bool InGoodUTF8(int pos, int &start, int &end);
 	int MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd=true);
 	int NextPosition(int pos, int moveDir);
+	bool NextCharacter(int &pos, int moveDir);	// Returns true if pos changed
 	int SCI_METHOD CodePage() const;
 	bool SCI_METHOD IsDBCSLeadByte(char ch) const;
 
diff --git a/win32/ScintillaWin.cxx b/win32/ScintillaWin.cxx
index 570a2bc88..9873b82a4 100644
--- a/win32/ScintillaWin.cxx
+++ b/win32/ScintillaWin.cxx
@@ -1293,7 +1293,7 @@ void ScintillaWin::NotifyDoubleClick(Point pt, bool shift, bool ctrl, bool alt)
 			  MAKELPARAM(pt.x, pt.y));
 }
 
-class CaseFolderUTF8  : public CaseFolderTable {
+class CaseFolderUTF8 : public CaseFolderTable {
 	// Allocate the expandable storage here so that it does not need to be reallocated
 	// for each call to Fold.
 	std::vector<wchar_t> utf16Mixed;
@@ -1337,13 +1337,63 @@ public:
 	}
 };
 
+// Case folder for DBCS code pages: one-byte input goes through the mapping
+// table set up by StandardASCII(), longer input is folded via UTF-16.
+class CaseFolderDBCS : public CaseFolderTable {
+	// Allocate the expandable storage here so that it does not need to be reallocated
+	// for each call to Fold.
+	std::vector<wchar_t> utf16Mixed;
+	std::vector<wchar_t> utf16Folded;
+	UINT cp;
+public:
+	explicit CaseFolderDBCS(UINT cp_) : cp(cp_) {
+		StandardASCII();
+	}
+	// Folds lenMixed bytes of mixed into folded (capacity sizeFolded). Returns the bytes
+	// written: 0 if folding fails or does not fit, 1 (a NUL byte) if mixed is unconvertible.
+	virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) {
+		if (sizeFolded == 0)
+			return 0;	// No room for any output, not even the bad input marker
+		if (lenMixed == 1) {
+			folded[0] = mapping[static_cast<unsigned char>(mixed[0])];
+			return 1;
+		}
+		// Use >= so the buffer is never empty and &utf16Mixed[0] stays valid for lenMixed == 0.
+		if (lenMixed >= utf16Mixed.size())
+			utf16Mixed.resize(lenMixed + 8);
+		const int nUtf16Mixed = ::MultiByteToWideChar(cp, 0, mixed, static_cast<int>(lenMixed),
+			&utf16Mixed[0], static_cast<int>(utf16Mixed.size()));
+		if (nUtf16Mixed <= 0) {
+			// Failed to convert -> bad input
+			folded[0] = '\0';
+			return 1;
+		}
+
+		const size_t lenWide = static_cast<size_t>(nUtf16Mixed);
+		if (lenWide * 4 > utf16Folded.size())	// Maximum folding expansion factor of 4
+			utf16Folded.resize(lenWide * 4 + 8);
+		const int lenFlat = ::LCMapStringW(LOCALE_SYSTEM_DEFAULT,
+			LCMAP_LINGUISTIC_CASING | LCMAP_LOWERCASE,
+			&utf16Mixed[0], nUtf16Mixed, &utf16Folded[0], static_cast<int>(utf16Folded.size()));
+		if (lenFlat <= 0)
+			return 0;	// Folding failed; nothing has been written to folded
+
+		const int lenOut = ::WideCharToMultiByte(cp, 0,
+			&utf16Folded[0], lenFlat, NULL, 0, NULL, 0);
+		if ((lenOut <= 0) || (static_cast<size_t>(lenOut) >= sizeFolded))
+			return 0;	// Size query failed or the result would not fit in folded
+		::WideCharToMultiByte(cp, 0, &utf16Folded[0], lenFlat, folded, lenOut, NULL, 0);
+		return static_cast<size_t>(lenOut);
+	}
+};
+
 CaseFolder *ScintillaWin::CaseFolderForEncoding() {
 	UINT cpDest = CodePageOfDocument();
 	if (cpDest == SC_CP_UTF8) {
 		return new CaseFolderUTF8();
 	} else {
-		CaseFolderTable *pcf = new CaseFolderTable();
 		if (pdoc->dbcsCodePage == 0) {
+			CaseFolderTable *pcf = new CaseFolderTable();
 			pcf->StandardASCII();
 			// Only for single byte encodings
 			UINT cpDoc = CodePageOfDocument();
@@ -1367,8 +1417,10 @@ CaseFolder *ScintillaWin::CaseFolderForEncoding() {
 					}
 				}
 			}
+			return pcf;
+		} else {
+			return new CaseFolderDBCS(cpDest);
 		}
-		return pcf;
 	}
 }
author	nyamatongwe <unknown>	2010-08-05 13:47:25 +1000
committer	nyamatongwe <unknown>	2010-08-05 13:47:25 +1000
commit	3a51b94f2cb34ea717cb7dc5f49d40eb21bf76bb (patch)
tree	6d2d953c441f3777d8b3198865fc873d8284c617
parent	63a2f56fa532bcbac8eb65119c3ebbb5c942328d (diff)
download	scintilla-mirror-3a51b94f2cb34ea717cb7dc5f49d40eb21bf76bb.tar.gz