diff options
-rw-r--r-- | gtk/ScintillaGTK.cxx | 59 | ||||
-rw-r--r-- | src/Document.cxx | 154 | ||||
-rw-r--r-- | src/Document.h | 24 | ||||
-rw-r--r-- | src/Editor.cxx | 43 | ||||
-rw-r--r-- | src/Editor.h | 1 | ||||
-rw-r--r-- | test/simpleTests.py | 78 | ||||
-rw-r--r-- | win32/ScintillaWin.cxx | 74 |
7 files changed, 409 insertions, 24 deletions
diff --git a/gtk/ScintillaGTK.cxx b/gtk/ScintillaGTK.cxx index f4e762832..b53b46691 100644 --- a/gtk/ScintillaGTK.cxx +++ b/gtk/ScintillaGTK.cxx @@ -195,6 +195,7 @@ private: void NotifyKey(int key, int modifiers); void NotifyURIDropped(const char *list); const char *CharacterSetID() const; + virtual CaseFolder *CaseFolderForEncoding(); virtual std::string CaseMapString(const std::string &s, int caseMapping); virtual int KeyDefault(int key, int modifiers); virtual void CopyToClipboard(const SelectionText &selectedText); @@ -1338,6 +1339,64 @@ const char *ScintillaGTK::CharacterSetID() const { return ::CharacterSetID(vs.styles[STYLE_DEFAULT].characterSet); } +class CaseFolderUTF8 : public CaseFolderTable { +public: + CaseFolderUTF8() { + StandardASCII(); + } + virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) { + if ((lenMixed == 1) && (sizeFolded > 0)) { + folded[0] = mapping[static_cast<unsigned char>(mixed[0])]; + return 1; + } else { + gchar *mapped = g_utf8_casefold(mixed, lenMixed); + size_t lenMapped = strlen(mapped); + if (lenMapped < sizeFolded) { + memcpy(folded, mapped, lenMapped); + } else { + lenMapped = 0; + } + g_free(mapped); + return lenMapped; + } + } +}; + +CaseFolder *ScintillaGTK::CaseFolderForEncoding() { + if (pdoc->dbcsCodePage == SC_CP_UTF8) { + return new CaseFolderUTF8(); + } else { + CaseFolderTable *pcf = new CaseFolderTable(); + const char *charSetBuffer = CharacterSetID(); + if ((pdoc->dbcsCodePage == 0) && charSetBuffer) { + pcf->StandardASCII(); + // Only for single byte encodings + for (int i=0x80; i<0x100; i++) { + char sCharacter[2] = "A"; + sCharacter[0] = i; + int convertedLength = 1; + const char *sUTF8 = ConvertText(&convertedLength, sCharacter, 1, + "UTF-8", charSetBuffer, false); + if (sUTF8) { + gchar *mapped = g_utf8_casefold(sUTF8, strlen(sUTF8)); + if (mapped) { + int mappedLength = strlen(mapped); + const char *mappedBack = ConvertText(&mappedLength, mapped, + mappedLength, charSetBuffer, "UTF-8", false); + if (mappedBack && (strlen(mappedBack) == 1) && (mappedBack[0] != sCharacter[0])) { + pcf->SetTranslation(sCharacter[0], mappedBack[0]); + } + delete []mappedBack; + g_free(mapped); + } + } + delete []sUTF8; + } + } + return pcf; + } +} + std::string ScintillaGTK::CaseMapString(const std::string &s, int caseMapping) { #if GTK_MAJOR_VERSION < 2 return Editor::CaseMapString(s, caseMapping); diff --git a/src/Document.cxx b/src/Document.cxx index b1130bd09..fe8b43128 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -10,6 +10,17 @@ #include <stdio.h> #include <ctype.h> +#include <string> +#include <vector> + +// With Borland C++ 5.5, including <string> includes Windows.h leading to defining +// FindText to FindTextA which makes calls here to Document::FindText fail. +#ifdef __BORLANDC__ +#ifdef FindText +#undef FindText +#endif +#endif + #include "Platform.h" #include "Scintilla.h" @@ -22,6 +33,7 @@ #include "Decoration.h" #include "Document.h" #include "RESearch.h" +#include "UniConversion.h" #ifdef SCI_NAMESPACE using namespace Scintilla; @@ -1074,6 +1086,57 @@ static inline char MakeLowerCase(char ch) { return static_cast<char>(ch - 'A' + 'a'); } +static bool GoodTrailByte(int v) { + return (v >= 0x80) && (v < 0xc0); +} + +size_t Document::ExtractChar(int pos, char *bytes) { + unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos)); + size_t widthChar = UTF8CharLength(ch); + bytes[0] = ch; + for (size_t i=1; i<widthChar; i++) { + bytes[i] = cb.CharAt(pos+i); + if (!GoodTrailByte(static_cast<unsigned char>(bytes[i]))) { // Bad byte + widthChar = 1; + } + } + return widthChar; +} + +CaseFolderTable::CaseFolderTable() { + for (size_t iChar=0; iChar<sizeof(mapping); iChar++) { + mapping[iChar] = static_cast<char>(iChar); + } +} + +CaseFolderTable::~CaseFolderTable() { +} + +size_t CaseFolderTable::Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) { + if (lenMixed > sizeFolded) { + return 0; + } else { + for (size_t i=0; i<lenMixed; i++) { + folded[i] = mapping[static_cast<unsigned char>(mixed[i])]; + } + return lenMixed; + } +} + +void CaseFolderTable::SetTranslation(char ch, char chTranslation) { + mapping[static_cast<unsigned char>(ch)] = chTranslation; +} + +void CaseFolderTable::StandardASCII() { + for (size_t iChar=0; iChar<sizeof(mapping); iChar++) { + if (iChar >= 'A' && iChar <= 'Z') { + mapping[iChar] = static_cast<char>(iChar - 'A' + 'a'); + } else { + mapping[iChar] = static_cast<char>(iChar); + } + } +} + /** * Find text in document, supporting both forward and backward * searches (just pass minPos > maxPos to do a backward search) @@ -1081,7 +1144,7 @@ static inline char MakeLowerCase(char ch) { */ long Document::FindText(int minPos, int maxPos, const char *s, bool caseSensitive, bool word, bool wordStart, bool regExp, int flags, - int *length) { + int *length, CaseFolder *pcf) { if (regExp) { if (!regex) regex = CreateRegexSearch(&charClass); @@ -1104,13 +1167,11 @@ long Document::FindText(int minPos, int maxPos, const char *s, endSearch = endPos - lengthFind + 1; } //Platform::DebugPrintf("Find %d %d %s %d\n", startPos, endPos, ft->lpstrText, lengthFind); - char firstChar = s[0]; - if (!caseSensitive) - firstChar = static_cast<char>(MakeUpperCase(firstChar)); int pos = forward ? startPos : (startPos - 1); - while (forward ? (pos < endSearch) : (pos >= endSearch)) { - char ch = CharAt(pos); - if (caseSensitive) { + char firstChar = s[0]; + if (caseSensitive) { + while (forward ? (pos < endSearch) : (pos >= endSearch)) { + char ch = CharAt(pos); if (ch == firstChar) { bool found = true; if (pos + lengthFind > Platform::Maximum(startPos, endPos)) found = false; @@ -1126,27 +1187,88 @@ long Document::FindText(int minPos, int maxPos, const char *s, return pos; } } - } else { - if (MakeUpperCase(ch) == firstChar) { + pos += increment; + if (dbcsCodePage && (pos >= 0)) { + // Ensure trying to match from start of character + pos = MovePositionOutsideChar(pos, increment, false); + } + } + } else if (SC_CP_UTF8 == dbcsCodePage) { + const size_t maxBytesCharacter = 4; + const size_t maxFoldingExpansion = 4; + int endMatch = Platform::Maximum(startPos, endPos); + std::vector<char> searchThing(*length * maxBytesCharacter * maxFoldingExpansion + 1); + size_t lenSearch = pcf->Fold(&searchThing[0], searchThing.size(), s, *length); + while (forward ? (pos < endSearch) : (pos >= endSearch)) { + bool matchChar = true; + int matchOff = 0; + int searchOff = 0; + int widthFirst = 0; + while (matchChar && (pos + matchOff < endMatch)) { + int widthChar; + char bytes[maxBytesCharacter + 1]; + widthChar = ExtractChar(pos + matchOff, bytes); + bytes[maxBytesCharacter] = 0; + if (!widthFirst) + widthFirst = widthChar; + char folded[maxBytesCharacter * maxFoldingExpansion + 1]; + int lenFlat = pcf->Fold(folded, sizeof(folded), bytes, widthChar); + folded[lenFlat] = 0; + // Does folded match the buffer + matchChar = 0 == strncmp(folded, &searchThing[0] + searchOff, lenFlat); + matchOff += widthChar; + searchOff += lenFlat; + if (searchOff >= static_cast<int>(lenSearch)) + break; + } + if (matchChar && (searchOff == static_cast<int>(lenSearch))) { + if ((!word && !wordStart) || + (word && IsWordAt(pos, pos + lengthFind)) || + (wordStart && IsWordStartAt(pos))) { + *length = matchOff; + return pos; + } + } + if (forward) { + pos += widthFirst; + } else { + pos--; + if (pos > 0) { + // Ensure trying to match from start of character + pos = MovePositionOutsideChar(pos, increment, false); + } + } + } + } else { + CaseFolderTable caseFolder; + std::vector<char> searchThing(*length + 1); + pcf->Fold(&searchThing[0], searchThing.size(), s, *length); + while (forward ? (pos < endSearch) : (pos >= endSearch)) { + char ch = CharAt(pos); + char folded[2]; + pcf->Fold(folded, sizeof(folded), &ch, 1); + if (folded[0] == searchThing[0]) { bool found = true; if (pos + lengthFind > Platform::Maximum(startPos, endPos)) found = false; for (int posMatch = 1; posMatch < lengthFind && found; posMatch++) { ch = CharAt(pos + posMatch); - if (MakeUpperCase(ch) != MakeUpperCase(s[posMatch])) + pcf->Fold(folded, sizeof(folded), &ch, 1); + if (folded[0] != searchThing[posMatch]) found = false; } if (found) { if ((!word && !wordStart) || (word && IsWordAt(pos, pos + lengthFind)) || - (wordStart && IsWordStartAt(pos))) + (wordStart && IsWordStartAt(pos))) { return pos; + } } } - } - pos += increment; - if (dbcsCodePage && (pos >= 0)) { - // Ensure trying to match from start of character - pos = MovePositionOutsideChar(pos, increment, false); + pos += increment; + if (dbcsCodePage && (pos >= 0)) { + // Ensure trying to match from start of character + pos = MovePositionOutsideChar(pos, increment, false); + } } } } diff --git a/src/Document.h b/src/Document.h index c61c56892..73571cbdd 100644 --- a/src/Document.h +++ b/src/Document.h @@ -115,6 +115,24 @@ struct StyledText { } }; +class CaseFolder { +public: + virtual ~CaseFolder() { + }; + virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) = 0; +}; + +class CaseFolderTable : public CaseFolder { +protected: + char mapping[256]; +public: + CaseFolderTable(); + virtual ~CaseFolderTable(); + virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed); + void SetTranslation(char ch, char chTranslation); + void StandardASCII(); +}; + /** */ class Document : PerLine { @@ -254,9 +272,9 @@ public: int NextWordEnd(int pos, int delta); int Length() const { return cb.Length(); } void Allocate(int newSize) { cb.Allocate(newSize); } - long FindText(int minPos, int maxPos, const char *s, - bool caseSensitive, bool word, bool wordStart, bool regExp, int flags, int *length); - long FindText(int iMessage, unsigned long wParam, long lParam); + size_t ExtractChar(int pos, char *bytes); + long FindText(int minPos, int maxPos, const char *s, bool caseSensitive, bool word, + bool wordStart, bool regExp, int flags, int *length, CaseFolder *pcf); const char *SubstituteByPosition(const char *text, int *length); int LinesTotal() const; diff --git a/src/Editor.cxx b/src/Editor.cxx index 4bdbecda8..e5623b542 100644 --- a/src/Editor.cxx +++ b/src/Editor.cxx @@ -13,6 +13,7 @@ #include <string> #include <vector> #include <algorithm> +#include <memory> // With Borland C++ 5.5, including <string> includes Windows.h leading to defining // FindText to FindTextA which makes calls here to Document::FindText fail. @@ -5309,6 +5310,31 @@ void Editor::Indent(bool forwards) { } } +class CaseFolderASCII : public CaseFolderTable { +public: + CaseFolderASCII() { + StandardASCII(); + } + ~CaseFolderASCII() { + } + virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) { + if (lenMixed > sizeFolded) { + return 0; + } else { + for (size_t i=0; i<lenMixed; i++) { + folded[i] = mapping[static_cast<unsigned char>(mixed[i])]; + } + return lenMixed; + } + } +}; + + +CaseFolder *Editor::CaseFolderForEncoding() { + // Simple default that only maps ASCII upper case to lower case. + return new CaseFolderASCII(); +} + /** * Search of a text in the document, in the given range. * @return The position of the found text, -1 if not found. @@ -5320,13 +5346,15 @@ long Editor::FindText( Sci_TextToFind *ft = reinterpret_cast<Sci_TextToFind *>(lParam); int lengthFound = istrlen(ft->lpstrText); + std::auto_ptr<CaseFolder> pcf(CaseFolderForEncoding()); int pos = pdoc->FindText(ft->chrg.cpMin, ft->chrg.cpMax, ft->lpstrText, (wParam & SCFIND_MATCHCASE) != 0, (wParam & SCFIND_WHOLEWORD) != 0, (wParam & SCFIND_WORDSTART) != 0, (wParam & SCFIND_REGEXP) != 0, wParam, - &lengthFound); + &lengthFound, + pcf.get()); if (pos != -1) { ft->chrgText.cpMin = pos; ft->chrgText.cpMax = pos + lengthFound; @@ -5363,6 +5391,7 @@ long Editor::SearchText( const char *txt = reinterpret_cast<char *>(lParam); int pos; int lengthFound = istrlen(txt); + std::auto_ptr<CaseFolder> pcf(CaseFolderForEncoding()); if (iMessage == SCI_SEARCHNEXT) { pos = pdoc->FindText(searchAnchor, pdoc->Length(), txt, (wParam & SCFIND_MATCHCASE) != 0, @@ -5370,7 +5399,8 @@ long Editor::SearchText( (wParam & SCFIND_WORDSTART) != 0, (wParam & SCFIND_REGEXP) != 0, wParam, - &lengthFound); + &lengthFound, + pcf.get()); } else { pos = pdoc->FindText(searchAnchor, 0, txt, (wParam & SCFIND_MATCHCASE) != 0, @@ -5378,9 +5408,9 @@ long Editor::SearchText( (wParam & SCFIND_WORDSTART) != 0, (wParam & SCFIND_REGEXP) != 0, wParam, - &lengthFound); + &lengthFound, + pcf.get()); } - if (pos != -1) { SetSelection(pos, pos + lengthFound); } @@ -5411,13 +5441,16 @@ std::string Editor::CaseMapString(const std::string &s, int caseMapping) { */ long Editor::SearchInTarget(const char *text, int length) { int lengthFound = length; + + std::auto_ptr<CaseFolder> pcf(CaseFolderForEncoding()); int pos = pdoc->FindText(targetStart, targetEnd, text, (searchFlags & SCFIND_MATCHCASE) != 0, (searchFlags & SCFIND_WHOLEWORD) != 0, (searchFlags & SCFIND_WORDSTART) != 0, (searchFlags & SCFIND_REGEXP) != 0, searchFlags, - &lengthFound); + &lengthFound, + pcf.get()); if (pos != -1) { targetStart = pos; targetEnd = pos + lengthFound; diff --git a/src/Editor.h b/src/Editor.h index 053b10a9e..180db571a 100644 --- a/src/Editor.h +++ b/src/Editor.h @@ -424,6 +424,7 @@ protected: // ScintillaBase subclass needs access to much of Editor void Indent(bool forwards); + virtual CaseFolder *CaseFolderForEncoding(); long FindText(uptr_t wParam, sptr_t lParam); void SearchAnchor(); long SearchText(unsigned int iMessage, uptr_t wParam, sptr_t lParam); diff --git a/test/simpleTests.py b/test/simpleTests.py index f9a67f59b..8fac00cd4 100644 --- a/test/simpleTests.py +++ b/test/simpleTests.py @@ -1109,6 +1109,84 @@ class TestCaseMapping(unittest.TestCase): self.assertEquals(self.ed.Length, 1) self.assertEquals(self.ed.Contents(), r) +class TestCaseInsensitiveSearch(unittest.TestCase): + def setUp(self): + self.xite = XiteWin.xiteFrame + self.ed = self.xite.ed + self.ed.ClearAll() + self.ed.EmptyUndoBuffer() + + def tearDown(self): + self.ed.SetCodePage(0) + self.ed.StyleSetCharacterSet(self.ed.STYLE_DEFAULT, self.ed.SC_CHARSET_DEFAULT) + + def testEmpty(self): + text = b" x X" + searchString = b"" + self.ed.SetText(len(text), text) + self.ed.TargetStart = 0 + self.ed.TargetEnd = self.ed.Length-1 + self.ed.SearchFlags = 0 + pos = self.ed.SearchInTarget(len(searchString), searchString) + self.assertEquals(-1, pos) + + def testASCII(self): + text = b" x X" + searchString = b"X" + self.ed.SetText(len(text), text) + self.ed.TargetStart = 0 + self.ed.TargetEnd = self.ed.Length-1 + self.ed.SearchFlags = 0 + pos = self.ed.SearchInTarget(len(searchString), searchString) + self.assertEquals(1, pos) + + def testLatin1(self): + text = "Frånd Åå".encode("Latin-1") + searchString = "Å".encode("Latin-1") + self.ed.SetText(len(text), text) + self.ed.TargetStart = 0 + self.ed.TargetEnd = self.ed.Length-1 + self.ed.SearchFlags = 0 + pos = self.ed.SearchInTarget(len(searchString), searchString) + self.assertEquals(2, pos) + + def testRussian(self): + self.ed.StyleSetCharacterSet(self.ed.STYLE_DEFAULT, self.ed.SC_CHARSET_RUSSIAN) + text = "=(Б tex б)".encode("Windows-1251") + searchString = "б".encode("Windows-1251") + self.ed.SetText(len(text), text) + self.ed.TargetStart = 0 + self.ed.TargetEnd = self.ed.Length-1 + self.ed.SearchFlags = 0 + pos = self.ed.SearchInTarget(len(searchString), searchString) + self.assertEquals(2, pos) + + def testUTF(self): + self.ed.SetCodePage(65001) + text = "Frånd Åå".encode("UTF-8") + searchString = "Å".encode("UTF-8") + self.ed.SetText(len(text), text) + self.ed.TargetStart = 0 + self.ed.TargetEnd = self.ed.Length-1 + self.ed.SearchFlags = 0 + pos = self.ed.SearchInTarget(len(searchString), searchString) + self.assertEquals(2, pos) + + def testUTFDifferentLength(self): + # Searching for a two byte string "ı" finds a single byte "I" + self.ed.SetCodePage(65001) + text = "Fråndi Ååİ $".encode("UTF-8") + firstPosition = len("Frånd".encode("UTF-8")) + searchString = "İ".encode("UTF-8") + self.assertEquals(len(searchString), 2) + self.ed.SetText(len(text), text) + self.ed.TargetStart = 0 + self.ed.TargetEnd = self.ed.Length-1 + self.ed.SearchFlags = 0 + pos = self.ed.SearchInTarget(len(searchString), searchString) + self.assertEquals(firstPosition, pos) + self.assertEquals(firstPosition+1, self.ed.TargetEnd) + class TestLexer(unittest.TestCase): def setUp(self): self.xite = XiteWin.xiteFrame diff --git a/win32/ScintillaWin.cxx b/win32/ScintillaWin.cxx index a509ef0ec..a14b8d01b 100644 --- a/win32/ScintillaWin.cxx +++ b/win32/ScintillaWin.cxx @@ -216,6 +216,7 @@ class ScintillaWin : virtual int GetCtrlID(); virtual void NotifyParent(SCNotification scn); virtual void NotifyDoubleClick(Point pt, bool shift, bool ctrl, bool alt); + virtual CaseFolder *CaseFolderForEncoding(); virtual std::string CaseMapString(const std::string &s, int caseMapping); virtual void Copy(); virtual void CopyAllowLine(); @@ -1298,6 +1299,79 @@ void ScintillaWin::NotifyDoubleClick(Point pt, bool shift, bool ctrl, bool alt) MAKELPARAM(pt.x, pt.y)); } +class CaseFolderUTF8 : public CaseFolderTable { + // Allocate the expandable storage here so that it does not need to be reallocated + // for each call to Fold. + std::vector<wchar_t> utf16Mixed; + std::vector<wchar_t> utf16Folded; +public: + CaseFolderUTF8() { + StandardASCII(); + } + virtual size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) { + if ((lenMixed == 1) && (sizeFolded > 0)) { + folded[0] = mapping[static_cast<unsigned char>(mixed[0])]; + return 1; + } else { + if (lenMixed > utf16Mixed.size()) { + utf16Mixed.resize(lenMixed + 8); + } + size_t nUtf16Mixed = ::MultiByteToWideChar(65001, 0, mixed, lenMixed, + &utf16Mixed[0], utf16Mixed.size()); + + if (nUtf16Mixed * 4 > utf16Folded.size()) { // Maximum folding expansion factor of 4 + utf16Folded.resize(nUtf16Mixed * 4 + 8); + } + int lenFlat = ::LCMapStringW(LOCALE_SYSTEM_DEFAULT, + LCMAP_LINGUISTIC_CASING | LCMAP_LOWERCASE, + &utf16Mixed[0], nUtf16Mixed, &utf16Folded[0], utf16Folded.size()); + + size_t lenOut = UTF8Length(&utf16Folded[0], lenFlat); + if (lenOut < sizeFolded) { + UTF8FromUTF16(&utf16Folded[0], lenFlat, folded, lenOut); + return lenOut; + } else { + return 0; + } + } + } +}; + +CaseFolder *ScintillaWin::CaseFolderForEncoding() { + UINT cpDest = CodePageOfDocument(); + if (cpDest == SC_CP_UTF8) { + return new CaseFolderUTF8(); + } else { + CaseFolderTable *pcf = new CaseFolderTable(); + if (pdoc->dbcsCodePage == 0) { + pcf->StandardASCII(); + // Only for single byte encodings + UINT cpDoc = CodePageOfDocument(); + for (int i=0x80; i<0x100; i++) { + char sCharacter[2] = "A"; + sCharacter[0] = static_cast<char>(i); + wchar_t wCharacter[20]; + unsigned int lengthUTF16 = ::MultiByteToWideChar(cpDoc, 0, sCharacter, 1, + wCharacter, sizeof(wCharacter)/sizeof(wCharacter[0])); + if (lengthUTF16 == 1) { + wchar_t wLower[20]; + int charsConverted = ::LCMapStringW(LOCALE_SYSTEM_DEFAULT, + LCMAP_LINGUISTIC_CASING | LCMAP_LOWERCASE, + wCharacter, lengthUTF16, wLower, sizeof(wLower)/sizeof(wLower[0])); + char sCharacterLowered[20]; + unsigned int lengthConverted = ::WideCharToMultiByte(cpDoc, 0, + wLower, charsConverted, + sCharacterLowered, sizeof(sCharacterLowered), NULL, 0); + if ((lengthConverted == 1) && (sCharacter[0] != sCharacterLowered[0])) { + pcf->SetTranslation(sCharacter[0], sCharacterLowered[0]); + } + } + } + } + return pcf; + } +} + std::string ScintillaWin::CaseMapString(const std::string &s, int caseMapping) { if (s.size() == 0) return std::string(); |