diff options
author | Neil <nyamatongwe@gmail.com> | 2014-10-02 18:17:13 +1000 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2014-10-02 18:17:13 +1000 |
commit | 2603f1e2074b0f880886b533ffc47ecef4fd33f7 (patch) | |
tree | 53bfaff1eca31d0768f43d0c496c81b551230a2d /src | |
parent | 8c42cc95ae3fbfdb8d4b7e8893c2fee283efe1a6 (diff) | |
download | scintilla-mirror-2603f1e2074b0f880886b533ffc47ecef4fd33f7.tar.gz |
Allow using C++11 <regex> for searches as a provisional feature.
Diffstat (limited to 'src')
-rw-r--r-- | src/Document.cxx | 501 | ||||
-rw-r--r-- | src/Document.h | 17 | ||||
-rw-r--r-- | src/Editor.cxx | 91 | ||||
-rw-r--r-- | src/RESearch.h | 2 | ||||
-rw-r--r-- | src/UniConversion.cxx | 46 | ||||
-rw-r--r-- | src/UniConversion.h | 4 |
6 files changed, 582 insertions, 79 deletions
diff --git a/src/Document.cxx b/src/Document.cxx index c4faee603..4d2c48f1f 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -16,6 +16,10 @@ #include <vector> #include <algorithm> +#ifdef CXX11_REGEX +#include <regex> +#endif + #include "Platform.h" #include "ILexer.h" @@ -336,6 +340,10 @@ int SCI_METHOD Document::LineStart(int line) const { return cb.LineStart(line); } +bool Document::IsLineStartPosition(int position) const { + return LineStart(LineFromPosition(position)) == position; +} + int SCI_METHOD Document::LineEnd(int line) const { if (line >= LinesTotal() - 1) { return LineStart(line + 1); @@ -602,7 +610,7 @@ bool Document::InGoodUTF8(int pos, int &start, int &end) const { // When lines are terminated with \r\n pairs which should be treated as one character. // When displaying DBCS text such as Japanese. // If moving, move the position in the indicated direction. -int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) { +int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) const { //Platform::DebugPrintf("NoCRLF %d %d\n", pos, moveDir); // If out of range, just return minimum/maximum value. if (pos <= 0) @@ -1587,6 +1595,25 @@ void Document::SetCaseFolder(CaseFolder *pcf_) { pcf = pcf_; } +Document::CharacterExtracted Document::ExtractCharacter(int position) const { + const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(position)); + if (UTF8IsAscii(leadByte)) { + // Common case: ASCII character + return CharacterExtracted(leadByte, 1); + } + const int widthCharBytes = UTF8BytesOfLead[leadByte]; + unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 }; + for (int b=1; b<widthCharBytes; b++) + charBytes[b] = static_cast<unsigned char>(cb.CharAt(position + b)); + int utf8status = UTF8Classify(charBytes, widthCharBytes); + if (utf8status & UTF8MaskInvalid) { + // Treat as invalid and use up just one byte + return CharacterExtracted(unicodeReplacementChar, 1); + } else { + return CharacterExtracted(UnicodeFromBytes(charBytes), utf8status & UTF8MaskWidth); + } +} + /** * Find text in document, supporting both forward and backward * searches (just pass minPos > maxPos to do a backward search) @@ -2178,6 +2205,61 @@ private: std::string substituted; }; +namespace { + +/** +* RESearchRange keeps track of search range. +*/ +class RESearchRange { +public: + const Document *doc; + int increment; + int startPos; + int endPos; + int lineRangeStart; + int lineRangeEnd; + int lineRangeBreak; + RESearchRange(const Document *doc_, int minPos, int maxPos) : doc(doc_) { + increment = (minPos <= maxPos) ? 1 : -1; + + // Range endpoints should not be inside DBCS characters, but just in case, move them. + startPos = doc->MovePositionOutsideChar(minPos, 1, false); + endPos = doc->MovePositionOutsideChar(maxPos, 1, false); + + lineRangeStart = doc->LineFromPosition(startPos); + lineRangeEnd = doc->LineFromPosition(endPos); + if ((increment == 1) && + (startPos >= doc->LineEnd(lineRangeStart)) && + (lineRangeStart < lineRangeEnd)) { + // the start position is at end of line or between line end characters. + lineRangeStart++; + startPos = doc->LineStart(lineRangeStart); + } else if ((increment == -1) && + (startPos <= doc->LineStart(lineRangeStart)) && + (lineRangeStart > lineRangeEnd)) { + // the start position is at beginning of line. + lineRangeStart--; + startPos = doc->LineEnd(lineRangeStart); + } + lineRangeBreak = lineRangeEnd + increment; + } + Range LineRange(int line) const { + Range range(doc->LineStart(line), doc->LineEnd(line)); + if (increment == 1) { + if (line == lineRangeStart) + range.start = startPos; + if (line == lineRangeEnd) + range.end = endPos; + } else { + if (line == lineRangeEnd) + range.start = endPos; + if (line == lineRangeStart) + range.end = startPos; + } + return range; + } +}; + // Define a way for the Regular Expression code to access the document class DocumentIndexer : public CharacterIndexer { Document *pdoc; @@ -2198,18 +2280,375 @@ public: } }; +#ifdef CXX11_REGEX + +class ByteIterator : public std::iterator<std::bidirectional_iterator_tag, char> { +public: + const Document *doc; + Position position; + ByteIterator(const Document *doc_ = 0, Position position_ = 0) : doc(doc_), position(position_) { + } + ByteIterator(const ByteIterator &other) { + doc = other.doc; + position = other.position; + } + ByteIterator &operator=(const ByteIterator &other) { + if (this != &other) { + doc = other.doc; + position = other.position; + } + return *this; + } + char operator*() const { + return doc->CharAt(position); + } + ByteIterator &operator++() { + position++; + return *this; + } + ByteIterator operator++(int) { + ByteIterator retVal(*this); + position++; + return retVal; + } + ByteIterator &operator--() { + position--; + return *this; + } + bool operator==(const ByteIterator &other) const { + return doc == other.doc && position == other.position; + } + bool operator!=(const ByteIterator &other) const { + return doc != other.doc || position != other.position; + } + int Pos() const { + return position; + } + int PosRoundUp() const { + return position; + } +}; + +// On Windows, wchar_t is 16 bits wide and on Unix it is 32 bits wide. +// Would be better to use sizeof(wchar_t) or similar to differentiate +// but easier for now to hard-code platforms. +// C++11 has char16_t and char32_t but neither Clang nor Visual C++ +// appear to allow specializing basic_regex over these. + +#ifdef _WIN32 +#define WCHAR_T_IS_16 1 +#else +#define WCHAR_T_IS_16 0 +#endif + +#if WCHAR_T_IS_16 + +// On Windows, report non-BMP characters as 2 separate surrogates as that +// matches wregex since it is based on wchar_t. +class UTF8Iterator : public std::iterator<std::bidirectional_iterator_tag, wchar_t> { + // These 3 fields determine the iterator position and are used for comparisons + const Document *doc; + Position position; + size_t characterIndex; + // Remaining fields are derived from the determining fields so are excluded in comparisons + unsigned int lenBytes; + size_t lenCharacters; + wchar_t buffered[2]; +public: + UTF8Iterator(const Document *doc_ = 0, Position position_ = 0) : + doc(doc_), position(position_), characterIndex(0), lenBytes(0), lenCharacters(0) { + buffered[0] = 0; + buffered[1] = 0; + } + UTF8Iterator(const UTF8Iterator &other) { + doc = other.doc; + position = other.position; + characterIndex = other.characterIndex; + lenBytes = other.lenBytes; + lenCharacters = other.lenCharacters; + buffered[0] = other.buffered[0]; + buffered[1] = other.buffered[1]; + } + UTF8Iterator &operator=(const UTF8Iterator &other) { + if (this != &other) { + doc = other.doc; + position = other.position; + characterIndex = other.characterIndex; + lenBytes = other.lenBytes; + lenCharacters = other.lenCharacters; + buffered[0] = other.buffered[0]; + buffered[1] = other.buffered[1]; + } + return *this; + } + wchar_t operator*() { + if (lenCharacters == 0) { + ReadCharacter(); + } + return buffered[characterIndex]; + } + UTF8Iterator &operator++() { + if ((characterIndex + 1) < (lenCharacters)) { + characterIndex++; + } else { + position += lenBytes; + ReadCharacter(); + characterIndex = 0; + } + return *this; + } + UTF8Iterator operator++(int) { + UTF8Iterator retVal(*this); + if ((characterIndex + 1) < (lenCharacters)) { + characterIndex++; + } else { + position += lenBytes; + ReadCharacter(); + characterIndex = 0; + } + return retVal; + } + UTF8Iterator &operator--() { + if (characterIndex) { + characterIndex--; + } else { + position = doc->NextPosition(position, -1); + ReadCharacter(); + characterIndex = lenCharacters - 1; + } + return *this; + } + bool operator==(const UTF8Iterator &other) const { + // Only test the determining fields, not the character widths and values derived from this + return doc == other.doc && + position == other.position && + characterIndex == other.characterIndex; + } + bool operator!=(const UTF8Iterator &other) const { + // Only test the determining fields, not the character widths and values derived from this + return doc != other.doc || + position != other.position || + characterIndex != other.characterIndex; + } + int Pos() const { + return position; + } + int PosRoundUp() const { + if (characterIndex) + return position + lenBytes; // Force to end of character + else + return position; + } +private: + void ReadCharacter() { + Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position); + lenBytes = charExtracted.widthBytes; + if (charExtracted.character == unicodeReplacementChar) { + lenCharacters = 1; + buffered[0] = static_cast<wchar_t>(charExtracted.character); + } else { + lenCharacters = UTF16FromUTF32Character(charExtracted.character, buffered); + } + } +}; + +#else + +// On Unix, report non-BMP characters as single characters + +class UTF8Iterator : public std::iterator<std::bidirectional_iterator_tag, wchar_t> { + const Document *doc; + Position position; +public: + UTF8Iterator(const Document *doc_=0, Position position_=0) : doc(doc_), position(position_) { + } + UTF8Iterator(const UTF8Iterator &other) { + doc = other.doc; + position = other.position; + } + UTF8Iterator &operator=(const UTF8Iterator &other) { + if (this != &other) { + doc = other.doc; + position = other.position; + } + return *this; + } + wchar_t operator*() const { + Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position); + return charExtracted.character; + } + UTF8Iterator &operator++() { + position = doc->NextPosition(position, 1); + return *this; + } + UTF8Iterator operator++(int) { + UTF8Iterator retVal(*this); + position = doc->NextPosition(position, 1); + return retVal; + } + UTF8Iterator &operator--() { + position = doc->NextPosition(position, -1); + return *this; + } + bool operator==(const UTF8Iterator &other) const { + return doc == other.doc && position == other.position; + } + bool operator!=(const UTF8Iterator &other) const { + return doc != other.doc || position != other.position; + } + int Pos() const { + return position; + } + int PosRoundUp() const { + return position; + } +}; + +#endif + +std::regex_constants::match_flag_type MatchFlags(const Document *doc, int startPos, int endPos) { + std::regex_constants::match_flag_type flagsMatch = std::regex_constants::match_default; + if (!doc->IsLineStartPosition(startPos)) + flagsMatch |= std::regex_constants::match_not_bol; + if (!doc->IsLineEndPosition(endPos)) + flagsMatch |= std::regex_constants::match_not_eol; + return flagsMatch; +} + +template<typename Iterator, typename Regex> +bool MatchOnLines(const Document *doc, const Regex ®exp, const RESearchRange &resr, RESearch &search) { + bool matched = false; + std::match_results<Iterator> match; + + // MSVC and libc++ have problems with ^ and $ matching line ends inside a range + // If they didn't then the line by line iteration could be removed for the forwards + // case and replaced with the following 4 lines: + // Iterator uiStart(doc, startPos); + // Iterator uiEnd(doc, endPos); + // flagsMatch = MatchFlags(doc, startPos, endPos); + // matched = std::regex_search(uiStart, uiEnd, match, regexp, flagsMatch); + + // Line by line. + for (int line = resr.lineRangeStart; line != resr.lineRangeBreak; line += resr.increment) { + const Range lineRange = resr.LineRange(line); + Iterator itStart(doc, lineRange.start); + Iterator itEnd(doc, lineRange.end); + std::regex_constants::match_flag_type flagsMatch = MatchFlags(doc, lineRange.start, lineRange.end); + matched = std::regex_search(itStart, itEnd, match, regexp, flagsMatch); + // Check for the last match on this line. + if (matched) { + if (resr.increment == -1) { + while (matched) { + Iterator itNext(doc, match[0].second.PosRoundUp()); + flagsMatch = MatchFlags(doc, itNext.Pos(), lineRange.end); + std::match_results<Iterator> matchNext; + matched = std::regex_search(itNext, itEnd, matchNext, regexp, flagsMatch); + if (matched) { + if (match[0].first == match[0].second) { + // Empty match means failure so exit + return false; + } + match = matchNext; + } + } + matched = true; + } + break; + } + } + if (matched) { + for (size_t co = 0; co < match.size(); co++) { + search.bopat[co] = match[co].first.Pos(); + search.eopat[co] = match[co].second.PosRoundUp(); + size_t lenMatch = search.eopat[co] - search.bopat[co]; + search.pat[co].resize(lenMatch); + for (size_t iPos = 0; iPos < lenMatch; iPos++) { + search.pat[co][iPos] = doc->CharAt(iPos + search.bopat[co]); + } + } + } + return matched; +} + +long Cxx11RegexFindText(Document *doc, int minPos, int maxPos, const char *s, + bool caseSensitive, int *length, RESearch &search) { + const RESearchRange resr(doc, minPos, maxPos); + try { + //ElapsedTime et; + std::regex::flag_type flagsRe = std::regex::ECMAScript; + // Flags that apper to have no effect: + // | std::regex::collate | std::regex::extended; + if (!caseSensitive) + flagsRe = flagsRe | std::regex::icase; + + // Clear the RESearch so can fill in matches + search.Clear(); + + bool matched = false; + if (SC_CP_UTF8 == doc->dbcsCodePage) { + unsigned int lenS = static_cast<unsigned int>(strlen(s)); + std::vector<wchar_t> ws(lenS + 1); +#if WCHAR_T_IS_16 + size_t outLen = UTF16FromUTF8(s, lenS, &ws[0], lenS); +#else + size_t outLen = UTF32FromUTF8(s, lenS, reinterpret_cast<unsigned int *>(&ws[0]), lenS); +#endif + ws[outLen] = 0; + std::wregex regexp; +#if defined(__APPLE__) + // Using a UTF-8 locale doesn't change to Unicode over a byte buffer so '.' + // is one byte not one character. + // However, on OS X this makes wregex act as Unicode + std::locale localeU("en_US.UTF-8"); + regexp.imbue(localeU); +#endif + regexp.assign(&ws[0], flagsRe); + matched = MatchOnLines<UTF8Iterator>(doc, regexp, resr, search); + + } else { + std::regex regexp; + regexp.assign(s, flagsRe); + matched = MatchOnLines<ByteIterator>(doc, regexp, resr, search); + } + + int posMatch = -1; + if (matched) { + posMatch = search.bopat[0]; + *length = search.eopat[0] - search.bopat[0]; + } + // Example - search in doc/ScintillaHistory.html for + // [[:upper:]]eta[[:space:]] + // On MacBook, normally around 1 second but with locale imbued -> 14 seconds. + //double durSearch = et.Duration(true); + //Platform::DebugPrintf("Search:%9.6g \n", durSearch); + return posMatch; + } catch (std::regex_error &) { + // Failed to create regular expression + throw RegexError(); + } catch (...) { + // Failed in some other way + return -1; + } +} + +#endif + +} + long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s, bool caseSensitive, bool, bool, int flags, int *length) { - const bool posix = (flags & SCFIND_POSIX) != 0; - const int increment = (minPos <= maxPos) ? 1 : -1; - int startPos = minPos; - int endPos = maxPos; +#ifdef CXX11_REGEX + if (flags & SCFIND_CXX11REGEX) { + return Cxx11RegexFindText(doc, minPos, maxPos, s, + caseSensitive, length, search); + } +#endif - // Range endpoints should not be inside DBCS characters, but just in case, move them. - startPos = doc->MovePositionOutsideChar(startPos, 1, false); - endPos = doc->MovePositionOutsideChar(endPos, 1, false); + const RESearchRange resr(doc, minPos, maxPos); + + const bool posix = (flags & SCFIND_POSIX) != 0; const char *errmsg = search.Compile(s, *length, caseSensitive, posix); if (errmsg) { @@ -2219,50 +2658,34 @@ long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s // Replace first '.' with '-' in each property file variable reference: // Search: \$(\([A-Za-z0-9_-]+\)\.\([A-Za-z0-9_.]+\)) // Replace: $(\1-\2) - int lineRangeStart = doc->LineFromPosition(startPos); - const int lineRangeEnd = doc->LineFromPosition(endPos); - if ((increment == 1) && - (startPos >= doc->LineEnd(lineRangeStart)) && - (lineRangeStart < lineRangeEnd)) { - // the start position is at end of line or between line end characters. - lineRangeStart++; - startPos = doc->LineStart(lineRangeStart); - } else if ((increment == -1) && - (startPos <= doc->LineStart(lineRangeStart)) && - (lineRangeStart > lineRangeEnd)) { - // the start position is at beginning of line. - lineRangeStart--; - startPos = doc->LineEnd(lineRangeStart); - } int pos = -1; int lenRet = 0; const char searchEnd = s[*length - 1]; const char searchEndPrev = (*length > 1) ? s[*length - 2] : '\0'; - const int lineRangeBreak = lineRangeEnd + increment; - for (int line = lineRangeStart; line != lineRangeBreak; line += increment) { + for (int line = resr.lineRangeStart; line != resr.lineRangeBreak; line += resr.increment) { int startOfLine = doc->LineStart(line); int endOfLine = doc->LineEnd(line); - if (increment == 1) { - if (line == lineRangeStart) { - if ((startPos != startOfLine) && (s[0] == '^')) + if (resr.increment == 1) { + if (line == resr.lineRangeStart) { + if ((resr.startPos != startOfLine) && (s[0] == '^')) continue; // Can't match start of line if start position after start of line - startOfLine = startPos; + startOfLine = resr.startPos; } - if (line == lineRangeEnd) { - if ((endPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\')) + if (line == resr.lineRangeEnd) { + if ((resr.endPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\')) continue; // Can't match end of line if end position before end of line - endOfLine = endPos; + endOfLine = resr.endPos; } } else { - if (line == lineRangeEnd) { - if ((endPos != startOfLine) && (s[0] == '^')) + if (line == resr.lineRangeEnd) { + if ((resr.endPos != startOfLine) && (s[0] == '^')) continue; // Can't match start of line if end position after start of line - startOfLine = endPos; + startOfLine = resr.endPos; } - if (line == lineRangeStart) { - if ((startPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\')) + if (line == resr.lineRangeStart) { + if ((resr.startPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\')) continue; // Can't match end of line if start position before end of line - endOfLine = startPos; + endOfLine = resr.startPos; } } @@ -2274,7 +2697,7 @@ long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s search.eopat[0] = doc->MovePositionOutsideChar(search.eopat[0], 1, false); lenRet = search.eopat[0] - search.bopat[0]; // There can be only one start of a line, so no need to look for last match in line - if ((increment == -1) && (s[0] != '^')) { + if ((resr.increment == -1) && (s[0] != '^')) { // Check for the last match on this line. int repetitions = 1000; // Break out of infinite loop while (success && (search.eopat[0] <= endOfLine) && (repetitions--)) { diff --git a/src/Document.h b/src/Document.h index e84be14e4..477b4dc60 100644 --- a/src/Document.h +++ b/src/Document.h @@ -188,6 +188,10 @@ public: } }; +struct RegexError : public std::runtime_error { + RegexError() : std::runtime_error("regex failure") {} +}; + /** */ class Document : PerLine, public IDocumentWithLineEnd, public ILoader { @@ -271,7 +275,7 @@ public: bool IsCrLf(int pos) const; int LenChar(int pos); bool InGoodUTF8(int pos, int &start, int &end) const; - int MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd=true); + int MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd=true) const; int NextPosition(int pos, int moveDir) const; bool NextCharacter(int &pos, int moveDir) const; // Returns true if pos changed int SCI_METHOD GetRelativePosition(int positionStart, int characterOffset) const; @@ -345,6 +349,7 @@ public: void DeleteAllMarks(int markerNum); int LineFromHandle(int markerHandle); int SCI_METHOD LineStart(int line) const; + bool IsLineStartPosition(int position) const; int SCI_METHOD LineEnd(int line) const; int LineEndPosition(int position) const; bool IsLineEndPosition(int position) const; @@ -364,6 +369,16 @@ public: int NextWordEnd(int pos, int delta); int SCI_METHOD Length() const { return cb.Length(); } void Allocate(int newSize) { cb.Allocate(newSize); } + + struct CharacterExtracted { + unsigned int character; + unsigned int widthBytes; + CharacterExtracted(unsigned int character_, unsigned int widthBytes_) : + character(character_), widthBytes(widthBytes_) { + } + }; + CharacterExtracted ExtractCharacter(int position) const; + bool MatchesWordOptions(bool word, bool wordStart, int pos, int length) const; bool HasCaseFolder(void) const; void SetCaseFolder(CaseFolder *pcf_); diff --git a/src/Editor.cxx b/src/Editor.cxx index 8748d89c1..80f96a7c8 100644 --- a/src/Editor.cxx +++ b/src/Editor.cxx @@ -3639,18 +3639,23 @@ long Editor::FindText( int lengthFound = istrlen(ft->lpstrText); if (!pdoc->HasCaseFolder()) pdoc->SetCaseFolder(CaseFolderForEncoding()); - int pos = pdoc->FindText(ft->chrg.cpMin, ft->chrg.cpMax, ft->lpstrText, - (wParam & SCFIND_MATCHCASE) != 0, - (wParam & SCFIND_WHOLEWORD) != 0, - (wParam & SCFIND_WORDSTART) != 0, - (wParam & SCFIND_REGEXP) != 0, - static_cast<int>(wParam), - &lengthFound); - if (pos != -1) { - ft->chrgText.cpMin = pos; - ft->chrgText.cpMax = pos + lengthFound; + try { + int pos = pdoc->FindText(ft->chrg.cpMin, ft->chrg.cpMax, ft->lpstrText, + (wParam & SCFIND_MATCHCASE) != 0, + (wParam & SCFIND_WHOLEWORD) != 0, + (wParam & SCFIND_WORDSTART) != 0, + (wParam & SCFIND_REGEXP) != 0, + static_cast<int>(wParam), + &lengthFound); + if (pos != -1) { + ft->chrgText.cpMin = pos; + ft->chrgText.cpMax = pos + lengthFound; + } + return pos; + } catch (RegexError &) { + errorStatus = SC_STATUS_WARN_REGEX; + return -1; } - return pos; } /** @@ -3684,22 +3689,27 @@ long Editor::SearchText( int lengthFound = istrlen(txt); if (!pdoc->HasCaseFolder()) pdoc->SetCaseFolder(CaseFolderForEncoding()); - if (iMessage == SCI_SEARCHNEXT) { - pos = pdoc->FindText(searchAnchor, pdoc->Length(), txt, - (wParam & SCFIND_MATCHCASE) != 0, - (wParam & SCFIND_WHOLEWORD) != 0, - (wParam & SCFIND_WORDSTART) != 0, - (wParam & SCFIND_REGEXP) != 0, - static_cast<int>(wParam), - &lengthFound); - } else { - pos = pdoc->FindText(searchAnchor, 0, txt, - (wParam & SCFIND_MATCHCASE) != 0, - (wParam & SCFIND_WHOLEWORD) != 0, - (wParam & SCFIND_WORDSTART) != 0, - (wParam & SCFIND_REGEXP) != 0, - static_cast<int>(wParam), - &lengthFound); + try { + if (iMessage == SCI_SEARCHNEXT) { + pos = pdoc->FindText(searchAnchor, pdoc->Length(), txt, + (wParam & SCFIND_MATCHCASE) != 0, + (wParam & SCFIND_WHOLEWORD) != 0, + (wParam & SCFIND_WORDSTART) != 0, + (wParam & SCFIND_REGEXP) != 0, + static_cast<int>(wParam), + &lengthFound); + } else { + pos = pdoc->FindText(searchAnchor, 0, txt, + (wParam & SCFIND_MATCHCASE) != 0, + (wParam & SCFIND_WHOLEWORD) != 0, + (wParam & SCFIND_WORDSTART) != 0, + (wParam & SCFIND_REGEXP) != 0, + static_cast<int>(wParam), + &lengthFound); + } + } catch (RegexError &) { + errorStatus = SC_STATUS_WARN_REGEX; + return -1; } if (pos != -1) { SetSelection(pos, pos + lengthFound); @@ -3734,18 +3744,23 @@ long Editor::SearchInTarget(const char *text, int length) { if (!pdoc->HasCaseFolder()) pdoc->SetCaseFolder(CaseFolderForEncoding()); - int pos = pdoc->FindText(targetStart, targetEnd, text, - (searchFlags & SCFIND_MATCHCASE) != 0, - (searchFlags & SCFIND_WHOLEWORD) != 0, - (searchFlags & SCFIND_WORDSTART) != 0, - (searchFlags & SCFIND_REGEXP) != 0, - searchFlags, - &lengthFound); - if (pos != -1) { - targetStart = pos; - targetEnd = pos + lengthFound; + try { + int pos = pdoc->FindText(targetStart, targetEnd, text, + (searchFlags & SCFIND_MATCHCASE) != 0, + (searchFlags & SCFIND_WHOLEWORD) != 0, + (searchFlags & SCFIND_WORDSTART) != 0, + (searchFlags & SCFIND_REGEXP) != 0, + searchFlags, + &lengthFound); + if (pos != -1) { + targetStart = pos; + targetEnd = pos + lengthFound; + } + return pos; + } catch (RegexError &) { + errorStatus = SC_STATUS_WARN_REGEX; + return -1; } - return pos; } void Editor::GoToLine(int lineNo) { diff --git a/src/RESearch.h b/src/RESearch.h index 38875a3f4..3a7f0e4d6 100644 --- a/src/RESearch.h +++ b/src/RESearch.h @@ -33,6 +33,7 @@ class RESearch { public: explicit RESearch(CharClassify *charClassTable); ~RESearch(); + void Clear(); void GrabMatches(CharacterIndexer &ci); const char *Compile(const char *pattern, int length, bool caseSensitive, bool posix); int Execute(CharacterIndexer &ci, int lp, int endp); @@ -46,7 +47,6 @@ public: std::string pat[MAXTAG]; private: - void Clear(); void ChSet(unsigned char c); void ChSetWithCase(unsigned char c, bool caseSensitive); int GetBackslashExpression(const char *pattern, int &incr); diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 2286e047d..d19828a52 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -20,6 +20,7 @@ namespace Scintilla { enum { SURROGATE_LEAD_FIRST = 0xD800 }; enum { SURROGATE_TRAIL_FIRST = 0xDC00 }; enum { SURROGATE_TRAIL_LAST = 0xDFFF }; +enum { SUPPLEMENTAL_PLANE_FIRST = 0x10000 }; unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) { unsigned int len = 0; @@ -138,6 +139,51 @@ unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsig return ui; } +unsigned int UTF32FromUTF8(const char *s, unsigned int len, unsigned int *tbuf, unsigned int tlen) { + unsigned int ui=0; + const unsigned char *us = reinterpret_cast<const unsigned char *>(s); + unsigned int i=0; + while ((i<len) && (ui<tlen)) { + unsigned char ch = us[i++]; + wchar_t value = 0; + if (ch < 0x80) { + value = ch; + } else if (((len-i) >= 1) && (ch < 0x80 + 0x40 + 0x20)) { + value = (ch & 0x1F) << 6; + ch = us[i++]; + value += ch & 0x7F; + } else if (((len-i) >= 2) && (ch < 0x80 + 0x40 + 0x20 + 0x10)) { + value = (ch & 0xF) << 12; + ch = us[i++]; + value += (ch & 0x7F) << 6; + ch = us[i++]; + value += ch & 0x7F; + } else if ((len-i) >= 3) { + value = (ch & 0x7) << 18; + ch = us[i++]; + value += (ch & 0x3F) << 12; + ch = us[i++]; + value += (ch & 0x3F) << 6; + ch = us[i++]; + value += ch & 0x3F; + } + tbuf[ui] = value; + ui++; + } + return ui; +} + +unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) { + if (val < SUPPLEMENTAL_PLANE_FIRST) { + tbuf[0] = static_cast<wchar_t>(val); + return 1; + } else { + tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST); + tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST); + return 2; + } +} + int UTF8BytesOfLead[256]; static bool initialisedBytesOfLead = false; diff --git a/src/UniConversion.h b/src/UniConversion.h index 753490bab..760f50476 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -14,11 +14,15 @@ namespace Scintilla { const int UTF8MaxBytes = 4; +const int unicodeReplacementChar = 0xFFFD; + unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen); void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len); unsigned int UTF8CharLength(unsigned char ch); unsigned int UTF16Length(const char *s, unsigned int len); unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen); +unsigned int UTF32FromUTF8(const char *s, unsigned int len, unsigned int *tbuf, unsigned int tlen); +unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf); extern int UTF8BytesOfLead[256]; void UTF8BytesOfLeadInitialise(); |