aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/Document.cxx
diff options
context:
space:
mode:
authorNeil <nyamatongwe@gmail.com>2014-10-02 18:17:13 +1000
committerNeil <nyamatongwe@gmail.com>2014-10-02 18:17:13 +1000
commit2603f1e2074b0f880886b533ffc47ecef4fd33f7 (patch)
tree53bfaff1eca31d0768f43d0c496c81b551230a2d /src/Document.cxx
parent8c42cc95ae3fbfdb8d4b7e8893c2fee283efe1a6 (diff)
downloadscintilla-mirror-2603f1e2074b0f880886b533ffc47ecef4fd33f7.tar.gz
Allow using C++11 <regex> for searches as a provisional feature.
Diffstat (limited to 'src/Document.cxx')
-rw-r--r--src/Document.cxx501
1 files changed, 462 insertions, 39 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index c4faee603..4d2c48f1f 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -16,6 +16,10 @@
#include <vector>
#include <algorithm>
+#ifdef CXX11_REGEX
+#include <regex>
+#endif
+
#include "Platform.h"
#include "ILexer.h"
@@ -336,6 +340,10 @@ int SCI_METHOD Document::LineStart(int line) const {
return cb.LineStart(line);
}
+// Report whether position is exactly the start of the line that contains it.
+bool Document::IsLineStartPosition(int position) const {
+	const int lineContaining = LineFromPosition(position);
+	return position == LineStart(lineContaining);
+}
+
int SCI_METHOD Document::LineEnd(int line) const {
if (line >= LinesTotal() - 1) {
return LineStart(line + 1);
@@ -602,7 +610,7 @@ bool Document::InGoodUTF8(int pos, int &start, int &end) const {
// When lines are terminated with \r\n pairs which should be treated as one character.
// When displaying DBCS text such as Japanese.
// If moving, move the position in the indicated direction.
-int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) {
+int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) const {
//Platform::DebugPrintf("NoCRLF %d %d\n", pos, moveDir);
// If out of range, just return minimum/maximum value.
if (pos <= 0)
@@ -1587,6 +1595,25 @@ void Document::SetCaseFolder(CaseFolder *pcf_) {
pcf = pcf_;
}
+/**
+ * Decode the UTF-8 character beginning at position.
+ * The result pairs the character value with the number of bytes it occupies.
+ * A sequence that UTF8Classify flags as invalid is reported as
+ * unicodeReplacementChar with a width of one byte.
+ */
+Document::CharacterExtracted Document::ExtractCharacter(int position) const {
+	const unsigned char lead = static_cast<unsigned char>(cb.CharAt(position));
+	if (UTF8IsAscii(lead)) {
+		// Fast path: an ASCII byte is a complete character
+		return CharacterExtracted(lead, 1);
+	}
+	// Collect the lead byte and the trail bytes indicated by UTF8BytesOfLead
+	unsigned char charBytes[UTF8MaxBytes] = { lead, 0, 0, 0 };
+	const int widthCharBytes = UTF8BytesOfLead[lead];
+	int b = 1;
+	while (b < widthCharBytes) {
+		charBytes[b] = static_cast<unsigned char>(cb.CharAt(position + b));
+		b++;
+	}
+	const int utf8status = UTF8Classify(charBytes, widthCharBytes);
+	if ((utf8status & UTF8MaskInvalid) == 0) {
+		return CharacterExtracted(UnicodeFromBytes(charBytes), utf8status & UTF8MaskWidth);
+	}
+	// Invalid sequence: consume only the lead byte
+	return CharacterExtracted(unicodeReplacementChar, 1);
+}
+
/**
* Find text in document, supporting both forward and backward
* searches (just pass minPos > maxPos to do a backward search)
@@ -2178,6 +2205,61 @@ private:
std::string substituted;
};
+namespace {
+
+/**
+* RESearchRange records the positions and lines that bound a regular expression
+* search along with the direction (increment) in which lines are visited.
+*/
+class RESearchRange {
+public:
+	const Document *doc;
+	int increment;
+	int startPos;
+	int endPos;
+	int lineRangeStart;
+	int lineRangeEnd;
+	int lineRangeBreak;
+	// minPos > maxPos selects a backwards search (increment == -1).
+	RESearchRange(const Document *doc_, int minPos, int maxPos) : doc(doc_) {
+		const bool forwards = minPos <= maxPos;
+		increment = forwards ? 1 : -1;
+
+		// Endpoints are not expected to fall inside DBCS characters; adjust them in case they do.
+		startPos = doc->MovePositionOutsideChar(minPos, 1, false);
+		endPos = doc->MovePositionOutsideChar(maxPos, 1, false);
+
+		lineRangeStart = doc->LineFromPosition(startPos);
+		lineRangeEnd = doc->LineFromPosition(endPos);
+		if (forwards) {
+			if ((startPos >= doc->LineEnd(lineRangeStart)) &&
+				(lineRangeStart < lineRangeEnd)) {
+				// Start is at the end of its line or between line end characters:
+				// begin at the start of the following line instead.
+				lineRangeStart++;
+				startPos = doc->LineStart(lineRangeStart);
+			}
+		} else {
+			if ((startPos <= doc->LineStart(lineRangeStart)) &&
+				(lineRangeStart > lineRangeEnd)) {
+				// Start is at the beginning of its line:
+				// begin at the end of the preceding line instead.
+				lineRangeStart--;
+				startPos = doc->LineEnd(lineRangeStart);
+			}
+		}
+		// Line value at which a loop stepping by increment should stop
+		lineRangeBreak = lineRangeEnd + increment;
+	}
+	// Extent of line, clipped to the search positions on the first and last lines of the range.
+	Range LineRange(int line) const {
+		const bool forwards = increment == 1;
+		// The line and position holding the lower bound depend on the direction
+		const int lineLow = forwards ? lineRangeStart : lineRangeEnd;
+		const int posLow = forwards ? startPos : endPos;
+		const int lineHigh = forwards ? lineRangeEnd : lineRangeStart;
+		const int posHigh = forwards ? endPos : startPos;
+
+		Range range(doc->LineStart(line), doc->LineEnd(line));
+		if (line == lineLow)
+			range.start = posLow;
+		if (line == lineHigh)
+			range.end = posHigh;
+		return range;
+	}
+};
+
// Define a way for the Regular Expression code to access the document
class DocumentIndexer : public CharacterIndexer {
Document *pdoc;
@@ -2198,18 +2280,375 @@ public:
}
};
+#ifdef CXX11_REGEX
+
+// Bidirectional iterator that presents the document one byte at a time.
+class ByteIterator : public std::iterator<std::bidirectional_iterator_tag, char> {
+public:
+	const Document *doc;
+	Position position;
+	ByteIterator(const Document *doc_ = 0, Position position_ = 0) : doc(doc_), position(position_) {
+	}
+	ByteIterator(const ByteIterator &other) : doc(other.doc), position(other.position) {
+	}
+	ByteIterator &operator=(const ByteIterator &other) {
+		// Only plain values are copied so self-assignment needs no special case
+		doc = other.doc;
+		position = other.position;
+		return *this;
+	}
+	char operator*() const {
+		return doc->CharAt(position);
+	}
+	ByteIterator &operator++() {
+		++position;
+		return *this;
+	}
+	ByteIterator operator++(int) {
+		ByteIterator before(*this);
+		++position;
+		return before;
+	}
+	ByteIterator &operator--() {
+		--position;
+		return *this;
+	}
+	bool operator==(const ByteIterator &other) const {
+		return (doc == other.doc) && (position == other.position);
+	}
+	bool operator!=(const ByteIterator &other) const {
+		return !(*this == other);
+	}
+	// Byte position in the document
+	int Pos() const {
+		return position;
+	}
+	// Every byte is a whole element so there is nothing to round
+	int PosRoundUp() const {
+		return position;
+	}
+};
+
+// On Windows, wchar_t is 16 bits wide and on Unix it is 32 bits wide.
+// Would be better to use sizeof(wchar_t) or similar to differentiate
+// but easier for now to hard-code platforms.
+// C++11 has char16_t and char32_t but neither Clang nor Visual C++
+// appear to allow specializing basic_regex over these.
+
+#ifdef _WIN32
+#define WCHAR_T_IS_16 1
+#else
+#define WCHAR_T_IS_16 0
+#endif
+
+#if WCHAR_T_IS_16
+
+// On Windows, report non-BMP characters as 2 separate surrogates as that
+// matches wregex since it is based on wchar_t.
+// The character at the current position is decoded as soon as the iterator is
+// constructed with a document, and again after every move, so lenBytes and
+// lenCharacters are valid before the first increment or dereference.
+class UTF8Iterator : public std::iterator<std::bidirectional_iterator_tag, wchar_t> {
+	// These 3 fields determine the iterator position and are used for comparisons
+	const Document *doc;
+	Position position;
+	size_t characterIndex;
+	// Remaining fields are derived from the determining fields so are excluded in comparisons
+	unsigned int lenBytes;
+	size_t lenCharacters;
+	wchar_t buffered[2];
+public:
+	UTF8Iterator(const Document *doc_ = 0, Position position_ = 0) :
+		doc(doc_), position(position_), characterIndex(0), lenBytes(0), lenCharacters(0) {
+		buffered[0] = 0;
+		buffered[1] = 0;
+		if (doc) {
+			// Decode eagerly: with lenBytes left at 0, incrementing an iterator
+			// that had never been dereferenced would leave position unchanged.
+			ReadCharacter();
+		}
+	}
+	UTF8Iterator(const UTF8Iterator &other) {
+		doc = other.doc;
+		position = other.position;
+		characterIndex = other.characterIndex;
+		lenBytes = other.lenBytes;
+		lenCharacters = other.lenCharacters;
+		buffered[0] = other.buffered[0];
+		buffered[1] = other.buffered[1];
+	}
+	UTF8Iterator &operator=(const UTF8Iterator &other) {
+		if (this != &other) {
+			doc = other.doc;
+			position = other.position;
+			characterIndex = other.characterIndex;
+			lenBytes = other.lenBytes;
+			lenCharacters = other.lenCharacters;
+			buffered[0] = other.buffered[0];
+			buffered[1] = other.buffered[1];
+		}
+		return *this;
+	}
+	// Current UTF-16 code unit; the character was already decoded on construction or by the last move.
+	wchar_t operator*() const {
+		return buffered[characterIndex];
+	}
+	UTF8Iterator &operator++() {
+		if ((characterIndex + 1) < (lenCharacters)) {
+			// Step to the second code unit of the current character
+			characterIndex++;
+		} else {
+			// Step over the whole character and decode the following one
+			position += lenBytes;
+			ReadCharacter();
+			characterIndex = 0;
+		}
+		return *this;
+	}
+	UTF8Iterator operator++(int) {
+		UTF8Iterator retVal(*this);
+		++(*this);
+		return retVal;
+	}
+	UTF8Iterator &operator--() {
+		if (characterIndex) {
+			characterIndex--;
+		} else {
+			// Move to the previous character and land on its last code unit
+			position = doc->NextPosition(position, -1);
+			ReadCharacter();
+			characterIndex = lenCharacters - 1;
+		}
+		return *this;
+	}
+	bool operator==(const UTF8Iterator &other) const {
+		// Only test the determining fields, not the character widths and values derived from this
+		return doc == other.doc &&
+			position == other.position &&
+			characterIndex == other.characterIndex;
+	}
+	bool operator!=(const UTF8Iterator &other) const {
+		// Only test the determining fields, not the character widths and values derived from this
+		return !(*this == other);
+	}
+	// Byte position of the character currently referenced
+	int Pos() const {
+		return position;
+	}
+	// Byte position rounded up to the end of the character when part way through it
+	int PosRoundUp() const {
+		if (characterIndex)
+			return position + lenBytes;	// Force to end of character
+		else
+			return position;
+	}
+private:
+	// Decode the character at position into buffered, updating lenBytes and lenCharacters.
+	void ReadCharacter() {
+		Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position);
+		lenBytes = charExtracted.widthBytes;
+		if (charExtracted.character == unicodeReplacementChar) {
+			lenCharacters = 1;
+			buffered[0] = static_cast<wchar_t>(charExtracted.character);
+		} else {
+			lenCharacters = UTF16FromUTF32Character(charExtracted.character, buffered);
+		}
+	}
+};
+
+#else
+
+// On Unix, report non-BMP characters as single characters
+
+// Bidirectional iterator that presents a UTF-8 document one whole character at a time.
+class UTF8Iterator : public std::iterator<std::bidirectional_iterator_tag, wchar_t> {
+	const Document *doc;
+	Position position;
+public:
+	UTF8Iterator(const Document *doc_=0, Position position_=0) : doc(doc_), position(position_) {
+	}
+	UTF8Iterator(const UTF8Iterator &other) : doc(other.doc), position(other.position) {
+	}
+	UTF8Iterator &operator=(const UTF8Iterator &other) {
+		// Only plain values are copied so self-assignment needs no special case
+		doc = other.doc;
+		position = other.position;
+		return *this;
+	}
+	wchar_t operator*() const {
+		return doc->ExtractCharacter(position).character;
+	}
+	UTF8Iterator &operator++() {
+		position = doc->NextPosition(position, 1);
+		return *this;
+	}
+	UTF8Iterator operator++(int) {
+		UTF8Iterator before(*this);
+		position = doc->NextPosition(position, 1);
+		return before;
+	}
+	UTF8Iterator &operator--() {
+		position = doc->NextPosition(position, -1);
+		return *this;
+	}
+	bool operator==(const UTF8Iterator &other) const {
+		return (doc == other.doc) && (position == other.position);
+	}
+	bool operator!=(const UTF8Iterator &other) const {
+		return !(*this == other);
+	}
+	// Byte position of the character currently referenced
+	int Pos() const {
+		return position;
+	}
+	// Each element is a whole character so there is nothing to round
+	int PosRoundUp() const {
+		return position;
+	}
+};
+
+#endif
+
+// Choose match flags for a search over startPos..endPos: match_not_bol is set unless
+// startPos is a line start and match_not_eol is set unless endPos is a line end.
+std::regex_constants::match_flag_type MatchFlags(const Document *doc, int startPos, int endPos) {
+	const bool beginsLine = doc->IsLineStartPosition(startPos);
+	const bool endsLine = doc->IsLineEndPosition(endPos);
+	std::regex_constants::match_flag_type flags = std::regex_constants::match_default;
+	if (!beginsLine) {
+		flags = flags | std::regex_constants::match_not_bol;
+	}
+	if (!endsLine) {
+		flags = flags | std::regex_constants::match_not_eol;
+	}
+	return flags;
+}
+
+// Search each line of resr in turn with regexp and stop at the first line that matches.
+// For a backwards search the last match on that line is chosen.
+// On success the positions and text of each sub-match are stored in search.
+template<typename Iterator, typename Regex>
+bool MatchOnLines(const Document *doc, const Regex &regexp, const RESearchRange &resr, RESearch &search) {
+	std::match_results<Iterator> match;
+	bool matched = false;
+
+	// MSVC and libc++ have problems with ^ and $ matching line ends inside a range
+	// If they didn't then the line by line iteration could be removed for the forwards
+	// case and replaced with the following 4 lines:
+	//	Iterator uiStart(doc, startPos);
+	//	Iterator uiEnd(doc, endPos);
+	//	flagsMatch = MatchFlags(doc, startPos, endPos);
+	//	matched = std::regex_search(uiStart, uiEnd, match, regexp, flagsMatch);
+
+	// Visit lines one at a time in the search direction.
+	for (int line = resr.lineRangeStart; line != resr.lineRangeBreak; line += resr.increment) {
+		const Range lineRange = resr.LineRange(line);
+		Iterator itLineBegin(doc, lineRange.start);
+		Iterator itLineEnd(doc, lineRange.end);
+		const std::regex_constants::match_flag_type flagsLine =
+			MatchFlags(doc, lineRange.start, lineRange.end);
+		matched = std::regex_search(itLineBegin, itLineEnd, match, regexp, flagsLine);
+		if (!matched) {
+			continue;
+		}
+		if (resr.increment == -1) {
+			// Backwards search: keep searching after the current match until no
+			// later match is found, leaving the last match on the line in match.
+			for (;;) {
+				Iterator itNext(doc, match[0].second.PosRoundUp());
+				const std::regex_constants::match_flag_type flagsRest =
+					MatchFlags(doc, itNext.Pos(), lineRange.end);
+				std::match_results<Iterator> matchNext;
+				if (!std::regex_search(itNext, itLineEnd, matchNext, regexp, flagsRest)) {
+					break;
+				}
+				if (match[0].first == match[0].second) {
+					// Empty match means failure so exit
+					return false;
+				}
+				match = matchNext;
+			}
+		}
+		// A match was found on this line so later lines are not examined
+		break;
+	}
+	if (!matched) {
+		return false;
+	}
+	// Copy the bounds and the text of every sub-match into search
+	for (size_t co = 0; co < match.size(); co++) {
+		search.bopat[co] = match[co].first.Pos();
+		search.eopat[co] = match[co].second.PosRoundUp();
+		size_t lenMatch = search.eopat[co] - search.bopat[co];
+		search.pat[co].resize(lenMatch);
+		for (size_t iPos = 0; iPos < lenMatch; iPos++) {
+			search.pat[co][iPos] = doc->CharAt(iPos + search.bopat[co]);
+		}
+	}
+	return true;
+}
+
+// Search between minPos and maxPos for the regular expression s using C++11 <regex>.
+// Returns the position of the match and stores its length in *length, or returns -1
+// when nothing matches or an unexpected exception occurs.
+// Throws RegexError when the standard library rejects the expression.
+long Cxx11RegexFindText(Document *doc, int minPos, int maxPos, const char *s,
+	bool caseSensitive, int *length, RESearch &search) {
+	const RESearchRange resr(doc, minPos, maxPos);
+	try {
+		//ElapsedTime et;
+		// Flags that appear to have no effect:
+		// | std::regex::collate | std::regex::extended;
+		const std::regex::flag_type flagsRe = caseSensitive ?
+			std::regex::ECMAScript :
+			(std::regex::ECMAScript | std::regex::icase);
+
+		// Reset the RESearch so the match results can be filled in
+		search.Clear();
+
+		bool matched = false;
+		if (SC_CP_UTF8 != doc->dbcsCodePage) {
+			// Not UTF-8: match directly over the document bytes
+			std::regex regexp;
+			regexp.assign(s, flagsRe);
+			matched = MatchOnLines<ByteIterator>(doc, regexp, resr, search);
+		} else {
+			// UTF-8: convert the pattern to wide characters and match over whole characters
+			unsigned int lenS = static_cast<unsigned int>(strlen(s));
+			std::vector<wchar_t> ws(lenS + 1);
+#if WCHAR_T_IS_16
+			size_t outLen = UTF16FromUTF8(s, lenS, &ws[0], lenS);
+#else
+			size_t outLen = UTF32FromUTF8(s, lenS, reinterpret_cast<unsigned int *>(&ws[0]), lenS);
+#endif
+			ws[outLen] = 0;
+			std::wregex regexp;
+#if defined(__APPLE__)
+			// Using a UTF-8 locale doesn't change to Unicode over a byte buffer so '.'
+			// is one byte not one character.
+			// However, on OS X this makes wregex act as Unicode
+			std::locale localeU("en_US.UTF-8");
+			regexp.imbue(localeU);
+#endif
+			regexp.assign(&ws[0], flagsRe);
+			matched = MatchOnLines<UTF8Iterator>(doc, regexp, resr, search);
+		}
+
+		// Example - search in doc/ScintillaHistory.html for
+		//		[[:upper:]]eta[[:space:]]
+		// On MacBook, normally around 1 second but with locale imbued -> 14 seconds.
+		//double durSearch = et.Duration(true);
+		//Platform::DebugPrintf("Search:%9.6g \n", durSearch);
+		if (!matched) {
+			return -1;
+		}
+		*length = search.eopat[0] - search.bopat[0];
+		return search.bopat[0];
+	} catch (std::regex_error &) {
+		// Failed to create regular expression
+		throw RegexError();
+	} catch (...) {
+		// Failed in some other way
+		return -1;
+	}
+}
+
+#endif
+
+}
+
long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s,
bool caseSensitive, bool, bool, int flags,
int *length) {
- const bool posix = (flags & SCFIND_POSIX) != 0;
- const int increment = (minPos <= maxPos) ? 1 : -1;
- int startPos = minPos;
- int endPos = maxPos;
+#ifdef CXX11_REGEX
+ if (flags & SCFIND_CXX11REGEX) {
+ return Cxx11RegexFindText(doc, minPos, maxPos, s,
+ caseSensitive, length, search);
+ }
+#endif
- // Range endpoints should not be inside DBCS characters, but just in case, move them.
- startPos = doc->MovePositionOutsideChar(startPos, 1, false);
- endPos = doc->MovePositionOutsideChar(endPos, 1, false);
+ const RESearchRange resr(doc, minPos, maxPos);
+
+ const bool posix = (flags & SCFIND_POSIX) != 0;
const char *errmsg = search.Compile(s, *length, caseSensitive, posix);
if (errmsg) {
@@ -2219,50 +2658,34 @@ long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s
// Replace first '.' with '-' in each property file variable reference:
// Search: \$(\([A-Za-z0-9_-]+\)\.\([A-Za-z0-9_.]+\))
// Replace: $(\1-\2)
- int lineRangeStart = doc->LineFromPosition(startPos);
- const int lineRangeEnd = doc->LineFromPosition(endPos);
- if ((increment == 1) &&
- (startPos >= doc->LineEnd(lineRangeStart)) &&
- (lineRangeStart < lineRangeEnd)) {
- // the start position is at end of line or between line end characters.
- lineRangeStart++;
- startPos = doc->LineStart(lineRangeStart);
- } else if ((increment == -1) &&
- (startPos <= doc->LineStart(lineRangeStart)) &&
- (lineRangeStart > lineRangeEnd)) {
- // the start position is at beginning of line.
- lineRangeStart--;
- startPos = doc->LineEnd(lineRangeStart);
- }
int pos = -1;
int lenRet = 0;
const char searchEnd = s[*length - 1];
const char searchEndPrev = (*length > 1) ? s[*length - 2] : '\0';
- const int lineRangeBreak = lineRangeEnd + increment;
- for (int line = lineRangeStart; line != lineRangeBreak; line += increment) {
+ for (int line = resr.lineRangeStart; line != resr.lineRangeBreak; line += resr.increment) {
int startOfLine = doc->LineStart(line);
int endOfLine = doc->LineEnd(line);
- if (increment == 1) {
- if (line == lineRangeStart) {
- if ((startPos != startOfLine) && (s[0] == '^'))
+ if (resr.increment == 1) {
+ if (line == resr.lineRangeStart) {
+ if ((resr.startPos != startOfLine) && (s[0] == '^'))
continue; // Can't match start of line if start position after start of line
- startOfLine = startPos;
+ startOfLine = resr.startPos;
}
- if (line == lineRangeEnd) {
- if ((endPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\'))
+ if (line == resr.lineRangeEnd) {
+ if ((resr.endPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\'))
continue; // Can't match end of line if end position before end of line
- endOfLine = endPos;
+ endOfLine = resr.endPos;
}
} else {
- if (line == lineRangeEnd) {
- if ((endPos != startOfLine) && (s[0] == '^'))
+ if (line == resr.lineRangeEnd) {
+ if ((resr.endPos != startOfLine) && (s[0] == '^'))
continue; // Can't match start of line if end position after start of line
- startOfLine = endPos;
+ startOfLine = resr.endPos;
}
- if (line == lineRangeStart) {
- if ((startPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\'))
+ if (line == resr.lineRangeStart) {
+ if ((resr.startPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\'))
continue; // Can't match end of line if start position before end of line
- endOfLine = startPos;
+ endOfLine = resr.startPos;
}
}
@@ -2274,7 +2697,7 @@ long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s
search.eopat[0] = doc->MovePositionOutsideChar(search.eopat[0], 1, false);
lenRet = search.eopat[0] - search.bopat[0];
// There can be only one start of a line, so no need to look for last match in line
- if ((increment == -1) && (s[0] != '^')) {
+ if ((resr.increment == -1) && (s[0] != '^')) {
// Check for the last match on this line.
int repetitions = 1000; // Break out of infinite loop
while (success && (search.eopat[0] <= endOfLine) && (repetitions--)) {