Allow using C++11 <regex> for searches as a provisional feature.

author: Neil <nyamatongwe@gmail.com> 2014-10-02 18:17:13 +1000
committer: Neil <nyamatongwe@gmail.com> 2014-10-02 18:17:13 +1000
commit: 2603f1e2074b0f880886b533ffc47ecef4fd33f7 (patch)
tree: 53bfaff1eca31d0768f43d0c496c81b551230a2d /src/Document.cxx
parent: 8c42cc95ae3fbfdb8d4b7e8893c2fee283efe1a6 (diff)
download: scintilla-mirror-2603f1e2074b0f880886b533ffc47ecef4fd33f7.tar.gz
1 files changed, 462 insertions, 39 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index c4faee603..4d2c48f1f 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -16,6 +16,10 @@
 #include <vector>
 #include <algorithm>
 
+#ifdef CXX11_REGEX
+#include <regex>
+#endif
+
 #include "Platform.h"
 
 #include "ILexer.h"
@@ -336,6 +340,10 @@ int SCI_METHOD Document::LineStart(int line) const {
 	return cb.LineStart(line);
 }
 
+bool Document::IsLineStartPosition(int position) const {
+	return LineStart(LineFromPosition(position)) == position;
+}
+
 int SCI_METHOD Document::LineEnd(int line) const {
 	if (line >= LinesTotal() - 1) {
 		return LineStart(line + 1);
@@ -602,7 +610,7 @@ bool Document::InGoodUTF8(int pos, int &start, int &end) const {
 // When lines are terminated with \r\n pairs which should be treated as one character.
 // When displaying DBCS text such as Japanese.
 // If moving, move the position in the indicated direction.
-int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) {
+int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) const {
 	//Platform::DebugPrintf("NoCRLF %d %d\n", pos, moveDir);
 	// If out of range, just return minimum/maximum value.
 	if (pos <= 0)
@@ -1587,6 +1595,25 @@ void Document::SetCaseFolder(CaseFolder *pcf_) {
 	pcf = pcf_;
 }
 
+Document::CharacterExtracted Document::ExtractCharacter(int position) const {
+	const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(position));
+	if (UTF8IsAscii(leadByte)) {
+		// Common case: ASCII character
+		return CharacterExtracted(leadByte, 1);
+	}
+	const int widthCharBytes = UTF8BytesOfLead[leadByte];
+	unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 };
+	for (int b=1; b<widthCharBytes; b++)
+		charBytes[b] = static_cast<unsigned char>(cb.CharAt(position + b));
+	int utf8status = UTF8Classify(charBytes, widthCharBytes);
+	if (utf8status & UTF8MaskInvalid) {
+		// Treat as invalid and use up just one byte
+		return CharacterExtracted(unicodeReplacementChar, 1);
+	} else {
+		return CharacterExtracted(UnicodeFromBytes(charBytes), utf8status & UTF8MaskWidth);
+	}
+}
+
 /**
  * Find text in document, supporting both forward and backward
  * searches (just pass minPos > maxPos to do a backward search)
@@ -2178,6 +2205,61 @@ private:
 	std::string substituted;
 };
 
+namespace {
+
+/**
+* RESearchRange keeps track of search range.
+*/
+class RESearchRange {
+public:
+	const Document *doc;
+	int increment;
+	int startPos;
+	int endPos;
+	int lineRangeStart;
+	int lineRangeEnd;
+	int lineRangeBreak;
+	RESearchRange(const Document *doc_, int minPos, int maxPos) : doc(doc_) {
+		increment = (minPos <= maxPos) ? 1 : -1;
+
+		// Range endpoints should not be inside DBCS characters, but just in case, move them.
+		startPos = doc->MovePositionOutsideChar(minPos, 1, false);
+		endPos = doc->MovePositionOutsideChar(maxPos, 1, false);
+
+		lineRangeStart = doc->LineFromPosition(startPos);
+		lineRangeEnd = doc->LineFromPosition(endPos);
+		if ((increment == 1) &&
+			(startPos >= doc->LineEnd(lineRangeStart)) &&
+			(lineRangeStart < lineRangeEnd)) {
+			// the start position is at end of line or between line end characters.
+			lineRangeStart++;
+			startPos = doc->LineStart(lineRangeStart);
+		} else if ((increment == -1) &&
+			(startPos <= doc->LineStart(lineRangeStart)) &&
+			(lineRangeStart > lineRangeEnd)) {
+			// the start position is at beginning of line.
+			lineRangeStart--;
+			startPos = doc->LineEnd(lineRangeStart);
+		}
+		lineRangeBreak = lineRangeEnd + increment;
+	}
+	Range LineRange(int line) const {
+		Range range(doc->LineStart(line), doc->LineEnd(line));
+		if (increment == 1) {
+			if (line == lineRangeStart)
+				range.start = startPos;
+			if (line == lineRangeEnd)
+				range.end = endPos;
+		} else {
+			if (line == lineRangeEnd)
+				range.start = endPos;
+			if (line == lineRangeStart)
+				range.end = startPos;
+		}
+		return range;
+	}
+};
+
 // Define a way for the Regular Expression code to access the document
 class DocumentIndexer : public CharacterIndexer {
 	Document *pdoc;
@@ -2198,18 +2280,375 @@ public:
 	}
 };
 
+#ifdef CXX11_REGEX
+
+class ByteIterator : public std::iterator<std::bidirectional_iterator_tag, char> {
+public:
+	const Document *doc;
+	Position position;
+	ByteIterator(const Document *doc_ = 0, Position position_ = 0) : doc(doc_), position(position_) {
+	}
+	ByteIterator(const ByteIterator &other) {
+		doc = other.doc;
+		position = other.position;
+	}
+	ByteIterator &operator=(const ByteIterator &other) {
+		if (this != &other) {
+			doc = other.doc;
+			position = other.position;
+		}
+		return *this;
+	}
+	char operator*() const {
+		return doc->CharAt(position);
+	}
+	ByteIterator &operator++() {
+		position++;
+		return *this;
+	}
+	ByteIterator operator++(int) {
+		ByteIterator retVal(*this);
+		position++;
+		return retVal;
+	}
+	ByteIterator &operator--() {
+		position--;
+		return *this;
+	}
+	bool operator==(const ByteIterator &other) const {
+		return doc == other.doc && position == other.position;
+	}
+	bool operator!=(const ByteIterator &other) const {
+		return doc != other.doc || position != other.position;
+	}
+	int Pos() const {
+		return position;
+	}
+	int PosRoundUp() const {
+		return position;
+	}
+};
+
+// On Windows, wchar_t is 16 bits wide and on Unix it is 32 bits wide.
+// Would be better to use sizeof(wchar_t) or similar to differentiate
+// but easier for now to hard-code platforms.
+// C++11 has char16_t and char32_t but neither Clang nor Visual C++
+// appear to allow specializing basic_regex over these.
+
+#ifdef _WIN32
+#define WCHAR_T_IS_16 1
+#else
+#define WCHAR_T_IS_16 0
+#endif
+
+#if WCHAR_T_IS_16
+
+// On Windows, report non-BMP characters as 2 separate surrogates as that
+// matches wregex since it is based on wchar_t.
+class UTF8Iterator : public std::iterator<std::bidirectional_iterator_tag, wchar_t> {
+	// These 3 fields determine the iterator position and are used for comparisons
+	const Document *doc;
+	Position position;
+	size_t characterIndex;
+	// Remaining fields are derived from the determining fields so are excluded in comparisons
+	unsigned int lenBytes;
+	size_t lenCharacters;
+	wchar_t buffered[2];
+public:
+	UTF8Iterator(const Document *doc_ = 0, Position position_ = 0) :
+		doc(doc_), position(position_), characterIndex(0), lenBytes(0), lenCharacters(0) {
+		buffered[0] = 0;
+		buffered[1] = 0;
+	}
+	UTF8Iterator(const UTF8Iterator &other) {
+		doc = other.doc;
+		position = other.position;
+		characterIndex = other.characterIndex;
+		lenBytes = other.lenBytes;
+		lenCharacters = other.lenCharacters;
+		buffered[0] = other.buffered[0];
+		buffered[1] = other.buffered[1];
+	}
+	UTF8Iterator &operator=(const UTF8Iterator &other) {
+		if (this != &other) {
+			doc = other.doc;
+			position = other.position;
+			characterIndex = other.characterIndex;
+			lenBytes = other.lenBytes;
+			lenCharacters = other.lenCharacters;
+			buffered[0] = other.buffered[0];
+			buffered[1] = other.buffered[1];
+		}
+		return *this;
+	}
+	wchar_t operator*() {
+		if (lenCharacters == 0) {
+			ReadCharacter();
+		}
+		return buffered[characterIndex];
+	}
+	UTF8Iterator &operator++() {
+		if ((characterIndex + 1) < (lenCharacters)) {
+			characterIndex++;
+		} else {
+			position += lenBytes;
+			ReadCharacter();
+			characterIndex = 0;
+		}
+		return *this;
+	}
+	UTF8Iterator operator++(int) {
+		UTF8Iterator retVal(*this);
+		if ((characterIndex + 1) < (lenCharacters)) {
+			characterIndex++;
+		} else {
+			position += lenBytes;
+			ReadCharacter();
+			characterIndex = 0;
+		}
+		return retVal;
+	}
+	UTF8Iterator &operator--() {
+		if (characterIndex) {
+			characterIndex--;
+		} else {
+			position = doc->NextPosition(position, -1);
+			ReadCharacter();
+			characterIndex = lenCharacters - 1;
+		}
+		return *this;
+	}
+	bool operator==(const UTF8Iterator &other) const {
+		// Only test the determining fields, not the character widths and values derived from them
+		return doc == other.doc &&
+			position == other.position &&
+			characterIndex == other.characterIndex;
+	}
+	bool operator!=(const UTF8Iterator &other) const {
+		// Only test the determining fields, not the character widths and values derived from them
+		return doc != other.doc ||
+			position != other.position ||
+			characterIndex != other.characterIndex;
+	}
+	int Pos() const {
+		return position;
+	}
+	int PosRoundUp() const {
+		if (characterIndex)
+			return position + lenBytes;	// Force to end of character
+		else
+			return position;
+	}
+private:
+	void ReadCharacter() {
+		Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position);
+		lenBytes = charExtracted.widthBytes;
+		if (charExtracted.character == unicodeReplacementChar) {
+			lenCharacters = 1;
+			buffered[0] = static_cast<wchar_t>(charExtracted.character);
+		} else {
+			lenCharacters = UTF16FromUTF32Character(charExtracted.character, buffered);
+		}
+	}
+};
+
+#else
+
+// On Unix, report non-BMP characters as single characters
+
+class UTF8Iterator : public std::iterator<std::bidirectional_iterator_tag, wchar_t> {
+	const Document *doc;
+	Position position;
+public:
+	UTF8Iterator(const Document *doc_=0, Position position_=0) : doc(doc_), position(position_) {
+	}
+	UTF8Iterator(const UTF8Iterator &other) {
+		doc = other.doc;
+		position = other.position;
+	}
+	UTF8Iterator &operator=(const UTF8Iterator &other) {
+		if (this != &other) {
+			doc = other.doc;
+			position = other.position;
+		}
+		return *this;
+	}
+	wchar_t operator*() const {
+		Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position);
+		return charExtracted.character;
+	}
+	UTF8Iterator &operator++() {
+		position = doc->NextPosition(position, 1);
+		return *this;
+	}
+	UTF8Iterator operator++(int) {
+		UTF8Iterator retVal(*this);
+		position = doc->NextPosition(position, 1);
+		return retVal;
+	}
+	UTF8Iterator &operator--() {
+		position = doc->NextPosition(position, -1);
+		return *this;
+	}
+	bool operator==(const UTF8Iterator &other) const {
+		return doc == other.doc && position == other.position;
+	}
+	bool operator!=(const UTF8Iterator &other) const {
+		return doc != other.doc || position != other.position;
+	}
+	int Pos() const {
+		return position; 
+	}
+	int PosRoundUp() const {
+		return position; 
+	}
+};
+
+#endif
+
+std::regex_constants::match_flag_type MatchFlags(const Document *doc, int startPos, int endPos) {
+	std::regex_constants::match_flag_type flagsMatch = std::regex_constants::match_default;
+	if (!doc->IsLineStartPosition(startPos))
+		flagsMatch |= std::regex_constants::match_not_bol;
+	if (!doc->IsLineEndPosition(endPos))
+		flagsMatch |= std::regex_constants::match_not_eol;
+	return flagsMatch;
+}
+
+template<typename Iterator, typename Regex>
+bool MatchOnLines(const Document *doc, const Regex &regexp, const RESearchRange &resr, RESearch &search) {
+	bool matched = false;
+	std::match_results<Iterator> match;
+
+	// MSVC and libc++ have problems with ^ and $ matching line ends inside a range
+	// If they didn't then the line by line iteration could be removed for the forwards
+	// case and replaced with the following 4 lines:
+	//	Iterator uiStart(doc, startPos);
+	//	Iterator uiEnd(doc, endPos);
+	//	flagsMatch = MatchFlags(doc, startPos, endPos);
+	//	matched = std::regex_search(uiStart, uiEnd, match, regexp, flagsMatch);
+
+	// Line by line.
+	for (int line = resr.lineRangeStart; line != resr.lineRangeBreak; line += resr.increment) {
+		const Range lineRange = resr.LineRange(line);
+		Iterator itStart(doc, lineRange.start);
+		Iterator itEnd(doc, lineRange.end);
+		std::regex_constants::match_flag_type flagsMatch = MatchFlags(doc, lineRange.start, lineRange.end);
+		matched = std::regex_search(itStart, itEnd, match, regexp, flagsMatch);
+		// Check for the last match on this line.
+		if (matched) {
+			if (resr.increment == -1) {
+				while (matched) {
+					Iterator itNext(doc, match[0].second.PosRoundUp());
+					flagsMatch = MatchFlags(doc, itNext.Pos(), lineRange.end);
+					std::match_results<Iterator> matchNext;
+					matched = std::regex_search(itNext, itEnd, matchNext, regexp, flagsMatch);
+					if (matched) {
+						if (match[0].first == match[0].second) {
+							// Empty match means failure so exit
+							return false;
+						}
+						match = matchNext;
+					}
+				}
+				matched = true;
+			}
+			break;
+		}
+	}
+	if (matched) {
+		for (size_t co = 0; co < match.size(); co++) {
+			search.bopat[co] = match[co].first.Pos();
+			search.eopat[co] = match[co].second.PosRoundUp();
+			size_t lenMatch = search.eopat[co] - search.bopat[co];
+			search.pat[co].resize(lenMatch);
+			for (size_t iPos = 0; iPos < lenMatch; iPos++) {
+				search.pat[co][iPos] = doc->CharAt(iPos + search.bopat[co]);
+			}
+		}
+	}
+	return matched;
+}
+
+long Cxx11RegexFindText(Document *doc, int minPos, int maxPos, const char *s,
+	bool caseSensitive, int *length, RESearch &search) {
+	const RESearchRange resr(doc, minPos, maxPos);
+	try {
+		//ElapsedTime et;
+		std::regex::flag_type flagsRe = std::regex::ECMAScript;
+		// Flags that appear to have no effect:
+		// | std::regex::collate | std::regex::extended;
+		if (!caseSensitive)
+			flagsRe = flagsRe | std::regex::icase;
+
+		// Clear the RESearch so that the matches can be filled in
+		search.Clear();
+
+		bool matched = false;
+		if (SC_CP_UTF8 == doc->dbcsCodePage) {
+			unsigned int lenS = static_cast<unsigned int>(strlen(s));
+			std::vector<wchar_t> ws(lenS + 1);
+#if WCHAR_T_IS_16
+			size_t outLen = UTF16FromUTF8(s, lenS, &ws[0], lenS);
+#else
+			size_t outLen = UTF32FromUTF8(s, lenS, reinterpret_cast<unsigned int *>(&ws[0]), lenS);
+#endif
+			ws[outLen] = 0;
+			std::wregex regexp;
+#if defined(__APPLE__)
+			// Using a UTF-8 locale doesn't switch matching over a byte buffer to Unicode, so '.'
+			// is still one byte, not one character.
+			// However, on OS X imbuing this locale makes wregex act as Unicode.
+			std::locale localeU("en_US.UTF-8");
+			regexp.imbue(localeU);
+#endif
+			regexp.assign(&ws[0], flagsRe);
+			matched = MatchOnLines<UTF8Iterator>(doc, regexp, resr, search);
+
+		} else {
+			std::regex regexp;
+			regexp.assign(s, flagsRe);
+			matched = MatchOnLines<ByteIterator>(doc, regexp, resr, search);
+		}
+
+		int posMatch = -1;
+		if (matched) {
+			posMatch = search.bopat[0];
+			*length = search.eopat[0] - search.bopat[0];
+		}
+		// Example - search in doc/ScintillaHistory.html for
+		// [[:upper:]]eta[[:space:]]
+		// On MacBook, normally around 1 second but with locale imbued -> 14 seconds.
+		//double durSearch = et.Duration(true);
+		//Platform::DebugPrintf("Search:%9.6g \n", durSearch);
+		return posMatch;
+	} catch (std::regex_error &) {
+		// Failed to create regular expression
+		throw RegexError();
+	} catch (...) {
+		// Failed in some other way
+		return -1;
+	}
+}
+
+#endif
+
+}
+
 long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s,
                         bool caseSensitive, bool, bool, int flags,
                         int *length) {
-	const bool posix = (flags & SCFIND_POSIX) != 0;
-	const int increment = (minPos <= maxPos) ? 1 : -1;
 
-	int startPos = minPos;
-	int endPos = maxPos;
+#ifdef CXX11_REGEX
+	if (flags & SCFIND_CXX11REGEX) {
+			return Cxx11RegexFindText(doc, minPos, maxPos, s,
+			caseSensitive, length, search);
+	}
+#endif
 
-	// Range endpoints should not be inside DBCS characters, but just in case, move them.
-	startPos = doc->MovePositionOutsideChar(startPos, 1, false);
-	endPos = doc->MovePositionOutsideChar(endPos, 1, false);
+	const RESearchRange resr(doc, minPos, maxPos);
+
+	const bool posix = (flags & SCFIND_POSIX) != 0;
 
 	const char *errmsg = search.Compile(s, *length, caseSensitive, posix);
 	if (errmsg) {
@@ -2219,50 +2658,34 @@ long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s
 	// Replace first '.' with '-' in each property file variable reference:
 	//     Search: \$(\([A-Za-z0-9_-]+\)\.\([A-Za-z0-9_.]+\))
 	//     Replace: $(\1-\2)
-	int lineRangeStart = doc->LineFromPosition(startPos);
-	const int lineRangeEnd = doc->LineFromPosition(endPos);
-	if ((increment == 1) &&
-		(startPos >= doc->LineEnd(lineRangeStart)) &&
-		(lineRangeStart < lineRangeEnd)) {
-		// the start position is at end of line or between line end characters.
-		lineRangeStart++;
-		startPos = doc->LineStart(lineRangeStart);
-	} else if ((increment == -1) &&
-	           (startPos <= doc->LineStart(lineRangeStart)) &&
-	           (lineRangeStart > lineRangeEnd)) {
-		// the start position is at beginning of line.
-		lineRangeStart--;
-		startPos = doc->LineEnd(lineRangeStart);
-	}
 	int pos = -1;
 	int lenRet = 0;
 	const char searchEnd = s[*length - 1];
 	const char searchEndPrev = (*length > 1) ? s[*length - 2] : '\0';
-	const int lineRangeBreak = lineRangeEnd + increment;
-	for (int line = lineRangeStart; line != lineRangeBreak; line += increment) {
+	for (int line = resr.lineRangeStart; line != resr.lineRangeBreak; line += resr.increment) {
 		int startOfLine = doc->LineStart(line);
 		int endOfLine = doc->LineEnd(line);
-		if (increment == 1) {
-			if (line == lineRangeStart) {
-				if ((startPos != startOfLine) && (s[0] == '^'))
+		if (resr.increment == 1) {
+			if (line == resr.lineRangeStart) {
+				if ((resr.startPos != startOfLine) && (s[0] == '^'))
 					continue;	// Can't match start of line if start position after start of line
-				startOfLine = startPos;
+				startOfLine = resr.startPos;
 			}
-			if (line == lineRangeEnd) {
-				if ((endPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\'))
+			if (line == resr.lineRangeEnd) {
+				if ((resr.endPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\'))
 					continue;	// Can't match end of line if end position before end of line
-				endOfLine = endPos;
+				endOfLine = resr.endPos;
 			}
 		} else {
-			if (line == lineRangeEnd) {
-				if ((endPos != startOfLine) && (s[0] == '^'))
+			if (line == resr.lineRangeEnd) {
+				if ((resr.endPos != startOfLine) && (s[0] == '^'))
 					continue;	// Can't match start of line if end position after start of line
-				startOfLine = endPos;
+				startOfLine = resr.endPos;
 			}
-			if (line == lineRangeStart) {
-				if ((startPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\'))
+			if (line == resr.lineRangeStart) {
+				if ((resr.startPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\'))
 					continue;	// Can't match end of line if start position before end of line
-				endOfLine = startPos;
+				endOfLine = resr.startPos;
 			}
 		}
 
@@ -2274,7 +2697,7 @@ long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s
 			search.eopat[0] = doc->MovePositionOutsideChar(search.eopat[0], 1, false);
 			lenRet = search.eopat[0] - search.bopat[0];
 			// There can be only one start of a line, so no need to look for last match in line
-			if ((increment == -1) && (s[0] != '^')) {
+			if ((resr.increment == -1) && (s[0] != '^')) {
 				// Check for the last match on this line.
 				int repetitions = 1000;	// Break out of infinite loop
 				while (success && (search.eopat[0] <= endOfLine) && (repetitions--)) {
author	Neil <nyamatongwe@gmail.com>	2014-10-02 18:17:13 +1000
committer	Neil <nyamatongwe@gmail.com>	2014-10-02 18:17:13 +1000
commit	2603f1e2074b0f880886b533ffc47ecef4fd33f7 (patch)
tree	53bfaff1eca31d0768f43d0c496c81b551230a2d /src/Document.cxx
parent	8c42cc95ae3fbfdb8d4b7e8893c2fee283efe1a6 (diff)
download	scintilla-mirror-2603f1e2074b0f880886b533ffc47ecef4fd33f7.tar.gz