diff options
Diffstat (limited to 'src/Document.cxx')
| -rw-r--r-- | src/Document.cxx | 501 | 
1 files changed, 462 insertions, 39 deletions
| diff --git a/src/Document.cxx b/src/Document.cxx index c4faee603..4d2c48f1f 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -16,6 +16,10 @@  #include <vector>  #include <algorithm> +#ifdef CXX11_REGEX +#include <regex> +#endif +  #include "Platform.h"  #include "ILexer.h" @@ -336,6 +340,10 @@ int SCI_METHOD Document::LineStart(int line) const {  	return cb.LineStart(line);  } +bool Document::IsLineStartPosition(int position) const { +	return LineStart(LineFromPosition(position)) == position; +} +  int SCI_METHOD Document::LineEnd(int line) const {  	if (line >= LinesTotal() - 1) {  		return LineStart(line + 1); @@ -602,7 +610,7 @@ bool Document::InGoodUTF8(int pos, int &start, int &end) const {  // When lines are terminated with \r\n pairs which should be treated as one character.  // When displaying DBCS text such as Japanese.  // If moving, move the position in the indicated direction. -int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) { +int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) const {  	//Platform::DebugPrintf("NoCRLF %d %d\n", pos, moveDir);  	// If out of range, just return minimum/maximum value.  	
if (pos <= 0) @@ -1587,6 +1595,25 @@ void Document::SetCaseFolder(CaseFolder *pcf_) {  	pcf = pcf_;  } +Document::CharacterExtracted Document::ExtractCharacter(int position) const { +	const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(position)); +	if (UTF8IsAscii(leadByte)) { +		// Common case: ASCII character +		return CharacterExtracted(leadByte, 1); +	} +	const int widthCharBytes = UTF8BytesOfLead[leadByte]; +	unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 }; +	for (int b=1; b<widthCharBytes; b++) +		charBytes[b] = static_cast<unsigned char>(cb.CharAt(position + b)); +	int utf8status = UTF8Classify(charBytes, widthCharBytes); +	if (utf8status & UTF8MaskInvalid) { +		// Treat as invalid and use up just one byte +		return CharacterExtracted(unicodeReplacementChar, 1); +	} else { +		return CharacterExtracted(UnicodeFromBytes(charBytes), utf8status & UTF8MaskWidth); +	} +} +  /**   * Find text in document, supporting both forward and backward   * searches (just pass minPos > maxPos to do a backward search) @@ -2178,6 +2205,61 @@ private:  	std::string substituted;  }; +namespace { + +/** +* RESearchRange keeps track of search range. +*/ +class RESearchRange { +public: +	const Document *doc; +	int increment; +	int startPos; +	int endPos; +	int lineRangeStart; +	int lineRangeEnd; +	int lineRangeBreak; +	RESearchRange(const Document *doc_, int minPos, int maxPos) : doc(doc_) { +		increment = (minPos <= maxPos) ? 1 : -1; + +		// Range endpoints should not be inside DBCS characters, but just in case, move them. +		startPos = doc->MovePositionOutsideChar(minPos, 1, false); +		endPos = doc->MovePositionOutsideChar(maxPos, 1, false); + +		lineRangeStart = doc->LineFromPosition(startPos); +		lineRangeEnd = doc->LineFromPosition(endPos); +		if ((increment == 1) && +			(startPos >= doc->LineEnd(lineRangeStart)) && +			(lineRangeStart < lineRangeEnd)) { +			// the start position is at end of line or between line end characters. 
+			lineRangeStart++; +			startPos = doc->LineStart(lineRangeStart); +		} else if ((increment == -1) && +			(startPos <= doc->LineStart(lineRangeStart)) && +			(lineRangeStart > lineRangeEnd)) { +			// the start position is at beginning of line. +			lineRangeStart--; +			startPos = doc->LineEnd(lineRangeStart); +		} +		lineRangeBreak = lineRangeEnd + increment; +	} +	Range LineRange(int line) const { +		Range range(doc->LineStart(line), doc->LineEnd(line)); +		if (increment == 1) { +			if (line == lineRangeStart) +				range.start = startPos; +			if (line == lineRangeEnd) +				range.end = endPos; +		} else { +			if (line == lineRangeEnd) +				range.start = endPos; +			if (line == lineRangeStart) +				range.end = startPos; +		} +		return range; +	} +}; +  // Define a way for the Regular Expression code to access the document  class DocumentIndexer : public CharacterIndexer {  	Document *pdoc; @@ -2198,18 +2280,375 @@ public:  	}  }; +#ifdef CXX11_REGEX + +class ByteIterator : public std::iterator<std::bidirectional_iterator_tag, char> { +public: +	const Document *doc; +	Position position; +	ByteIterator(const Document *doc_ = 0, Position position_ = 0) : doc(doc_), position(position_) { +	} +	ByteIterator(const ByteIterator &other) { +		doc = other.doc; +		position = other.position; +	} +	ByteIterator &operator=(const ByteIterator &other) { +		if (this != &other) { +			doc = other.doc; +			position = other.position; +		} +		return *this; +	} +	char operator*() const { +		return doc->CharAt(position); +	} +	ByteIterator &operator++() { +		position++; +		return *this; +	} +	ByteIterator operator++(int) { +		ByteIterator retVal(*this); +		position++; +		return retVal; +	} +	ByteIterator &operator--() { +		position--; +		return *this; +	} +	bool operator==(const ByteIterator &other) const { +		return doc == other.doc && position == other.position; +	} +	bool operator!=(const ByteIterator &other) const { +		return doc != other.doc || position != other.position; +	} +	int 
Pos() const { +		return position; +	} +	int PosRoundUp() const { +		return position; +	} +}; + +// On Windows, wchar_t is 16 bits wide and on Unix it is 32 bits wide. +// Would be better to use sizeof(wchar_t) or similar to differentiate +// but easier for now to hard-code platforms. +// C++11 has char16_t and char32_t but neither Clang nor Visual C++ +// appear to allow specializing basic_regex over these. + +#ifdef _WIN32 +#define WCHAR_T_IS_16 1 +#else +#define WCHAR_T_IS_16 0 +#endif + +#if WCHAR_T_IS_16 + +// On Windows, report non-BMP characters as 2 separate surrogates as that +// matches wregex since it is based on wchar_t. +class UTF8Iterator : public std::iterator<std::bidirectional_iterator_tag, wchar_t> { +	// These 3 fields determine the iterator position and are used for comparisons +	const Document *doc; +	Position position; +	size_t characterIndex; +	// Remaining fields are derived from the determining fields so are excluded in comparisons +	unsigned int lenBytes; +	size_t lenCharacters; +	wchar_t buffered[2]; +public: +	UTF8Iterator(const Document *doc_ = 0, Position position_ = 0) : +		doc(doc_), position(position_), characterIndex(0), lenBytes(0), lenCharacters(0) { +		buffered[0] = 0; +		buffered[1] = 0; +	} +	UTF8Iterator(const UTF8Iterator &other) { +		doc = other.doc; +		position = other.position; +		characterIndex = other.characterIndex; +		lenBytes = other.lenBytes; +		lenCharacters = other.lenCharacters; +		buffered[0] = other.buffered[0]; +		buffered[1] = other.buffered[1]; +	} +	UTF8Iterator &operator=(const UTF8Iterator &other) { +		if (this != &other) { +			doc = other.doc; +			position = other.position; +			characterIndex = other.characterIndex; +			lenBytes = other.lenBytes; +			lenCharacters = other.lenCharacters; +			buffered[0] = other.buffered[0]; +			buffered[1] = other.buffered[1]; +		} +		return *this; +	} +	wchar_t operator*() { +		if (lenCharacters == 0) { +			ReadCharacter(); +		} +		return buffered[characterIndex]; +	} +	
UTF8Iterator &operator++() { +		if ((characterIndex + 1) < (lenCharacters)) { +			characterIndex++; +		} else { +			position += lenBytes; +			ReadCharacter(); +			characterIndex = 0; +		} +		return *this; +	} +	UTF8Iterator operator++(int) { +		UTF8Iterator retVal(*this); +		if ((characterIndex + 1) < (lenCharacters)) { +			characterIndex++; +		} else { +			position += lenBytes; +			ReadCharacter(); +			characterIndex = 0; +		} +		return retVal; +	} +	UTF8Iterator &operator--() { +		if (characterIndex) { +			characterIndex--; +		} else { +			position = doc->NextPosition(position, -1); +			ReadCharacter(); +			characterIndex = lenCharacters - 1; +		} +		return *this; +	} +	bool operator==(const UTF8Iterator &other) const { +		// Only test the determining fields, not the character widths and values derived from this +		return doc == other.doc && +			position == other.position && +			characterIndex == other.characterIndex; +	} +	bool operator!=(const UTF8Iterator &other) const { +		// Only test the determining fields, not the character widths and values derived from this +		return doc != other.doc || +			position != other.position || +			characterIndex != other.characterIndex; +	} +	int Pos() const { +		return position; +	} +	int PosRoundUp() const { +		if (characterIndex) +			return position + lenBytes;	// Force to end of character +		else +			return position; +	} +private: +	void ReadCharacter() { +		Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position); +		lenBytes = charExtracted.widthBytes; +		if (charExtracted.character == unicodeReplacementChar) { +			lenCharacters = 1; +			buffered[0] = static_cast<wchar_t>(charExtracted.character); +		} else { +			lenCharacters = UTF16FromUTF32Character(charExtracted.character, buffered); +		} +	} +}; + +#else + +// On Unix, report non-BMP characters as single characters + +class UTF8Iterator : public std::iterator<std::bidirectional_iterator_tag, wchar_t> { +	const Document *doc; +	Position position; 
+public: +	UTF8Iterator(const Document *doc_=0, Position position_=0) : doc(doc_), position(position_) { +	} +	UTF8Iterator(const UTF8Iterator &other) { +		doc = other.doc; +		position = other.position; +	} +	UTF8Iterator &operator=(const UTF8Iterator &other) { +		if (this != &other) { +			doc = other.doc; +			position = other.position; +		} +		return *this; +	} +	wchar_t operator*() const { +		Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position); +		return charExtracted.character; +	} +	UTF8Iterator &operator++() { +		position = doc->NextPosition(position, 1); +		return *this; +	} +	UTF8Iterator operator++(int) { +		UTF8Iterator retVal(*this); +		position = doc->NextPosition(position, 1); +		return retVal; +	} +	UTF8Iterator &operator--() { +		position = doc->NextPosition(position, -1); +		return *this; +	} +	bool operator==(const UTF8Iterator &other) const { +		return doc == other.doc && position == other.position; +	} +	bool operator!=(const UTF8Iterator &other) const { +		return doc != other.doc || position != other.position; +	} +	int Pos() const { +		return position;  +	} +	int PosRoundUp() const { +		return position;  +	} +}; + +#endif + +std::regex_constants::match_flag_type MatchFlags(const Document *doc, int startPos, int endPos) { +	std::regex_constants::match_flag_type flagsMatch = std::regex_constants::match_default; +	if (!doc->IsLineStartPosition(startPos)) +		flagsMatch |= std::regex_constants::match_not_bol; +	if (!doc->IsLineEndPosition(endPos)) +		flagsMatch |= std::regex_constants::match_not_eol; +	return flagsMatch; +} + +template<typename Iterator, typename Regex> +bool MatchOnLines(const Document *doc, const Regex &regexp, const RESearchRange &resr, RESearch &search) { +	bool matched = false; +	std::match_results<Iterator> match; + +	// MSVC and libc++ have problems with ^ and $ matching line ends inside a range +	// If they didn't then the line by line iteration could be removed for the forwards +	// case and replaced 
with the following 4 lines: +	//	Iterator uiStart(doc, startPos); +	//	Iterator uiEnd(doc, endPos); +	//	flagsMatch = MatchFlags(doc, startPos, endPos); +	//	matched = std::regex_search(uiStart, uiEnd, match, regexp, flagsMatch); + +	// Line by line. +	for (int line = resr.lineRangeStart; line != resr.lineRangeBreak; line += resr.increment) { +		const Range lineRange = resr.LineRange(line); +		Iterator itStart(doc, lineRange.start); +		Iterator itEnd(doc, lineRange.end); +		std::regex_constants::match_flag_type flagsMatch = MatchFlags(doc, lineRange.start, lineRange.end); +		matched = std::regex_search(itStart, itEnd, match, regexp, flagsMatch); +		// Check for the last match on this line. +		if (matched) { +			if (resr.increment == -1) { +				while (matched) { +					Iterator itNext(doc, match[0].second.PosRoundUp()); +					flagsMatch = MatchFlags(doc, itNext.Pos(), lineRange.end); +					std::match_results<Iterator> matchNext; +					matched = std::regex_search(itNext, itEnd, matchNext, regexp, flagsMatch); +					if (matched) { +						if (match[0].first == match[0].second) { +							// Empty match means failure so exit +							return false; +						} +						match = matchNext; +					} +				} +				matched = true; +			} +			break; +		} +	} +	if (matched) { +		for (size_t co = 0; co < match.size(); co++) { +			search.bopat[co] = match[co].first.Pos(); +			search.eopat[co] = match[co].second.PosRoundUp(); +			size_t lenMatch = search.eopat[co] - search.bopat[co]; +			search.pat[co].resize(lenMatch); +			for (size_t iPos = 0; iPos < lenMatch; iPos++) { +				search.pat[co][iPos] = doc->CharAt(iPos + search.bopat[co]); +			} +		} +	} +	return matched; +} + +long Cxx11RegexFindText(Document *doc, int minPos, int maxPos, const char *s, +	bool caseSensitive, int *length, RESearch &search) { +	const RESearchRange resr(doc, minPos, maxPos); +	try { +		//ElapsedTime et; +		std::regex::flag_type flagsRe = std::regex::ECMAScript; +		// Flags that appear to have no effect: +		// | 
std::regex::collate | std::regex::extended; +		if (!caseSensitive) +			flagsRe = flagsRe | std::regex::icase; + +		// Clear the RESearch so can fill in matches +		search.Clear(); + +		bool matched = false; +		if (SC_CP_UTF8 == doc->dbcsCodePage) { +			unsigned int lenS = static_cast<unsigned int>(strlen(s)); +			std::vector<wchar_t> ws(lenS + 1); +#if WCHAR_T_IS_16 +			size_t outLen = UTF16FromUTF8(s, lenS, &ws[0], lenS); +#else +			size_t outLen = UTF32FromUTF8(s, lenS, reinterpret_cast<unsigned int *>(&ws[0]), lenS); +#endif +			ws[outLen] = 0; +			std::wregex regexp; +#if defined(__APPLE__) +			// Using a UTF-8 locale doesn't change to Unicode over a byte buffer so '.' +			// is one byte not one character.  +			// However, on OS X this makes wregex act as Unicode +			std::locale localeU("en_US.UTF-8"); +			regexp.imbue(localeU); +#endif +			regexp.assign(&ws[0], flagsRe); +			matched = MatchOnLines<UTF8Iterator>(doc, regexp, resr, search); + +		} else { +			std::regex regexp; +			regexp.assign(s, flagsRe); +			matched = MatchOnLines<ByteIterator>(doc, regexp, resr, search); +		} + +		int posMatch = -1; +		if (matched) { +			posMatch = search.bopat[0]; +			*length = search.eopat[0] - search.bopat[0]; +		} +		// Example - search in doc/ScintillaHistory.html for +		// [[:upper:]]eta[[:space:]] +		// On MacBook, normally around 1 second but with locale imbued -> 14 seconds. +		//double durSearch = et.Duration(true); +		//Platform::DebugPrintf("Search:%9.6g \n", durSearch); +		return posMatch; +	} catch (std::regex_error &) { +		// Failed to create regular expression +		throw RegexError(); +	} catch (...) { +		// Failed in some other way +		return -1; +	} +} + +#endif + +} +  long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s,                          bool caseSensitive, bool, bool, int flags,                          int *length) { -	const bool posix = (flags & SCFIND_POSIX) != 0; -	const int increment = (minPos <= maxPos) ? 
1 : -1; -	int startPos = minPos; -	int endPos = maxPos; +#ifdef CXX11_REGEX +	if (flags & SCFIND_CXX11REGEX) { +			return Cxx11RegexFindText(doc, minPos, maxPos, s, +			caseSensitive, length, search); +	} +#endif -	// Range endpoints should not be inside DBCS characters, but just in case, move them. -	startPos = doc->MovePositionOutsideChar(startPos, 1, false); -	endPos = doc->MovePositionOutsideChar(endPos, 1, false); +	const RESearchRange resr(doc, minPos, maxPos); + +	const bool posix = (flags & SCFIND_POSIX) != 0;  	const char *errmsg = search.Compile(s, *length, caseSensitive, posix);  	if (errmsg) { @@ -2219,50 +2658,34 @@ long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s  	// Replace first '.' with '-' in each property file variable reference:  	//     Search: \$(\([A-Za-z0-9_-]+\)\.\([A-Za-z0-9_.]+\))  	//     Replace: $(\1-\2) -	int lineRangeStart = doc->LineFromPosition(startPos); -	const int lineRangeEnd = doc->LineFromPosition(endPos); -	if ((increment == 1) && -		(startPos >= doc->LineEnd(lineRangeStart)) && -		(lineRangeStart < lineRangeEnd)) { -		// the start position is at end of line or between line end characters. -		lineRangeStart++; -		startPos = doc->LineStart(lineRangeStart); -	} else if ((increment == -1) && -	           (startPos <= doc->LineStart(lineRangeStart)) && -	           (lineRangeStart > lineRangeEnd)) { -		// the start position is at beginning of line. -		lineRangeStart--; -		startPos = doc->LineEnd(lineRangeStart); -	}  	int pos = -1;  	int lenRet = 0;  	const char searchEnd = s[*length - 1];  	const char searchEndPrev = (*length > 1) ? 
s[*length - 2] : '\0'; -	const int lineRangeBreak = lineRangeEnd + increment; -	for (int line = lineRangeStart; line != lineRangeBreak; line += increment) { +	for (int line = resr.lineRangeStart; line != resr.lineRangeBreak; line += resr.increment) {  		int startOfLine = doc->LineStart(line);  		int endOfLine = doc->LineEnd(line); -		if (increment == 1) { -			if (line == lineRangeStart) { -				if ((startPos != startOfLine) && (s[0] == '^')) +		if (resr.increment == 1) { +			if (line == resr.lineRangeStart) { +				if ((resr.startPos != startOfLine) && (s[0] == '^'))  					continue;	// Can't match start of line if start position after start of line -				startOfLine = startPos; +				startOfLine = resr.startPos;  			} -			if (line == lineRangeEnd) { -				if ((endPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\')) +			if (line == resr.lineRangeEnd) { +				if ((resr.endPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\'))  					continue;	// Can't match end of line if end position before end of line -				endOfLine = endPos; +				endOfLine = resr.endPos;  			}  		} else { -			if (line == lineRangeEnd) { -				if ((endPos != startOfLine) && (s[0] == '^')) +			if (line == resr.lineRangeEnd) { +				if ((resr.endPos != startOfLine) && (s[0] == '^'))  					continue;	// Can't match start of line if end position after start of line -				startOfLine = endPos; +				startOfLine = resr.endPos;  			} -			if (line == lineRangeStart) { -				if ((startPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\')) +			if (line == resr.lineRangeStart) { +				if ((resr.startPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\'))  					continue;	// Can't match end of line if start position before end of line -				endOfLine = startPos; +				endOfLine = resr.startPos;  			}  		} @@ -2274,7 +2697,7 @@ long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s  			search.eopat[0] = doc->MovePositionOutsideChar(search.eopat[0], 
1, false);  			lenRet = search.eopat[0] - search.bopat[0];  			// There can be only one start of a line, so no need to look for last match in line -			if ((increment == -1) && (s[0] != '^')) { +			if ((resr.increment == -1) && (s[0] != '^')) {  				// Check for the last match on this line.  				int repetitions = 1000;	// Break out of infinite loop  				while (success && (search.eopat[0] <= endOfLine) && (repetitions--)) { | 
