diff options
| -rw-r--r-- | doc/ScintillaDoc.html | 30 | ||||
| -rw-r--r-- | gtk/makefile | 6 | ||||
| -rw-r--r-- | include/Scintilla.h | 3 | ||||
| -rw-r--r-- | include/Scintilla.iface | 3 | ||||
| -rw-r--r-- | scripts/HeaderOrder.txt | 1 | ||||
| -rw-r--r-- | src/Document.cxx | 501 | ||||
| -rw-r--r-- | src/Document.h | 17 | ||||
| -rw-r--r-- | src/Editor.cxx | 91 | ||||
| -rw-r--r-- | src/RESearch.h | 2 | ||||
| -rw-r--r-- | src/UniConversion.cxx | 46 | ||||
| -rw-r--r-- | src/UniConversion.h | 4 | ||||
| -rw-r--r-- | win32/makefile | 6 | ||||
| -rw-r--r-- | win32/scintilla.mak | 4 | 
13 files changed, 630 insertions, 84 deletions
| diff --git a/doc/ScintillaDoc.html b/doc/ScintillaDoc.html index 52225cc26..04528b35e 100644 --- a/doc/ScintillaDoc.html +++ b/doc/ScintillaDoc.html @@ -606,8 +606,11 @@ struct Sci_TextRange {      <h2 id="Searching">Searching</h2>      <p> -    There are methods to search for text and for regular expressions. The regular expression support -    is limited and should only be used for simple cases and initial development. A different regular expression +    There are methods to search for text and for regular expressions. The base regular expression support +    is limited and should only be used for simple cases and initial development. +    <span class="provisional">When using a C++11 compliant compiler and runtime, it may be possible to use the +    runtime's implementation of &lt;regex&gt; by compiling Scintilla with <code>CXX11_REGEX</code> defined.</span> +    A different regular expression      library can be <a class="jump" href="#AlternativeRegEx">integrated into Scintilla</a>      or can be called from the container using direct access to the buffer contents through      <a class="message" href="#SCI_GETCHARACTERPOINTER">SCI_GETCHARACTERPOINTER</a>. @@ -658,6 +661,17 @@ struct Sci_TextRange {            <td>Treat regular expression in a more POSIX compatible manner              by interpreting bare ( and ) for tagged sections rather than \( and \).</td>          </tr> +        <tr class="provisional"> +          <td><code>SCFIND_CXX11REGEX</code></td> + +          <td>When compiled with <code>CXX11_REGEX</code> this flag +	  may be set to use &lt;regex&gt; instead of Scintilla's basic regular expressions. +	  If the regular expression is invalid then -1 is returned and status is set to +	  <code>SC_STATUS_WARN_REGEX</code>. +	  The ECMAScript flag is set on the regex object and UTF-8 documents will exhibit Unicode-compliant +	  behaviour. For MSVC, where wchar_t is 16-bits, the regular expression ".." will match a single +	  astral-plane character. 
There may be other differences between compilers.</td> +        </tr>        </tbody>      </table> @@ -971,6 +985,8 @@ struct Sci_TextToFind {       If an error occurs, Scintilla may set an internal error number that can be retrieved with      <code>SCI_GETSTATUS</code>.      To clear the error status call <code>SCI_SETSTATUS(0)</code>. +    Status values from 1 to 999 are errors and status <code>SC_STATUS_WARN_START</code> (1000) +    and above are warnings.      The currently defined statuses are:      </p> @@ -988,12 +1004,18 @@ struct Sci_TextToFind {            <td>Generic failure</td>          </tr> -        <tr> +        <tr class="provisional">            <th align="left">SC_STATUS_BADALLOC</th>            <td>2</td>            <td>Memory is exhausted</td>          </tr> +        <tr> +          <th align="left">SC_STATUS_WARN_REGEX</th> +          <td>1001</td> +          <td>Regular expression is invalid</td> +        </tr> +        </tbody>      </table> @@ -7433,6 +7455,8 @@ for line = lineStart to lineEnd do SCI_ENSUREVISIBLE(line) next      The <code class="provisional">SC_TECHNOLOGY_DIRECTWRITERETAIN</code> value for      <a class="message" href="#SCI_SETTECHNOLOGY">SCI_SETTECHNOLOGY</a> is provisional.</p> +    <p>Using C++11 &lt;regex&gt; is provisional.</p> +      <p>Some developers may want to only use features that are stable and have graduated from      provisional status. 
To avoid using provisional messages compile with the symbol      <code>SCI_DISABLE_PROVISIONAL</code> defined.</p> diff --git a/gtk/makefile b/gtk/makefile index 3c754f2e3..cdf1aa16e 100644 --- a/gtk/makefile +++ b/gtk/makefile @@ -54,6 +54,10 @@ else  THREADFLAGS=  endif +ifdef CXX11_REGEX +REFLAGS=-DCXX11_REGEX +endif +  ifdef DEBUG  ifdef CLANG  CTFLAGS=-DDEBUG -g -fsanitize=$(SANITIZE) $(CXXBASEFLAGS) $(THREADFLAGS) @@ -65,7 +69,7 @@ CTFLAGS=-DNDEBUG -Os $(CXXBASEFLAGS) $(THREADFLAGS)  endif  CFLAGS:=$(CTFLAGS) -CXXTFLAGS:=--std=c++0x $(CTFLAGS) +CXXTFLAGS:=--std=c++0x $(CTFLAGS) $(REFLAGS)  CONFIGFLAGS:=$(shell pkg-config --cflags $(GTKVERSION))  MARSHALLER=scintilla-marshal.o diff --git a/include/Scintilla.h b/include/Scintilla.h index 74b0cefd5..154f57010 100644 --- a/include/Scintilla.h +++ b/include/Scintilla.h @@ -372,6 +372,7 @@ typedef sptr_t (*SciFnDirect)(sptr_t ptr, unsigned int iMessage, uptr_t wParam,  #define SCFIND_WORDSTART 0x00100000  #define SCFIND_REGEXP 0x00200000  #define SCFIND_POSIX 0x00400000 +#define SCFIND_CXX11REGEX 0x00800000  #define SCI_FINDTEXT 2150  #define SCI_FORMATRANGE 2151  #define SCI_GETFIRSTVISIBLELINE 2152 @@ -643,6 +644,8 @@ typedef sptr_t (*SciFnDirect)(sptr_t ptr, unsigned int iMessage, uptr_t wParam,  #define SC_STATUS_OK 0  #define SC_STATUS_FAILURE 1  #define SC_STATUS_BADALLOC 2 +#define SC_STATUS_WARN_START 1000 +#define SC_STATUS_WARN_REGEX 1001  #define SCI_SETSTATUS 2382  #define SCI_GETSTATUS 2383  #define SCI_SETMOUSEDOWNCAPTURES 2384 diff --git a/include/Scintilla.iface b/include/Scintilla.iface index bf3fa0811..f93ae23c2 100644 --- a/include/Scintilla.iface +++ b/include/Scintilla.iface @@ -884,6 +884,7 @@ val SCFIND_MATCHCASE=0x4  val SCFIND_WORDSTART=0x00100000  val SCFIND_REGEXP=0x00200000  val SCFIND_POSIX=0x00400000 +val SCFIND_CXX11REGEX=0x00800000  # Find some text in the document.  
fun position FindText=2150(int flags, findtext ft) @@ -1661,6 +1662,8 @@ enu Status=SC_STATUS_  val SC_STATUS_OK=0  val SC_STATUS_FAILURE=1  val SC_STATUS_BADALLOC=2 +val SC_STATUS_WARN_START=1000 +val SC_STATUS_WARN_REGEX=1001  # Change error status - 0 = OK.  set void SetStatus=2382(int statusCode,) diff --git a/scripts/HeaderOrder.txt b/scripts/HeaderOrder.txt index 8339c93d5..e9c88bb3f 100644 --- a/scripts/HeaderOrder.txt +++ b/scripts/HeaderOrder.txt @@ -30,6 +30,7 @@  #include <set>  #include <algorithm>  #include <memory> +#include <regex>  // GTK+ headers  #include <glib.h> diff --git a/src/Document.cxx b/src/Document.cxx index c4faee603..4d2c48f1f 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -16,6 +16,10 @@  #include <vector>  #include <algorithm> +#ifdef CXX11_REGEX +#include <regex> +#endif +  #include "Platform.h"  #include "ILexer.h" @@ -336,6 +340,10 @@ int SCI_METHOD Document::LineStart(int line) const {  	return cb.LineStart(line);  } +bool Document::IsLineStartPosition(int position) const { +	return LineStart(LineFromPosition(position)) == position; +} +  int SCI_METHOD Document::LineEnd(int line) const {  	if (line >= LinesTotal() - 1) {  		return LineStart(line + 1); @@ -602,7 +610,7 @@ bool Document::InGoodUTF8(int pos, int &start, int &end) const {  // When lines are terminated with \r\n pairs which should be treated as one character.  // When displaying DBCS text such as Japanese.  // If moving, move the position in the indicated direction. -int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) { +int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) const {  	//Platform::DebugPrintf("NoCRLF %d %d\n", pos, moveDir);  	// If out of range, just return minimum/maximum value.  	
if (pos <= 0) @@ -1587,6 +1595,25 @@ void Document::SetCaseFolder(CaseFolder *pcf_) {  	pcf = pcf_;  } +Document::CharacterExtracted Document::ExtractCharacter(int position) const { +	const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(position)); +	if (UTF8IsAscii(leadByte)) { +		// Common case: ASCII character +		return CharacterExtracted(leadByte, 1); +	} +	const int widthCharBytes = UTF8BytesOfLead[leadByte]; +	unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 }; +	for (int b=1; b<widthCharBytes; b++) +		charBytes[b] = static_cast<unsigned char>(cb.CharAt(position + b)); +	int utf8status = UTF8Classify(charBytes, widthCharBytes); +	if (utf8status & UTF8MaskInvalid) { +		// Treat as invalid and use up just one byte +		return CharacterExtracted(unicodeReplacementChar, 1); +	} else { +		return CharacterExtracted(UnicodeFromBytes(charBytes), utf8status & UTF8MaskWidth); +	} +} +  /**   * Find text in document, supporting both forward and backward   * searches (just pass minPos > maxPos to do a backward search) @@ -2178,6 +2205,61 @@ private:  	std::string substituted;  }; +namespace { + +/** +* RESearchRange keeps track of search range. +*/ +class RESearchRange { +public: +	const Document *doc; +	int increment; +	int startPos; +	int endPos; +	int lineRangeStart; +	int lineRangeEnd; +	int lineRangeBreak; +	RESearchRange(const Document *doc_, int minPos, int maxPos) : doc(doc_) { +		increment = (minPos <= maxPos) ? 1 : -1; + +		// Range endpoints should not be inside DBCS characters, but just in case, move them. +		startPos = doc->MovePositionOutsideChar(minPos, 1, false); +		endPos = doc->MovePositionOutsideChar(maxPos, 1, false); + +		lineRangeStart = doc->LineFromPosition(startPos); +		lineRangeEnd = doc->LineFromPosition(endPos); +		if ((increment == 1) && +			(startPos >= doc->LineEnd(lineRangeStart)) && +			(lineRangeStart < lineRangeEnd)) { +			// the start position is at end of line or between line end characters. 
+			lineRangeStart++; +			startPos = doc->LineStart(lineRangeStart); +		} else if ((increment == -1) && +			(startPos <= doc->LineStart(lineRangeStart)) && +			(lineRangeStart > lineRangeEnd)) { +			// the start position is at beginning of line. +			lineRangeStart--; +			startPos = doc->LineEnd(lineRangeStart); +		} +		lineRangeBreak = lineRangeEnd + increment; +	} +	Range LineRange(int line) const { +		Range range(doc->LineStart(line), doc->LineEnd(line)); +		if (increment == 1) { +			if (line == lineRangeStart) +				range.start = startPos; +			if (line == lineRangeEnd) +				range.end = endPos; +		} else { +			if (line == lineRangeEnd) +				range.start = endPos; +			if (line == lineRangeStart) +				range.end = startPos; +		} +		return range; +	} +}; +  // Define a way for the Regular Expression code to access the document  class DocumentIndexer : public CharacterIndexer {  	Document *pdoc; @@ -2198,18 +2280,375 @@ public:  	}  }; +#ifdef CXX11_REGEX + +class ByteIterator : public std::iterator<std::bidirectional_iterator_tag, char> { +public: +	const Document *doc; +	Position position; +	ByteIterator(const Document *doc_ = 0, Position position_ = 0) : doc(doc_), position(position_) { +	} +	ByteIterator(const ByteIterator &other) { +		doc = other.doc; +		position = other.position; +	} +	ByteIterator &operator=(const ByteIterator &other) { +		if (this != &other) { +			doc = other.doc; +			position = other.position; +		} +		return *this; +	} +	char operator*() const { +		return doc->CharAt(position); +	} +	ByteIterator &operator++() { +		position++; +		return *this; +	} +	ByteIterator operator++(int) { +		ByteIterator retVal(*this); +		position++; +		return retVal; +	} +	ByteIterator &operator--() { +		position--; +		return *this; +	} +	bool operator==(const ByteIterator &other) const { +		return doc == other.doc && position == other.position; +	} +	bool operator!=(const ByteIterator &other) const { +		return doc != other.doc || position != other.position; +	} +	int 
Pos() const { +		return position; +	} +	int PosRoundUp() const { +		return position; +	} +}; + +// On Windows, wchar_t is 16 bits wide and on Unix it is 32 bits wide. +// Would be better to use sizeof(wchar_t) or similar to differentiate +// but easier for now to hard-code platforms. +// C++11 has char16_t and char32_t but neither Clang nor Visual C++ +// appear to allow specializing basic_regex over these. + +#ifdef _WIN32 +#define WCHAR_T_IS_16 1 +#else +#define WCHAR_T_IS_16 0 +#endif + +#if WCHAR_T_IS_16 + +// On Windows, report non-BMP characters as 2 separate surrogates as that +// matches wregex since it is based on wchar_t. +class UTF8Iterator : public std::iterator<std::bidirectional_iterator_tag, wchar_t> { +	// These 3 fields determine the iterator position and are used for comparisons +	const Document *doc; +	Position position; +	size_t characterIndex; +	// Remaining fields are derived from the determining fields so are excluded in comparisons +	unsigned int lenBytes; +	size_t lenCharacters; +	wchar_t buffered[2]; +public: +	UTF8Iterator(const Document *doc_ = 0, Position position_ = 0) : +		doc(doc_), position(position_), characterIndex(0), lenBytes(0), lenCharacters(0) { +		buffered[0] = 0; +		buffered[1] = 0; +	} +	UTF8Iterator(const UTF8Iterator &other) { +		doc = other.doc; +		position = other.position; +		characterIndex = other.characterIndex; +		lenBytes = other.lenBytes; +		lenCharacters = other.lenCharacters; +		buffered[0] = other.buffered[0]; +		buffered[1] = other.buffered[1]; +	} +	UTF8Iterator &operator=(const UTF8Iterator &other) { +		if (this != &other) { +			doc = other.doc; +			position = other.position; +			characterIndex = other.characterIndex; +			lenBytes = other.lenBytes; +			lenCharacters = other.lenCharacters; +			buffered[0] = other.buffered[0]; +			buffered[1] = other.buffered[1]; +		} +		return *this; +	} +	wchar_t operator*() { +		if (lenCharacters == 0) { +			ReadCharacter(); +		} +		return buffered[characterIndex]; +	} +	
UTF8Iterator &operator++() { +		if ((characterIndex + 1) < (lenCharacters)) { +			characterIndex++; +		} else { +			position += lenBytes; +			ReadCharacter(); +			characterIndex = 0; +		} +		return *this; +	} +	UTF8Iterator operator++(int) { +		UTF8Iterator retVal(*this); +		if ((characterIndex + 1) < (lenCharacters)) { +			characterIndex++; +		} else { +			position += lenBytes; +			ReadCharacter(); +			characterIndex = 0; +		} +		return retVal; +	} +	UTF8Iterator &operator--() { +		if (characterIndex) { +			characterIndex--; +		} else { +			position = doc->NextPosition(position, -1); +			ReadCharacter(); +			characterIndex = lenCharacters - 1; +		} +		return *this; +	} +	bool operator==(const UTF8Iterator &other) const { +		// Only test the determining fields, not the character widths and values derived from this +		return doc == other.doc && +			position == other.position && +			characterIndex == other.characterIndex; +	} +	bool operator!=(const UTF8Iterator &other) const { +		// Only test the determining fields, not the character widths and values derived from this +		return doc != other.doc || +			position != other.position || +			characterIndex != other.characterIndex; +	} +	int Pos() const { +		return position; +	} +	int PosRoundUp() const { +		if (characterIndex) +			return position + lenBytes;	// Force to end of character +		else +			return position; +	} +private: +	void ReadCharacter() { +		Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position); +		lenBytes = charExtracted.widthBytes; +		if (charExtracted.character == unicodeReplacementChar) { +			lenCharacters = 1; +			buffered[0] = static_cast<wchar_t>(charExtracted.character); +		} else { +			lenCharacters = UTF16FromUTF32Character(charExtracted.character, buffered); +		} +	} +}; + +#else + +// On Unix, report non-BMP characters as single characters + +class UTF8Iterator : public std::iterator<std::bidirectional_iterator_tag, wchar_t> { +	const Document *doc; +	Position position; 
+public: +	UTF8Iterator(const Document *doc_=0, Position position_=0) : doc(doc_), position(position_) { +	} +	UTF8Iterator(const UTF8Iterator &other) { +		doc = other.doc; +		position = other.position; +	} +	UTF8Iterator &operator=(const UTF8Iterator &other) { +		if (this != &other) { +			doc = other.doc; +			position = other.position; +		} +		return *this; +	} +	wchar_t operator*() const { +		Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position); +		return charExtracted.character; +	} +	UTF8Iterator &operator++() { +		position = doc->NextPosition(position, 1); +		return *this; +	} +	UTF8Iterator operator++(int) { +		UTF8Iterator retVal(*this); +		position = doc->NextPosition(position, 1); +		return retVal; +	} +	UTF8Iterator &operator--() { +		position = doc->NextPosition(position, -1); +		return *this; +	} +	bool operator==(const UTF8Iterator &other) const { +		return doc == other.doc && position == other.position; +	} +	bool operator!=(const UTF8Iterator &other) const { +		return doc != other.doc || position != other.position; +	} +	int Pos() const { +		return position;  +	} +	int PosRoundUp() const { +		return position;  +	} +}; + +#endif + +std::regex_constants::match_flag_type MatchFlags(const Document *doc, int startPos, int endPos) { +	std::regex_constants::match_flag_type flagsMatch = std::regex_constants::match_default; +	if (!doc->IsLineStartPosition(startPos)) +		flagsMatch |= std::regex_constants::match_not_bol; +	if (!doc->IsLineEndPosition(endPos)) +		flagsMatch |= std::regex_constants::match_not_eol; +	return flagsMatch; +} + +template<typename Iterator, typename Regex> +bool MatchOnLines(const Document *doc, const Regex &regexp, const RESearchRange &resr, RESearch &search) { +	bool matched = false; +	std::match_results<Iterator> match; + +	// MSVC and libc++ have problems with ^ and $ matching line ends inside a range +	// If they didn't then the line by line iteration could be removed for the forwards +	// case and replaced 
with the following 4 lines: +	//	Iterator uiStart(doc, startPos); +	//	Iterator uiEnd(doc, endPos); +	//	flagsMatch = MatchFlags(doc, startPos, endPos); +	//	matched = std::regex_search(uiStart, uiEnd, match, regexp, flagsMatch); + +	// Line by line. +	for (int line = resr.lineRangeStart; line != resr.lineRangeBreak; line += resr.increment) { +		const Range lineRange = resr.LineRange(line); +		Iterator itStart(doc, lineRange.start); +		Iterator itEnd(doc, lineRange.end); +		std::regex_constants::match_flag_type flagsMatch = MatchFlags(doc, lineRange.start, lineRange.end); +		matched = std::regex_search(itStart, itEnd, match, regexp, flagsMatch); +		// Check for the last match on this line. +		if (matched) { +			if (resr.increment == -1) { +				while (matched) { +					Iterator itNext(doc, match[0].second.PosRoundUp()); +					flagsMatch = MatchFlags(doc, itNext.Pos(), lineRange.end); +					std::match_results<Iterator> matchNext; +					matched = std::regex_search(itNext, itEnd, matchNext, regexp, flagsMatch); +					if (matched) { +						if (match[0].first == match[0].second) { +							// Empty match means failure so exit +							return false; +						} +						match = matchNext; +					} +				} +				matched = true; +			} +			break; +		} +	} +	if (matched) { +		for (size_t co = 0; co < match.size(); co++) { +			search.bopat[co] = match[co].first.Pos(); +			search.eopat[co] = match[co].second.PosRoundUp(); +			size_t lenMatch = search.eopat[co] - search.bopat[co]; +			search.pat[co].resize(lenMatch); +			for (size_t iPos = 0; iPos < lenMatch; iPos++) { +				search.pat[co][iPos] = doc->CharAt(iPos + search.bopat[co]); +			} +		} +	} +	return matched; +} + +long Cxx11RegexFindText(Document *doc, int minPos, int maxPos, const char *s, +	bool caseSensitive, int *length, RESearch &search) { +	const RESearchRange resr(doc, minPos, maxPos); +	try { +		//ElapsedTime et; +		std::regex::flag_type flagsRe = std::regex::ECMAScript; +		// Flags that appear to have no effect: +		// | 
std::regex::collate | std::regex::extended; +		if (!caseSensitive) +			flagsRe = flagsRe | std::regex::icase; + +		// Clear the RESearch so can fill in matches +		search.Clear(); + +		bool matched = false; +		if (SC_CP_UTF8 == doc->dbcsCodePage) { +			unsigned int lenS = static_cast<unsigned int>(strlen(s)); +			std::vector<wchar_t> ws(lenS + 1); +#if WCHAR_T_IS_16 +			size_t outLen = UTF16FromUTF8(s, lenS, &ws[0], lenS); +#else +			size_t outLen = UTF32FromUTF8(s, lenS, reinterpret_cast<unsigned int *>(&ws[0]), lenS); +#endif +			ws[outLen] = 0; +			std::wregex regexp; +#if defined(__APPLE__) +			// Using a UTF-8 locale doesn't change to Unicode over a byte buffer so '.' +			// is one byte not one character.  +			// However, on OS X this makes wregex act as Unicode +			std::locale localeU("en_US.UTF-8"); +			regexp.imbue(localeU); +#endif +			regexp.assign(&ws[0], flagsRe); +			matched = MatchOnLines<UTF8Iterator>(doc, regexp, resr, search); + +		} else { +			std::regex regexp; +			regexp.assign(s, flagsRe); +			matched = MatchOnLines<ByteIterator>(doc, regexp, resr, search); +		} + +		int posMatch = -1; +		if (matched) { +			posMatch = search.bopat[0]; +			*length = search.eopat[0] - search.bopat[0]; +		} +		// Example - search in doc/ScintillaHistory.html for +		// [[:upper:]]eta[[:space:]] +		// On MacBook, normally around 1 second but with locale imbued -> 14 seconds. +		//double durSearch = et.Duration(true); +		//Platform::DebugPrintf("Search:%9.6g \n", durSearch); +		return posMatch; +	} catch (std::regex_error &) { +		// Failed to create regular expression +		throw RegexError(); +	} catch (...) { +		// Failed in some other way +		return -1; +	} +} + +#endif + +} +  long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s,                          bool caseSensitive, bool, bool, int flags,                          int *length) { -	const bool posix = (flags & SCFIND_POSIX) != 0; -	const int increment = (minPos <= maxPos) ? 
1 : -1; -	int startPos = minPos; -	int endPos = maxPos; +#ifdef CXX11_REGEX +	if (flags & SCFIND_CXX11REGEX) { +			return Cxx11RegexFindText(doc, minPos, maxPos, s, +			caseSensitive, length, search); +	} +#endif -	// Range endpoints should not be inside DBCS characters, but just in case, move them. -	startPos = doc->MovePositionOutsideChar(startPos, 1, false); -	endPos = doc->MovePositionOutsideChar(endPos, 1, false); +	const RESearchRange resr(doc, minPos, maxPos); + +	const bool posix = (flags & SCFIND_POSIX) != 0;  	const char *errmsg = search.Compile(s, *length, caseSensitive, posix);  	if (errmsg) { @@ -2219,50 +2658,34 @@ long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s  	// Replace first '.' with '-' in each property file variable reference:  	//     Search: \$(\([A-Za-z0-9_-]+\)\.\([A-Za-z0-9_.]+\))  	//     Replace: $(\1-\2) -	int lineRangeStart = doc->LineFromPosition(startPos); -	const int lineRangeEnd = doc->LineFromPosition(endPos); -	if ((increment == 1) && -		(startPos >= doc->LineEnd(lineRangeStart)) && -		(lineRangeStart < lineRangeEnd)) { -		// the start position is at end of line or between line end characters. -		lineRangeStart++; -		startPos = doc->LineStart(lineRangeStart); -	} else if ((increment == -1) && -	           (startPos <= doc->LineStart(lineRangeStart)) && -	           (lineRangeStart > lineRangeEnd)) { -		// the start position is at beginning of line. -		lineRangeStart--; -		startPos = doc->LineEnd(lineRangeStart); -	}  	int pos = -1;  	int lenRet = 0;  	const char searchEnd = s[*length - 1];  	const char searchEndPrev = (*length > 1) ? 
s[*length - 2] : '\0'; -	const int lineRangeBreak = lineRangeEnd + increment; -	for (int line = lineRangeStart; line != lineRangeBreak; line += increment) { +	for (int line = resr.lineRangeStart; line != resr.lineRangeBreak; line += resr.increment) {  		int startOfLine = doc->LineStart(line);  		int endOfLine = doc->LineEnd(line); -		if (increment == 1) { -			if (line == lineRangeStart) { -				if ((startPos != startOfLine) && (s[0] == '^')) +		if (resr.increment == 1) { +			if (line == resr.lineRangeStart) { +				if ((resr.startPos != startOfLine) && (s[0] == '^'))  					continue;	// Can't match start of line if start position after start of line -				startOfLine = startPos; +				startOfLine = resr.startPos;  			} -			if (line == lineRangeEnd) { -				if ((endPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\')) +			if (line == resr.lineRangeEnd) { +				if ((resr.endPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\'))  					continue;	// Can't match end of line if end position before end of line -				endOfLine = endPos; +				endOfLine = resr.endPos;  			}  		} else { -			if (line == lineRangeEnd) { -				if ((endPos != startOfLine) && (s[0] == '^')) +			if (line == resr.lineRangeEnd) { +				if ((resr.endPos != startOfLine) && (s[0] == '^'))  					continue;	// Can't match start of line if end position after start of line -				startOfLine = endPos; +				startOfLine = resr.endPos;  			} -			if (line == lineRangeStart) { -				if ((startPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\')) +			if (line == resr.lineRangeStart) { +				if ((resr.startPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\'))  					continue;	// Can't match end of line if start position before end of line -				endOfLine = startPos; +				endOfLine = resr.startPos;  			}  		} @@ -2274,7 +2697,7 @@ long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s  			search.eopat[0] = doc->MovePositionOutsideChar(search.eopat[0], 
1, false);  			lenRet = search.eopat[0] - search.bopat[0];  			// There can be only one start of a line, so no need to look for last match in line -			if ((increment == -1) && (s[0] != '^')) { +			if ((resr.increment == -1) && (s[0] != '^')) {  				// Check for the last match on this line.  				int repetitions = 1000;	// Break out of infinite loop  				while (success && (search.eopat[0] <= endOfLine) && (repetitions--)) { diff --git a/src/Document.h b/src/Document.h index e84be14e4..477b4dc60 100644 --- a/src/Document.h +++ b/src/Document.h @@ -188,6 +188,10 @@ public:  	}  }; +struct RegexError : public std::runtime_error { +	RegexError() : std::runtime_error("regex failure") {} +}; +  /**   */  class Document : PerLine, public IDocumentWithLineEnd, public ILoader { @@ -271,7 +275,7 @@ public:  	bool IsCrLf(int pos) const;  	int LenChar(int pos);  	bool InGoodUTF8(int pos, int &start, int &end) const; -	int MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd=true); +	int MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd=true) const;  	int NextPosition(int pos, int moveDir) const;  	bool NextCharacter(int &pos, int moveDir) const;	// Returns true if pos changed  	int SCI_METHOD GetRelativePosition(int positionStart, int characterOffset) const; @@ -345,6 +349,7 @@ public:  	void DeleteAllMarks(int markerNum);  	int LineFromHandle(int markerHandle);  	int SCI_METHOD LineStart(int line) const; +	bool IsLineStartPosition(int position) const;  	int SCI_METHOD LineEnd(int line) const;  	int LineEndPosition(int position) const;  	bool IsLineEndPosition(int position) const; @@ -364,6 +369,16 @@ public:  	int NextWordEnd(int pos, int delta);  	int SCI_METHOD Length() const { return cb.Length(); }  	void Allocate(int newSize) { cb.Allocate(newSize); } + +	struct CharacterExtracted { +		unsigned int character; +		unsigned int widthBytes; +		CharacterExtracted(unsigned int character_, unsigned int widthBytes_) :  +			character(character_), 
widthBytes(widthBytes_) { +		} +	}; +	CharacterExtracted ExtractCharacter(int position) const; +  	bool MatchesWordOptions(bool word, bool wordStart, int pos, int length) const;  	bool HasCaseFolder(void) const;  	void SetCaseFolder(CaseFolder *pcf_); diff --git a/src/Editor.cxx b/src/Editor.cxx index 8748d89c1..80f96a7c8 100644 --- a/src/Editor.cxx +++ b/src/Editor.cxx @@ -3639,18 +3639,23 @@ long Editor::FindText(  	int lengthFound = istrlen(ft->lpstrText);  	if (!pdoc->HasCaseFolder())  		pdoc->SetCaseFolder(CaseFolderForEncoding()); -	int pos = pdoc->FindText(ft->chrg.cpMin, ft->chrg.cpMax, ft->lpstrText, -	        (wParam & SCFIND_MATCHCASE) != 0, -	        (wParam & SCFIND_WHOLEWORD) != 0, -	        (wParam & SCFIND_WORDSTART) != 0, -	        (wParam & SCFIND_REGEXP) != 0, -	        static_cast<int>(wParam), -	        &lengthFound); -	if (pos != -1) { -		ft->chrgText.cpMin = pos; -		ft->chrgText.cpMax = pos + lengthFound; +	try { +		int pos = pdoc->FindText(ft->chrg.cpMin, ft->chrg.cpMax, ft->lpstrText, +			(wParam & SCFIND_MATCHCASE) != 0, +			(wParam & SCFIND_WHOLEWORD) != 0, +			(wParam & SCFIND_WORDSTART) != 0, +			(wParam & SCFIND_REGEXP) != 0, +			static_cast<int>(wParam), +			&lengthFound); +		if (pos != -1) { +			ft->chrgText.cpMin = pos; +			ft->chrgText.cpMax = pos + lengthFound; +		} +		return pos; +	} catch (RegexError &) { +		errorStatus = SC_STATUS_WARN_REGEX; +		return -1;  	} -	return pos;  }  /** @@ -3684,22 +3689,27 @@ long Editor::SearchText(  	int lengthFound = istrlen(txt);  	if (!pdoc->HasCaseFolder())  		pdoc->SetCaseFolder(CaseFolderForEncoding()); -	if (iMessage == SCI_SEARCHNEXT) { -		pos = pdoc->FindText(searchAnchor, pdoc->Length(), txt, -		        (wParam & SCFIND_MATCHCASE) != 0, -		        (wParam & SCFIND_WHOLEWORD) != 0, -		        (wParam & SCFIND_WORDSTART) != 0, -		        (wParam & SCFIND_REGEXP) != 0, -		        static_cast<int>(wParam), -		        &lengthFound); -	} else { -		pos = pdoc->FindText(searchAnchor, 0, txt, -		
        (wParam & SCFIND_MATCHCASE) != 0, -		        (wParam & SCFIND_WHOLEWORD) != 0, -		        (wParam & SCFIND_WORDSTART) != 0, -		        (wParam & SCFIND_REGEXP) != 0, -		        static_cast<int>(wParam), -		        &lengthFound); +	try { +		if (iMessage == SCI_SEARCHNEXT) { +			pos = pdoc->FindText(searchAnchor, pdoc->Length(), txt, +					(wParam & SCFIND_MATCHCASE) != 0, +					(wParam & SCFIND_WHOLEWORD) != 0, +					(wParam & SCFIND_WORDSTART) != 0, +					(wParam & SCFIND_REGEXP) != 0, +					static_cast<int>(wParam), +					&lengthFound); +		} else { +			pos = pdoc->FindText(searchAnchor, 0, txt, +					(wParam & SCFIND_MATCHCASE) != 0, +					(wParam & SCFIND_WHOLEWORD) != 0, +					(wParam & SCFIND_WORDSTART) != 0, +					(wParam & SCFIND_REGEXP) != 0, +					static_cast<int>(wParam), +					&lengthFound); +		} +	} catch (RegexError &) { +		errorStatus = SC_STATUS_WARN_REGEX; +		return -1;  	}  	if (pos != -1) {  		SetSelection(pos, pos + lengthFound); @@ -3734,18 +3744,23 @@ long Editor::SearchInTarget(const char *text, int length) {  	if (!pdoc->HasCaseFolder())  		pdoc->SetCaseFolder(CaseFolderForEncoding()); -	int pos = pdoc->FindText(targetStart, targetEnd, text, -	        (searchFlags & SCFIND_MATCHCASE) != 0, -	        (searchFlags & SCFIND_WHOLEWORD) != 0, -	        (searchFlags & SCFIND_WORDSTART) != 0, -	        (searchFlags & SCFIND_REGEXP) != 0, -	        searchFlags, -	        &lengthFound); -	if (pos != -1) { -		targetStart = pos; -		targetEnd = pos + lengthFound; +	try { +		int pos = pdoc->FindText(targetStart, targetEnd, text, +				(searchFlags & SCFIND_MATCHCASE) != 0, +				(searchFlags & SCFIND_WHOLEWORD) != 0, +				(searchFlags & SCFIND_WORDSTART) != 0, +				(searchFlags & SCFIND_REGEXP) != 0, +				searchFlags, +				&lengthFound); +		if (pos != -1) { +			targetStart = pos; +			targetEnd = pos + lengthFound; +		} +		return pos; +	} catch (RegexError &) { +		errorStatus = SC_STATUS_WARN_REGEX; +		return -1;  	} -	return pos;  }  void 
Editor::GoToLine(int lineNo) { diff --git a/src/RESearch.h b/src/RESearch.h index 38875a3f4..3a7f0e4d6 100644 --- a/src/RESearch.h +++ b/src/RESearch.h @@ -33,6 +33,7 @@ class RESearch {  public:  	explicit RESearch(CharClassify *charClassTable);  	~RESearch(); +	void Clear();  	void GrabMatches(CharacterIndexer &ci);  	const char *Compile(const char *pattern, int length, bool caseSensitive, bool posix);  	int Execute(CharacterIndexer &ci, int lp, int endp); @@ -46,7 +47,6 @@ public:  	std::string pat[MAXTAG];  private: -	void Clear();  	void ChSet(unsigned char c);  	void ChSetWithCase(unsigned char c, bool caseSensitive);  	int GetBackslashExpression(const char *pattern, int &incr); diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 2286e047d..d19828a52 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -20,6 +20,7 @@ namespace Scintilla {  enum { SURROGATE_LEAD_FIRST = 0xD800 };  enum { SURROGATE_TRAIL_FIRST = 0xDC00 };  enum { SURROGATE_TRAIL_LAST = 0xDFFF }; +enum { SUPPLEMENTAL_PLANE_FIRST = 0x10000 };  unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {  	unsigned int len = 0; @@ -138,6 +139,51 @@ unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsig  	return ui;  } +unsigned int UTF32FromUTF8(const char *s, unsigned int len, unsigned int *tbuf, unsigned int tlen) { +	unsigned int ui=0; +	const unsigned char *us = reinterpret_cast<const unsigned char *>(s); +	unsigned int i=0; +	while ((i<len) && (ui<tlen)) { +		unsigned char ch = us[i++]; +		wchar_t value = 0; +		if (ch < 0x80) { +			value = ch; +		} else if (((len-i) >= 1) && (ch < 0x80 + 0x40 + 0x20)) { +			value = (ch & 0x1F) << 6; +			ch = us[i++]; +			value += ch & 0x7F; +		} else if (((len-i) >= 2) && (ch < 0x80 + 0x40 + 0x20 + 0x10)) { +			value = (ch & 0xF) << 12; +			ch = us[i++]; +			value += (ch & 0x7F) << 6; +			ch = us[i++]; +			value += ch & 0x7F; +		} else if ((len-i) >= 3) { +			value = (ch & 0x7) << 18; +			ch = us[i++]; 
+			value += (ch & 0x3F) << 12; +			ch = us[i++]; +			value += (ch & 0x3F) << 6; +			ch = us[i++]; +			value += ch & 0x3F; +		} +		tbuf[ui] = value; +		ui++; +	} +	return ui; +} + +unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) { +	if (val < SUPPLEMENTAL_PLANE_FIRST) { +		tbuf[0] = static_cast<wchar_t>(val); +		return 1; +	} else { +		tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST); +		tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST); +		return 2; +	} +} +  int UTF8BytesOfLead[256];  static bool initialisedBytesOfLead = false; diff --git a/src/UniConversion.h b/src/UniConversion.h index 753490bab..760f50476 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -14,11 +14,15 @@ namespace Scintilla {  const int UTF8MaxBytes = 4; +const int unicodeReplacementChar = 0xFFFD; +  unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen);  void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len);  unsigned int UTF8CharLength(unsigned char ch);  unsigned int UTF16Length(const char *s, unsigned int len);  unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen); +unsigned int UTF32FromUTF8(const char *s, unsigned int len, unsigned int *tbuf, unsigned int tlen); +unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf);  extern int UTF8BytesOfLead[256];  void UTF8BytesOfLeadInitialise(); diff --git a/win32/makefile b/win32/makefile index 299c3c32c..bd808fe41 100644 --- a/win32/makefile +++ b/win32/makefile @@ -27,6 +27,10 @@ endif  CXXBASEFLAGS=--std=c++0x -Wall -pedantic $(INCLUDEDIRS) -fno-rtti $(D2DFLAGS) +ifdef CXX11_REGEX +REFLAGS=-DCXX11_REGEX +endif +  ifdef DEBUG  CXXFLAGS=-DDEBUG -g $(CXXBASEFLAGS)  else @@ -35,7 +39,7 @@ STRIPFLAG=-s  endif  .cxx.o: -	$(CXX) $(CXXFLAGS) -c $< +	$(CXX) $(CXXFLAGS) $(REFLAGS) -c $<  ALL:	$(COMPONENT) $(LEXCOMPONENT) $(LEXLIB) ScintillaWinS.o diff 
--git a/win32/scintilla.mak b/win32/scintilla.mak index aef73f572..43679cbc7 100644 --- a/win32/scintilla.mak +++ b/win32/scintilla.mak @@ -39,6 +39,10 @@ CXXFLAGS=$(CXXFLAGS) -DDISABLE_D2D  !MESSAGE Direct2D is not available  !ENDIF +!IFDEF CXX11_REGEX +CXXFLAGS=$(CXXFLAGS) -DCXX11_REGEX +!ENDIF +  !IFDEF DEBUG  CXXFLAGS=$(CXXFLAGS) $(CXXDEBUG)  LDFLAGS=$(LDDEBUG) $(LDFLAGS) | 
