diff options
| author | nyamatongwe <unknown> | 2013-06-27 17:22:43 +1000 | 
|---|---|---|
| committer | nyamatongwe <unknown> | 2013-06-27 17:22:43 +1000 | 
| commit | 35639b0cec5c5e293355c8660a7f36dfb71b5651 (patch) | |
| tree | 0f9d1f7d614ffda7ff98d5246aca5bfaaa013fd5 | |
| parent | 02e0b189ea2eb78f301b98fa2363943980a96a28 (diff) | |
| download | scintilla-mirror-35639b0cec5c5e293355c8660a7f36dfb71b5651.tar.gz | |
Bug: [#1483]. Adding StyleContext::GetRelativeCharacter for character-oriented access.
Implemented using new method IDocumentWithLineEnd::GetRelativePosition.
| -rw-r--r-- | doc/ScintillaDoc.html | 7 | ||||
| -rw-r--r-- | include/ILexer.h | 1 | ||||
| -rw-r--r-- | lexlib/LexAccessor.h | 15 | ||||
| -rw-r--r-- | lexlib/StyleContext.h | 115 | ||||
| -rw-r--r-- | src/Document.cxx | 65 | ||||
| -rw-r--r-- | src/Document.h | 1 | 
6 files changed, 147 insertions, 57 deletions
| diff --git a/doc/ScintillaDoc.html b/doc/ScintillaDoc.html index 281bbf957..abec92a5b 100644 --- a/doc/ScintillaDoc.html +++ b/doc/ScintillaDoc.html @@ -6321,13 +6321,18 @@ exception options.</p>  <p>  To allow lexers to determine the end position of a line and thus more easily support Unicode line ends -<code>IDocument</code> is extended to <code>IDocumentWithLineEnd</code>. +<code>IDocument</code> is extended to <code>IDocumentWithLineEnd</code>.</p> +<p>The <code>GetRelativePosition</code> method allows navigating the document by whole characters and provides a standard +conversion from UTF-8 bytes to a UTF-32 character or from DBCS to a 16 bit value. +Invalid UTF-8 is reported as a character for each byte with values 0xDC80+byteValue, which are +not valid Unicode code points.  </p>  <div class="highlighted">  <span class="S5">class</span><span class="S0"> </span>IDocumentWithLineEnd<span class="S0"> </span><span class="S10">:</span><span class="S0"> </span><span class="S5">public</span><span class="S0"> </span>IDocument<span class="S0"> </span><span class="S10">{</span><br />  <span class="S5">public</span><span class="S10">:</span><br />  <span class="S0">        </span><span class="S5">virtual</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span>SCI_METHOD<span class="S0"> </span>LineEnd<span class="S10">(</span><span class="S5">int</span><span class="S0"> </span>line<span class="S10">)</span><span class="S0"> </span><span class="S5">const</span><span class="S0"> </span><span class="S10">=</span><span class="S0"> </span><span class="S4">0</span><span class="S10">;</span><br /> +<span class="S0">        </span><span class="S5">virtual</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span>SCI_METHOD<span class="S0"> </span>GetRelativePosition<span class="S10">(</span><span class="S5">int</span><span class="S0"> </span>start<span class="S10">,</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span>characterOffset<span class="S10">,</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span><span class="S10">*</span>character<span class="S10">,</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span><span class="S10">*</span>width<span class="S10">)</span><span class="S0"> </span><span class="S5">const</span><span class="S0"> </span><span class="S10">=</span><span class="S0"> </span><span class="S4">0</span><span class="S10">;</span><br />  <span class="S10">};</span><br />  </div> diff --git a/include/ILexer.h b/include/ILexer.h index 1260c1373..9f9225ef2 100644 --- a/include/ILexer.h +++ b/include/ILexer.h @@ -48,6 +48,7 @@ public:  class IDocumentWithLineEnd : public IDocument {  public:  	virtual int SCI_METHOD LineEnd(int line) const = 0; +	virtual int SCI_METHOD GetRelativePosition(int start, int characterOffset, int *character, int *width) const = 0;  };  enum { lvOriginal=0, lvSubStyles=1 }; diff --git a/lexlib/LexAccessor.h b/lexlib/LexAccessor.h index 4223f302d..92e719360 100644 --- a/lexlib/LexAccessor.h +++ b/lexlib/LexAccessor.h @@ -126,6 +126,21 @@ public:  				return startNext - 1;  		}  	} +	int GetRelativePosition(int start, int characterOffset, int *character, int *width) { +		if (documentVersion >= dvLineEnd) { +			return (static_cast<IDocumentWithLineEnd *>(pAccess))->GetRelativePosition( +				start, characterOffset, character, width); +		} else { +			// Old version -> byte-oriented only +			// Handle doc range overflow +			int posNew = start + characterOffset; +			if ((posNew < 0) || (posNew > Length())) +				return -1; +			*character = SafeGetCharAt(posNew, 0); +			*width = 1; +			return start + characterOffset; +		} +	}  	int LevelAt(int line) const {  		return pAccess->GetLevel(line);  	} diff --git a/lexlib/StyleContext.h b/lexlib/StyleContext.h index 2c010645b..0b5dee379 100644 --- a/lexlib/StyleContext.h +++ b/lexlib/StyleContext.h @@ -51,35 +51,27 @@ class StyleContext {  	LexAccessor &styler;  	unsigned int endPos;  	unsigned int lengthDocument; +	 +	// Used for optimizing GetRelativeCharacter +	unsigned int posRelative; +	unsigned int currentPosLastRelative; +	int offsetRelative; +  	StyleContext &operator=(const StyleContext &); -	void GetNextChar(unsigned int pos) { -		chNext = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1, 0)); -		if (styler.Encoding() == encUnicode) { -			if (chNext >= 0x80) { -				unsigned char bytes[4] = { static_cast<unsigned char>(chNext), 0, 0, 0 }; -				for (int trail=1; trail<3; trail++) { -					bytes[trail] = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1+trail, 0)); -					if (!((bytes[trail] >= 0x80) && (bytes[trail] < 0xc0))) { -						bytes[trail] = 0; -						break; -					} -				} -				chNext = UnicodeCodePoint(bytes); -			} -		} else if (styler.Encoding() == encDBCS) { -			if (styler.IsLeadByte(static_cast<char>(chNext))) { -				chNext = chNext << 8; -				chNext |= static_cast<unsigned char>(styler.SafeGetCharAt(pos+2, 0)); -			} +	void GetNextChar() { +		if (styler.Encoding() == enc8bit) { +			chNext = static_cast<unsigned char>(styler.SafeGetCharAt(currentPos+width, 0)); +			widthNext = 1; +		} else { +			styler.GetRelativePosition(currentPos+width, 0, &chNext, &widthNext);  		} -		// End of line? -		// Trigger on CR only (Mac style) or either on LF from CR+LF (Dos/Win) -		// or on LF alone (Unix). Avoid triggering two times on Dos/Win. +		// End of line determined from line end position, allowing CR, LF,  +		// CRLF and Unicode line ends as set by document.  		if (currentLine < lineDocEnd) -			atLineEnd = static_cast<int>(pos) >= (lineStartNext-1); +			atLineEnd = static_cast<int>(currentPos) >= (lineStartNext-1);  		else // Last line -			atLineEnd = static_cast<int>(pos) >= lineStartNext; +			atLineEnd = static_cast<int>(currentPos) >= lineStartNext;  	}  public: @@ -92,12 +84,17 @@ public:  	int state;  	int chPrev;  	int ch; +	int width;  	int chNext; +	int widthNext;  	StyleContext(unsigned int startPos, unsigned int length,                          int initStyle, LexAccessor &styler_, char chMask=31) :  		styler(styler_),  		endPos(startPos + length), +		posRelative(0), +		currentPosLastRelative(0x7FFFFFFF), +		offsetRelative(0),  		currentPos(startPos),  		currentLine(-1),  		lineStartNext(-1), @@ -105,7 +102,9 @@ public:  		state(initStyle & chMask), // Mask off all bits which aren't in the chMask.  		chPrev(0),  		ch(0), -		chNext(0) { +		width(0), +		chNext(0), +		widthNext(1) {  		styler.StartAt(startPos, chMask);  		styler.StartSegment(startPos);  		currentLine = styler.GetLine(startPos); @@ -115,21 +114,14 @@ public:  			endPos++;  		lineDocEnd = styler.GetLine(lengthDocument);  		atLineStart = static_cast<unsigned int>(styler.LineStart(currentLine)) == startPos; -		unsigned int pos = currentPos; -		ch = static_cast<unsigned char>(styler.SafeGetCharAt(pos, 0)); -		if (styler.Encoding() == encUnicode) { -			// Get the current char -			GetNextChar(pos-1); -			ch = chNext; -			pos += BytesInUnicodeCodePoint(ch) - 1; -		} else if (styler.Encoding() == encDBCS) { -			if (styler.IsLeadByte(static_cast<char>(ch))) { -				pos++; -				ch = ch << 8; -				ch |= static_cast<unsigned char>(styler.SafeGetCharAt(pos, 0)); -			} -		} -		GetNextChar(pos); + +		// Variable width is now 0 so GetNextChar gets the char at currentPos into chNext/widthNext +		width = 0; +		GetNextChar(); +		ch = chNext; +		width = widthNext; + +		GetNextChar();  	}  	void Complete() {  		styler.ColourTo(currentPos - ((currentPos > lengthDocument) ? 2 : 1), state); @@ -146,23 +138,10 @@ public:  				lineStartNext = styler.LineStart(currentLine+1);  			}  			chPrev = ch; -			if (styler.Encoding() == encUnicode) { -				currentPos += BytesInUnicodeCodePoint(ch); -			} else if (styler.Encoding() == encDBCS) { -				currentPos++; -				if (ch >= 0x100) -					currentPos++; -			} else { -				currentPos++; -			} +			currentPos += width;  			ch = chNext; -			if (styler.Encoding() == encUnicode) { -				GetNextChar(currentPos + BytesInUnicodeCodePoint(ch)-1); -			} else if (styler.Encoding() == encDBCS) { -				GetNextChar(currentPos + ((ch >= 0x100) ? 1 : 0)); -			} else { -				GetNextChar(currentPos); -			} +			width = widthNext; +			GetNextChar();  		} else {  			atLineStart = false;  			chPrev = ' '; @@ -200,6 +179,30 @@ public:  	int GetRelative(int n) {  		return static_cast<unsigned char>(styler.SafeGetCharAt(currentPos+n, 0));  	} +	int GetRelativeCharacter(int n) { +		if (n == 0) +			return ch; +		if (styler.Encoding() == enc8bit) { +			// fast version for single byte encodings +			return static_cast<unsigned char>(styler.SafeGetCharAt(currentPos + n, 0)); +		} else { +			int ch = 0; +			int width = 0; +			//styler.GetRelativePosition(currentPos, n, &ch, &width); +			if ((currentPosLastRelative != currentPos) || +				((n > 0) && ((offsetRelative < 0) || (n < offsetRelative))) || +				((n < 0) && ((offsetRelative > 0) || (n > offsetRelative)))) { +				posRelative = currentPos; +				offsetRelative = 0; +			} +			int diffRelative = n - offsetRelative; +			int posNew = styler.GetRelativePosition(posRelative, diffRelative, &ch, &width); +			posRelative = posNew; +			currentPosLastRelative = currentPos; +			offsetRelative = n; +			return ch; +		} +	}  	bool Match(char ch0) const {  		return ch == static_cast<unsigned char>(ch0);  	} diff --git a/src/Document.cxx b/src/Document.cxx index 8523a00fa..472567068 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -699,6 +699,71 @@ bool Document::NextCharacter(int &pos, int moveDir) const {  	}  } +static inline int UnicodeFromBytes(const unsigned char *us) { +	if (us[0] < 0xC2) { +		return us[0]; +	} else if (us[0] < 0xE0) { +		return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F); +	} else if (us[0] < 0xF0) { +		return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F); +	} else if (us[0] < 0xF5) { +		return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F); +	} +	return us[0]; +} + +// Return -1  on out-of-bounds +int SCI_METHOD Document::GetRelativePosition(int start, int characterOffset, int *character, int *width) const { +	int pos = start; +	if (dbcsCodePage) { +		const int increment = (characterOffset > 0) ? 1 : -1; +		while (characterOffset != 0) { +			const int posNext = NextPosition(pos, increment); +			if (posNext == pos) +				return -1; +			pos = posNext; +			characterOffset -= increment; +		} +		const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(pos)); +		if (SC_CP_UTF8 == dbcsCodePage) { +			if (UTF8IsAscii(leadByte)) { +				// Single byte character or invalid +				*character = leadByte; +				*width = 1; +			} else { +				const int widthCharBytes = UTF8BytesOfLead[leadByte]; +				unsigned char charBytes[UTF8MaxBytes] = {leadByte,0,0,0}; +				for (int b=1; b<widthCharBytes; b++) +					charBytes[b] = static_cast<unsigned char>(cb.CharAt(pos+b)); +				int utf8status = UTF8Classify(charBytes, widthCharBytes); +				if (utf8status & UTF8MaskInvalid) { +					// Report as singleton surrogate values which are invalid in Unicode +					*character = 0xDC80 + leadByte; +					*width = 1; +				} else { +					*character = UnicodeFromBytes(charBytes); +					*width = utf8status & UTF8MaskWidth; +				} +			} +		} else if (dbcsCodePage) { +			if (IsDBCSLeadByte(leadByte)) { +				*character = (leadByte << 8) | static_cast<unsigned char>(cb.CharAt(pos+1)); +				*width = 2; +			} else { +				*character = leadByte; +				*width = 1; +			} +		} +	} else { +		pos = start + characterOffset; +		if ((pos < 0) || (pos > Length())) +			return -1; +		*character = cb.CharAt(pos); +		*width = 1; +	} +	return pos; +} +  int SCI_METHOD Document::CodePage() const {  	return dbcsCodePage;  } diff --git a/src/Document.h b/src/Document.h index f3b49e1fe..8eb8db74a 100644 --- a/src/Document.h +++ b/src/Document.h @@ -279,6 +279,7 @@ public:  	int MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd=true);  	int NextPosition(int pos, int moveDir) const;  	bool NextCharacter(int &pos, int moveDir) const;	// Returns true if pos changed +	int SCI_METHOD GetRelativePosition(int start, int characterOffset, int *character, int *width) const;  	int SCI_METHOD CodePage() const;  	bool SCI_METHOD IsDBCSLeadByte(char ch) const;  	int SafeSegment(const char *text, int length, int lengthSegment) const; | 
