diff options
| -rw-r--r-- | lexlib/LexAccessor.h | 21 | ||||
| -rw-r--r-- | lexlib/StyleContext.h | 97 | ||||
| -rw-r--r-- | lexlib/SubStyles.h | 158 | 
3 files changed, 260 insertions, 16 deletions
| diff --git a/lexlib/LexAccessor.h b/lexlib/LexAccessor.h index 6458525cc..59ae11346 100644 --- a/lexlib/LexAccessor.h +++ b/lexlib/LexAccessor.h @@ -12,6 +12,8 @@  namespace Scintilla {  #endif +enum EncodingType { enc8bit, encUnicode, encDBCS }; +  class LexAccessor {  private:  	IDocument *pAccess; @@ -25,7 +27,7 @@ private:  	int startPos;  	int endPos;  	int codePage; -	enum { enc8bit, encUnicode, encDBCS } encodingType; +	enum EncodingType encodingType;  	int lenDoc;  	int mask;  	char styleBuf[bufferSize]; @@ -91,7 +93,9 @@ public:  	bool IsLeadByte(char ch) {  		return pAccess->IsDBCSLeadByte(ch);  	} - +	EncodingType Encoding() const { +		return encodingType; +	}  	bool Match(int pos, const char *s) {  		for (int i=0; *s; i++) {  			if (*s != SafeGetCharAt(pos+i)) @@ -109,6 +113,19 @@ public:  	int LineStart(int line) {  		return pAccess->LineStart(line);  	} +	int LineEnd(int line) { +		if (documentVersion >= dvLineEnd) { +			return (static_cast<IDocumentWithLineEnd *>(pAccess))->LineEnd(line); +		} else { +			// Old interface means only '\r', '\n' and '\r\n' line ends. +			int startNext = pAccess->LineStart(line+1); +			char chLineEnd = SafeGetCharAt(startNext-1); +			if (chLineEnd == '\n' && (SafeGetCharAt(startNext-2)  == '\r')) +				return startNext - 2; +			else +				return startNext - 1; +		} +	}  	int LevelAt(int line) {  		return pAccess->GetLevel(line);  	} diff --git a/lexlib/StyleContext.h b/lexlib/StyleContext.h index c2d223e3f..9f1818f21 100644 --- a/lexlib/StyleContext.h +++ b/lexlib/StyleContext.h @@ -19,6 +19,30 @@ static inline int MakeLowerCase(int ch) {  		return ch - 'A' + 'a';  } +inline int UnicodeCodePoint(const unsigned char *us) { +	if (us[0] < 0xC2) { +		return us[0]; +	} else if (us[0] < 0xE0) { +		return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F); +	} else if (us[0] < 0xF0) { +		return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F); +	} else if (us[0] < 0xF5) { +		return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F); +	} +	return us[0]; +} + +inline int BytesInUnicodeCodePoint(int codePoint) { +	if (codePoint < 0x80) +		return 1; +	else if (codePoint < 0x800) +		return 2; +	else if (codePoint < 0x10000) +		return 3; +	else +		return 4; +} +  // All languages handled so far can treat all characters >= 0x80 as one class  // which just continues the current token or starts an identifier if in default.  // DBCS treated specially as the second character can be < 0x80 and hence @@ -27,22 +51,40 @@ class StyleContext {  	LexAccessor &styler;  	unsigned int endPos;  	StyleContext &operator=(const StyleContext &); +  	void GetNextChar(unsigned int pos) {  		chNext = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1)); -		if (styler.IsLeadByte(static_cast<char>(chNext))) { -			chNext = chNext << 8; -			chNext |= static_cast<unsigned char>(styler.SafeGetCharAt(pos+2)); +		if (styler.Encoding() == encUnicode) { +			if (chNext >= 0x80) { +				unsigned char bytes[4] = { static_cast<unsigned char>(chNext), 0, 0, 0 }; +				for (int trail=1; trail<3; trail++) { +					bytes[trail] = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1+trail)); +					if (!((bytes[trail] >= 0x80) && (bytes[trail] < 0xc0))) { +						bytes[trail] = 0; +						break; +					} +				} +				chNext = UnicodeCodePoint(bytes); +			} +		} else if (styler.Encoding() == encDBCS) { +			if (styler.IsLeadByte(static_cast<char>(chNext))) { +				chNext = chNext << 8; +				chNext |= static_cast<unsigned char>(styler.SafeGetCharAt(pos+2)); +			}  		}  		// End of line?  		// Trigger on CR only (Mac style) or either on LF from CR+LF (Dos/Win)  		// or on LF alone (Unix). Avoid triggering two times on Dos/Win. -		atLineEnd = (ch == '\r' && chNext != '\n') || -					(ch == '\n') || -					(currentPos >= endPos); +		if (lineStartNext < styler.Length()) +			atLineEnd = static_cast<int>(pos) >= (lineStartNext-1); +		else // Last line +			atLineEnd = static_cast<int>(pos) >= lineStartNext;  	}  public:  	unsigned int currentPos; +	int currentLine; +	int lineStartNext;  	bool atLineStart;  	bool atLineEnd;  	int state; @@ -55,6 +97,8 @@ public:  		styler(styler_),  		endPos(startPos + length),  		currentPos(startPos), +		currentLine(-1), +		lineStartNext(-1),  		atLineEnd(false),  		state(initStyle & chMask), // Mask off all bits which aren't in the chMask.  		chPrev(0), @@ -62,13 +106,22 @@ public:  		chNext(0) {  		styler.StartAt(startPos, chMask);  		styler.StartSegment(startPos); -		atLineStart = static_cast<unsigned int>(styler.LineStart(styler.GetLine(startPos))) == startPos; +		currentLine = styler.GetLine(startPos); +		lineStartNext = styler.LineStart(currentLine+1); +		atLineStart = static_cast<unsigned int>(styler.LineStart(currentLine)) == startPos;  		unsigned int pos = currentPos;  		ch = static_cast<unsigned char>(styler.SafeGetCharAt(pos)); -		if (styler.IsLeadByte(static_cast<char>(ch))) { -			pos++; -			ch = ch << 8; -			ch |= static_cast<unsigned char>(styler.SafeGetCharAt(pos)); +		if (styler.Encoding() == encUnicode) { +			// Get the current char +			GetNextChar(pos-1); +			ch = chNext; +			pos += BytesInUnicodeCodePoint(ch) - 1; +		} else if (styler.Encoding() == encDBCS) { +			if (styler.IsLeadByte(static_cast<char>(ch))) { +				pos++; +				ch = ch << 8; +				ch |= static_cast<unsigned char>(styler.SafeGetCharAt(pos)); +			}  		}  		GetNextChar(pos);  	} @@ -82,12 +135,28 @@ public:  	void Forward() {  		if (currentPos < endPos) {  			atLineStart = atLineEnd; +			if (atLineStart) { +				currentLine++; +				lineStartNext = styler.LineStart(currentLine+1); +			}  			chPrev = ch; -			currentPos++; -			if (ch >= 0x100) +			if (styler.Encoding() == encUnicode) { +				currentPos += BytesInUnicodeCodePoint(ch); +			} else if (styler.Encoding() == encDBCS) { +				currentPos++; +				if (ch >= 0x100) +					currentPos++; +			} else {  				currentPos++; +			}  			ch = chNext; -			GetNextChar(currentPos + ((ch >= 0x100) ? 1 : 0)); +			if (styler.Encoding() == encUnicode) { +				GetNextChar(currentPos + BytesInUnicodeCodePoint(ch)-1); +			} else if (styler.Encoding() == encDBCS) { +				GetNextChar(currentPos + ((ch >= 0x100) ? 1 : 0)); +			} else { +				GetNextChar(currentPos); +			}  		} else {  			atLineStart = false;  			chPrev = ' '; diff --git a/lexlib/SubStyles.h b/lexlib/SubStyles.h new file mode 100644 index 000000000..7dc7804ef --- /dev/null +++ b/lexlib/SubStyles.h @@ -0,0 +1,158 @@ +// Scintilla source code edit control +/** @file SubStyles.h + ** Manage substyles for a lexer. + **/ +// Copyright 2012 by Neil Hodgson <neilh@scintilla.org> +// The License.txt file describes the conditions under which this software may be distributed. + +#ifndef SUBSTYLES_H +#define SUBSTYLES_H + +#ifdef SCI_NAMESPACE +namespace Scintilla { +#endif + +class WordClassifier { +	int firstStyle; +	int lenStyles; +	std::map<std::string, int> wordToStyle; + +public: + +	WordClassifier() : firstStyle(0), lenStyles(0) { +	} + +	void Allocate(int firstStyle_, int lenStyles_) { +		firstStyle = firstStyle_; +		lenStyles = lenStyles_; +		wordToStyle.clear(); +	} + +	int Start() const { +		return firstStyle; +	} + +	int Length() const { +		return lenStyles; +	} + +	void Clear() { +		firstStyle = 0; +		lenStyles = 0; +		wordToStyle.clear(); +	} + +	int ValueFor(const std::string &s) const { +		std::map<std::string, int>::const_iterator it = wordToStyle.find(s); +		if (it != wordToStyle.end()) +			return it->second; +		else +			return -1; +	} + +	bool IncludesStyle(int style) const { +		return (style >= firstStyle) && (style < (firstStyle + lenStyles)); +	} + +	void SetIdentifiers(int style, const char *identifiers) { +		while (*identifiers) { +			const char *cpSpace = identifiers; +			while (*cpSpace && *cpSpace != ' ') +				cpSpace++; +			std::string word(identifiers, cpSpace - identifiers); +			wordToStyle[word] = style; +			identifiers = cpSpace; +			if (*identifiers) +				identifiers++; +		} +	} +}; + +class SubStyles { +	int classifications; +	const char *baseStyles; +	int styleFirst; +	int stylesAvailable; +	int secondaryDistance; +	int allocated; +	std::vector<WordClassifier> classifiers; + +	int BlockFromBaseStyle(int baseStyle) const { +		for (int b=0; b < classifications; b++) { +			if (baseStyle == baseStyles[b]) +				return b; +		} +		return -1; +	} + +	int BlockFromStyle(int style) const { +		int b = 0; +		for (std::vector<WordClassifier>::const_iterator it=classifiers.begin(); it != classifiers.end(); ++it) { +			if (it->IncludesStyle(style)) +				return b; +			b++; +		} +		return -1; +	} + +public: + +	SubStyles(const char *baseStyles_, int styleFirst_, int stylesAvailable_, int secondaryDistance_) : +		classifications(0), +		baseStyles(baseStyles_), +		styleFirst(styleFirst_), +		stylesAvailable(stylesAvailable_), +		secondaryDistance(secondaryDistance_), +		allocated(0) { +		while (baseStyles[classifications]) { +			classifications++; +			classifiers.push_back(WordClassifier()); +		} +	} + +	int Allocate(int styleBase, int numberStyles) { +		int block = BlockFromBaseStyle(styleBase); +		if (block >= 0) { +			if ((allocated + numberStyles) > stylesAvailable) +				return -1; +			int startBlock = styleFirst + allocated; +			allocated += numberStyles; +			classifiers[block].Allocate(startBlock, numberStyles); +			return startBlock; +		} else { +			return -1; +		} +	} + +	int Start(int styleBase) { +		int block = BlockFromBaseStyle(styleBase); +		return (block >= 0) ? classifiers[block].Start() : -1; +	} + +	int Length(int styleBase) { +		int block = BlockFromBaseStyle(styleBase); +		return (block >= 0) ? classifiers[block].Length() : 0; +	} + +	void SetIdentifiers(int style, const char *identifiers) { +		int block = BlockFromStyle(style); +		if (block >= 0) +			classifiers[block].SetIdentifiers(style, identifiers); +	} + +	void Free() { +		allocated = 0; +		for (std::vector<WordClassifier>::iterator it=classifiers.begin(); it != classifiers.end(); ++it) +			it->Clear(); +	} + +	const WordClassifier &Classifier(int baseStyle) const { +		return classifiers[BlockFromBaseStyle(baseStyle)]; +	} +}; + +#ifdef SCI_NAMESPACE +} +#endif + +#endif | 
