diff options
-rw-r--r-- | doc/ScintillaDoc.html | 13 | ||||
-rw-r--r-- | doc/ScintillaHistory.html | 7 | ||||
-rw-r--r-- | src/Document.cxx | 462 | ||||
-rw-r--r-- | src/Document.h | 36 | ||||
-rw-r--r-- | src/EditView.cxx | 25 |
5 files changed, 396 insertions, 147 deletions
diff --git a/doc/ScintillaDoc.html b/doc/ScintillaDoc.html index 3b2c480d0..adf3b9907 100644 --- a/doc/ScintillaDoc.html +++ b/doc/ScintillaDoc.html @@ -2322,8 +2322,13 @@ struct Sci_TextToFind { Line ends are not selected by double clicking but do act as word separators. </p> - <p>Words are defined in terms of bytes, not characters so there are some issues with - UTF-8 and DCBS documents.</p> + <p>Words are defined in terms of characters and the sets of characters in each category can be customized to an extent. + The NUL character (0) is always a space as the APIs to set categories use NUL-terminated strings. + For single-byte encodings a category may be assigned to any character (1 to 0xFF). + For multi-byte encodings a category may be assigned to characters from 1 to 0x7F with static behaviour from 0x80. + For UTF-8, characters from 0x80 will use a category based on their Unicode general category. + For Asian encodings, code pages 932, 936, 949, 950, and 1361, characters from 0x80 are treated as word characters. + </p> <p>Identifiers in programming languages are often sequences of words with capitalisation (aCamelCaseIdentifier) or underscores (an_under_bar_ident) used to mark word boundaries. @@ -2437,7 +2442,7 @@ struct Sci_TextToFind { </table> <p><b id="SCI_SETWORDCHARS">SCI_SETWORDCHARS(<unused>, const char *characters)</b><br /> - This message defines which characters (bytes) are members of the word category. + This message defines which characters are members of the word category. The character categories are set to default values before processing this function. For example, if you don't allow '_' in your set of characters use:<br /> @@ -2449,6 +2454,8 @@ struct Sci_TextToFind { If the characters parameter is 0 then the length that should be allocated to store the entire set is returned.</p> + <p>For multi-byte encodings, this API will not return meaningful values for 0x80 and above.</p> + <p><b id="SCI_SETWHITESPACECHARS">SCI_SETWHITESPACECHARS(<unused>, const char *characters)</b><br /> <b id="SCI_GETWHITESPACECHARS">SCI_GETWHITESPACECHARS(<unused>, char *characters) → int</b><br /> Similar to <code>SCI_SETWORDCHARS</code>, this message allows the user to define which chars Scintilla considers diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html index fb004776b..a134035dc 100644 --- a/doc/ScintillaHistory.html +++ b/doc/ScintillaHistory.html @@ -523,6 +523,13 @@ Released 4 September 2016. </li> <li> + Word selection, navigation, and manipulation is now performed on characters instead of bytes + leading to more natural behaviour for multi-byte encodings like UTF-8. + For UTF-8 characters 0x80 and above, classification into word; punctuation; space; or line-end + is based on the Unicode general category of the character and is not customizable. + <a href="http://sourceforge.net/p/scintilla/bugs/1832/">Bug #1832</a>. + </li> + <li> Two enums changed in Scintilla.iface which may lead to changed bindings. There were 2 FontQuality enums and the first is now PhasesDraw. The prefix for FoldAction was SC_FOLDACTION and is now SC_FOLDACTION_ diff --git a/src/Document.cxx b/src/Document.cxx index f10e40aad..58f663376 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -26,6 +26,7 @@ #include "Scintilla.h" #include "CharacterSet.h" +#include "CharacterCategory.h" #include "Position.h" #include "SplitVector.h" #include "Partitioning.h" @@ -44,10 +45,6 @@ using namespace Scintilla; #endif -static inline bool IsPunctuation(char ch) { - return IsASCII(ch) && ispunct(ch); -} - void LexInterface::Colourise(int start, int end) { if (pdoc && instance && !performingStyle) { // Protect against reentrance, which may occur, for example, when @@ -771,6 +768,77 @@ bool Document::NextCharacter(int &pos, int moveDir) const { } } +Document::CharacterExtracted Document::CharacterAfter(int position) const { + if (position >= Length()) { + return CharacterExtracted(unicodeReplacementChar, 0); + } + const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(position)); + if (!dbcsCodePage || UTF8IsAscii(leadByte)) { + // Common case: ASCII character + return CharacterExtracted(leadByte, 1); + } + if (SC_CP_UTF8 == dbcsCodePage) { + const int widthCharBytes = UTF8BytesOfLead[leadByte]; + unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 }; + for (int b = 1; b<widthCharBytes; b++) + charBytes[b] = static_cast<unsigned char>(cb.CharAt(position + b)); + int utf8status = UTF8Classify(charBytes, widthCharBytes); + if (utf8status & UTF8MaskInvalid) { + // Treat as invalid and use up just one byte + return CharacterExtracted(unicodeReplacementChar, 1); + } else { + return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth); + } + } else { + if (IsDBCSLeadByte(leadByte) && ((position + 1) < Length())) { + return CharacterExtracted::DBCS(leadByte, static_cast<unsigned char>(cb.CharAt(position + 1))); + } else { + return CharacterExtracted(leadByte, 1); + } + } +} + +Document::CharacterExtracted Document::CharacterBefore(int position) const { + if (position <= 0) { + return CharacterExtracted(unicodeReplacementChar, 0); + } + const unsigned char previousByte = static_cast<unsigned char>(cb.CharAt(position - 1)); + if (0 == dbcsCodePage) { + return CharacterExtracted(previousByte, 1); + } + if (SC_CP_UTF8 == dbcsCodePage) { + if (UTF8IsAscii(previousByte)) { + return CharacterExtracted(previousByte, 1); + } + position--; + // If previousByte is not a trail byte then its invalid + if (UTF8IsTrailByte(previousByte)) { + // If previousByte is a trail byte in a valid UTF-8 character then find start of character + int startUTF = position; + int endUTF = position; + if (InGoodUTF8(position, startUTF, endUTF)) { + const int widthCharBytes = endUTF - startUTF; + unsigned char charBytes[UTF8MaxBytes] = { 0, 0, 0, 0 }; + for (int b = 0; b<widthCharBytes; b++) + charBytes[b] = static_cast<unsigned char>(cb.CharAt(startUTF + b)); + int utf8status = UTF8Classify(charBytes, widthCharBytes); + if (utf8status & UTF8MaskInvalid) { + // Treat as invalid and use up just one byte + return CharacterExtracted(unicodeReplacementChar, 1); + } else { + return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth); + } + } + // Else invalid UTF-8 so return position of isolated trail byte + } + return CharacterExtracted(unicodeReplacementChar, 1); + } else { + // Moving backwards in DBCS is complex so use NextPosition + const int posStartCharacter = NextPosition(position, -1); + return CharacterAfter(posStartCharacter); + } +} + // Return -1 on out-of-bounds Sci_Position SCI_METHOD Document::GetRelativePosition(Sci_Position positionStart, Sci_Position characterOffset) const { int pos = positionStart; @@ -1485,28 +1553,104 @@ int Document::ParaDown(int pos) const { return LineEnd(line-1); } -CharClassify::cc Document::WordCharClass(unsigned char ch) const { - if ((SC_CP_UTF8 == dbcsCodePage) && (!UTF8IsAscii(ch))) - return CharClassify::ccWord; - return charClass.GetClass(ch); +bool Document::IsASCIIWordByte(unsigned char ch) const { + if (IsASCII(ch)) { + return charClass.GetClass(ch) == CharClassify::ccWord; + } else { + return false; + } +} + +CharClassify::cc Document::WordCharacterClass(unsigned int ch) const { + if (dbcsCodePage && (!UTF8IsAscii(ch))) { + if (SC_CP_UTF8 == dbcsCodePage) { + // Use hard coded Unicode class + const CharacterCategory cc = CategoriseCharacter(ch); + switch (cc) { + + // Separator, Line/Paragraph + case ccZl: + case ccZp: + return CharClassify::ccNewLine; + + // Separator, Space + case ccZs: + // Other + case ccCc: + case ccCf: + case ccCs: + case ccCo: + case ccCn: + return CharClassify::ccSpace; + + // Letter + case ccLu: + case ccLl: + case ccLt: + case ccLm: + case ccLo: + // Number + case ccNd: + case ccNl: + case ccNo: + // Mark - includes combining diacritics + case ccMn: + case ccMc: + case ccMe: + return CharClassify::ccWord; + + // Punctuation + case ccPc: + case ccPd: + case ccPs: + case ccPe: + case ccPi: + case ccPf: + case ccPo: + // Symbol + case ccSm: + case ccSc: + case ccSk: + case ccSo: + return CharClassify::ccPunctuation; + + } + } else { + // Asian DBCS + return CharClassify::ccWord; + } + } + return charClass.GetClass(static_cast<unsigned char>(ch)); } /** * Used by commmands that want to select whole words. * Finds the start of word at pos when delta < 0 or the end of the word when delta >= 0. */ -int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) { +int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) const { CharClassify::cc ccStart = CharClassify::ccWord; if (delta < 0) { - if (!onlyWordCharacters) - ccStart = WordCharClass(cb.CharAt(pos-1)); - while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == ccStart)) - pos--; + if (!onlyWordCharacters) { + const CharacterExtracted ce = CharacterBefore(pos); + ccStart = WordCharacterClass(ce.character); + } + while (pos > 0) { + const CharacterExtracted ce = CharacterBefore(pos); + if (WordCharacterClass(ce.character) != ccStart) + break; + pos -= ce.widthBytes; + } } else { - if (!onlyWordCharacters && pos < Length()) - ccStart = WordCharClass(cb.CharAt(pos)); - while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == ccStart)) - pos++; + if (!onlyWordCharacters && pos < Length()) { + const CharacterExtracted ce = CharacterAfter(pos); + ccStart = WordCharacterClass(ce.character); + } + while (pos < Length()) { + const CharacterExtracted ce = CharacterAfter(pos); + if (WordCharacterClass(ce.character) != ccStart) + break; + pos += ce.widthBytes; + } } return MovePositionOutsideChar(pos, delta, true); } @@ -1518,22 +1662,39 @@ int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) { * additional movement to transit white space. * Used by cursor movement by word commands. */ -int Document::NextWordStart(int pos, int delta) { +int Document::NextWordStart(int pos, int delta) const { if (delta < 0) { - while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == CharClassify::ccSpace)) - pos--; + while (pos > 0) { + const CharacterExtracted ce = CharacterBefore(pos); + if (WordCharacterClass(ce.character) != CharClassify::ccSpace) + break; + pos -= ce.widthBytes; + } if (pos > 0) { - CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos-1)); - while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == ccStart)) { - pos--; + CharacterExtracted ce = CharacterBefore(pos); + const CharClassify::cc ccStart = WordCharacterClass(ce.character); + while (pos > 0) { + ce = CharacterBefore(pos); + if (WordCharacterClass(ce.character) != ccStart) + break; + pos -= ce.widthBytes; } } } else { - CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos)); - while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == ccStart)) - pos++; - while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == CharClassify::ccSpace)) - pos++; + CharacterExtracted ce = CharacterAfter(pos); + const CharClassify::cc ccStart = WordCharacterClass(ce.character); + while (pos < Length()) { + ce = CharacterAfter(pos); + if (WordCharacterClass(ce.character) != ccStart) + break; + pos += ce.widthBytes; + } + while (pos < Length()) { + ce = CharacterAfter(pos); + if (WordCharacterClass(ce.character) != CharClassify::ccSpace) + break; + pos += ce.widthBytes; + } } return pos; } @@ -1545,27 +1706,41 @@ int Document::NextWordStart(int pos, int delta) { * additional movement to transit white space. * Used by cursor movement by word commands. */ -int Document::NextWordEnd(int pos, int delta) { +int Document::NextWordEnd(int pos, int delta) const { if (delta < 0) { if (pos > 0) { - CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos-1)); + CharacterExtracted ce = CharacterBefore(pos); + CharClassify::cc ccStart = WordCharacterClass(ce.character); if (ccStart != CharClassify::ccSpace) { - while (pos > 0 && WordCharClass(cb.CharAt(pos - 1)) == ccStart) { - pos--; + while (pos > 0) { + ce = CharacterBefore(pos); + if (WordCharacterClass(ce.character) != ccStart) + break; + pos -= ce.widthBytes; } } - while (pos > 0 && WordCharClass(cb.CharAt(pos - 1)) == CharClassify::ccSpace) { - pos--; + while (pos > 0) { + ce = CharacterBefore(pos); + if (WordCharacterClass(ce.character) != CharClassify::ccSpace) + break; + pos -= ce.widthBytes; } } } else { - while (pos < Length() && WordCharClass(cb.CharAt(pos)) == CharClassify::ccSpace) { - pos++; + while (pos < Length()) { + CharacterExtracted ce = CharacterAfter(pos); + if (WordCharacterClass(ce.character) != CharClassify::ccSpace) + break; + pos += ce.widthBytes; } if (pos < Length()) { - CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos)); - while (pos < Length() && WordCharClass(cb.CharAt(pos)) == ccStart) { - pos++; + CharacterExtracted ce = CharacterAfter(pos); + CharClassify::cc ccStart = WordCharacterClass(ce.character); + while (pos < Length()) { + ce = CharacterAfter(pos); + if (WordCharacterClass(ce.character) != ccStart) + break; + pos += ce.widthBytes; } } } @@ -1577,10 +1752,15 @@ int Document::NextWordEnd(int pos, int delta) { * the previous character is of a different character class. */ bool Document::IsWordStartAt(int pos) const { + if (pos >= Length()) + return false; if (pos > 0) { - CharClassify::cc ccPos = WordCharClass(CharAt(pos)); + const CharacterExtracted cePos = CharacterAfter(pos); + const CharClassify::cc ccPos = WordCharacterClass(cePos.character); + const CharacterExtracted cePrev = CharacterBefore(pos); + const CharClassify::cc ccPrev = WordCharacterClass(cePrev.character); return (ccPos == CharClassify::ccWord || ccPos == CharClassify::ccPunctuation) && - (ccPos != WordCharClass(CharAt(pos - 1))); + (ccPos != ccPrev); } return true; } @@ -1590,10 +1770,15 @@ bool Document::IsWordStartAt(int pos) const { * the next character is of a different character class. */ bool Document::IsWordEndAt(int pos) const { + if (pos <= 0) + return false; if (pos < Length()) { - CharClassify::cc ccPrev = WordCharClass(CharAt(pos-1)); + const CharacterExtracted cePos = CharacterAfter(pos); + const CharClassify::cc ccPos = WordCharacterClass(cePos.character); + const CharacterExtracted cePrev = CharacterBefore(pos); + const CharClassify::cc ccPrev = WordCharacterClass(cePrev.character); return (ccPrev == CharClassify::ccWord || ccPrev == CharClassify::ccPunctuation) && - (ccPrev != WordCharClass(CharAt(pos))); + (ccPrev != ccPos); } return true; } @@ -2075,96 +2260,137 @@ void Document::NotifyModified(DocModification mh) { } } -bool Document::IsWordPartSeparator(char ch) const { - return (WordCharClass(ch) == CharClassify::ccWord) && IsPunctuation(ch); +// Used for word part navigation. +static bool IsASCIIPunctuationCharacter(unsigned int ch) { + switch (ch) { + case '!': + case '"': + case '#': + case '$': + case '%': + case '&': + case '\'': + case '(': + case ')': + case '*': + case '+': + case ',': + case '-': + case '.': + case '/': + case ':': + case ';': + case '<': + case '=': + case '>': + case '?': + case '@': + case '[': + case '\\': + case ']': + case '^': + case '_': + case '`': + case '{': + case '|': + case '}': + case '~': + return true; + default: + return false; + } +} + +bool Document::IsWordPartSeparator(unsigned int ch) const { + return (WordCharacterClass(ch) == CharClassify::ccWord) && IsASCIIPunctuationCharacter(ch); } -int Document::WordPartLeft(int pos) { +int Document::WordPartLeft(int pos) const { if (pos > 0) { - --pos; - char startChar = cb.CharAt(pos); - if (IsWordPartSeparator(startChar)) { - while (pos > 0 && IsWordPartSeparator(cb.CharAt(pos))) { - --pos; + pos -= CharacterBefore(pos).widthBytes; + CharacterExtracted ceStart = CharacterAfter(pos); + if (IsWordPartSeparator(ceStart.character)) { + while (pos > 0 && IsWordPartSeparator(CharacterAfter(pos).character)) { + pos -= CharacterBefore(pos).widthBytes; } } if (pos > 0) { - startChar = cb.CharAt(pos); - --pos; - if (IsLowerCase(startChar)) { - while (pos > 0 && IsLowerCase(cb.CharAt(pos))) - --pos; - if (!IsUpperCase(cb.CharAt(pos)) && !IsLowerCase(cb.CharAt(pos))) - ++pos; - } else if (IsUpperCase(startChar)) { - while (pos > 0 && IsUpperCase(cb.CharAt(pos))) - --pos; - if (!IsUpperCase(cb.CharAt(pos))) - ++pos; - } else if (IsADigit(startChar)) { - while (pos > 0 && IsADigit(cb.CharAt(pos))) - --pos; - if (!IsADigit(cb.CharAt(pos))) - ++pos; - } else if (IsPunctuation(startChar)) { - while (pos > 0 && IsPunctuation(cb.CharAt(pos))) - --pos; - if (!IsPunctuation(cb.CharAt(pos))) - ++pos; - } else if (isspacechar(startChar)) { - while (pos > 0 && isspacechar(cb.CharAt(pos))) - --pos; - if (!isspacechar(cb.CharAt(pos))) - ++pos; - } else if (!IsASCII(startChar)) { - while (pos > 0 && !IsASCII(cb.CharAt(pos))) - --pos; - if (IsASCII(cb.CharAt(pos))) - ++pos; + ceStart = CharacterAfter(pos); + pos -= CharacterBefore(pos).widthBytes; + if (IsLowerCase(ceStart.character)) { + while (pos > 0 && IsLowerCase(CharacterAfter(pos).character)) + pos -= CharacterBefore(pos).widthBytes; + if (!IsUpperCase(CharacterAfter(pos).character) && !IsLowerCase(CharacterAfter(pos).character)) + pos += CharacterAfter(pos).widthBytes; + } else if (IsUpperCase(ceStart.character)) { + while (pos > 0 && IsUpperCase(CharacterAfter(pos).character)) + pos -= CharacterBefore(pos).widthBytes; + if (!IsUpperCase(CharacterAfter(pos).character)) + pos += CharacterAfter(pos).widthBytes; + } else if (IsADigit(ceStart.character)) { + while (pos > 0 && IsADigit(CharacterAfter(pos).character)) + pos -= CharacterBefore(pos).widthBytes; + if (!IsADigit(CharacterAfter(pos).character)) + pos += CharacterAfter(pos).widthBytes; + } else if (IsASCIIPunctuationCharacter(ceStart.character)) { + while (pos > 0 && IsASCIIPunctuationCharacter(CharacterAfter(pos).character)) + pos -= CharacterBefore(pos).widthBytes; + if (!IsASCIIPunctuationCharacter(CharacterAfter(pos).character)) + pos += CharacterAfter(pos).widthBytes; + } else if (isspacechar(ceStart.character)) { + while (pos > 0 && isspacechar(CharacterAfter(pos).character)) + pos -= CharacterBefore(pos).widthBytes; + if (!isspacechar(CharacterAfter(pos).character)) + pos += CharacterAfter(pos).widthBytes; + } else if (!IsASCII(ceStart.character)) { + while (pos > 0 && !IsASCII(CharacterAfter(pos).character)) + pos -= CharacterBefore(pos).widthBytes; + if (IsASCII(CharacterAfter(pos).character)) + pos += CharacterAfter(pos).widthBytes; } else { - ++pos; + pos += CharacterAfter(pos).widthBytes; } } } return pos; } -int Document::WordPartRight(int pos) { - char startChar = cb.CharAt(pos); - int length = Length(); - if (IsWordPartSeparator(startChar)) { - while (pos < length && IsWordPartSeparator(cb.CharAt(pos))) - ++pos; - startChar = cb.CharAt(pos); - } - if (!IsASCII(startChar)) { - while (pos < length && !IsASCII(cb.CharAt(pos))) - ++pos; - } else if (IsLowerCase(startChar)) { - while (pos < length && IsLowerCase(cb.CharAt(pos))) - ++pos; - } else if (IsUpperCase(startChar)) { - if (IsLowerCase(cb.CharAt(pos + 1))) { - ++pos; - while (pos < length && IsLowerCase(cb.CharAt(pos))) - ++pos; +int Document::WordPartRight(int pos) const { + CharacterExtracted ceStart = CharacterAfter(pos); + const int length = Length(); + if (IsWordPartSeparator(ceStart.character)) { + while (pos < length && IsWordPartSeparator(CharacterAfter(pos).character)) + pos += CharacterAfter(pos).widthBytes; + ceStart = CharacterAfter(pos); + } + if (!IsASCII(ceStart.character)) { + while (pos < length && !IsASCII(CharacterAfter(pos).character)) + pos += CharacterAfter(pos).widthBytes; + } else if (IsLowerCase(ceStart.character)) { + while (pos < length && IsLowerCase(CharacterAfter(pos).character)) + pos += CharacterAfter(pos).widthBytes; + } else if (IsUpperCase(ceStart.character)) { + if (IsLowerCase(CharacterAfter(pos + ceStart.widthBytes).character)) { + pos += CharacterAfter(pos).widthBytes; + while (pos < length && IsLowerCase(CharacterAfter(pos).character)) + pos += CharacterAfter(pos).widthBytes; } else { - while (pos < length && IsUpperCase(cb.CharAt(pos))) - ++pos; - } - if (IsLowerCase(cb.CharAt(pos)) && IsUpperCase(cb.CharAt(pos - 1))) - --pos; - } else if (IsADigit(startChar)) { - while (pos < length && IsADigit(cb.CharAt(pos))) - ++pos; - } else if (IsPunctuation(startChar)) { - while (pos < length && IsPunctuation(cb.CharAt(pos))) - ++pos; - } else if (isspacechar(startChar)) { - while (pos < length && isspacechar(cb.CharAt(pos))) - ++pos; + while (pos < length && IsUpperCase(CharacterAfter(pos).character)) + pos += CharacterAfter(pos).widthBytes; + } + if (IsLowerCase(CharacterAfter(pos).character) && IsUpperCase(CharacterBefore(pos).character)) + pos -= CharacterBefore(pos).widthBytes; + } else if (IsADigit(ceStart.character)) { + while (pos < length && IsADigit(CharacterAfter(pos).character)) + pos += CharacterAfter(pos).widthBytes; + } else if (IsASCIIPunctuationCharacter(ceStart.character)) { + while (pos < length && IsASCIIPunctuationCharacter(CharacterAfter(pos).character)) + pos += CharacterAfter(pos).widthBytes; + } else if (isspacechar(ceStart.character)) { + while (pos < length && isspacechar(CharacterAfter(pos).character)) + pos += CharacterAfter(pos).widthBytes; } else { - ++pos; + pos += CharacterAfter(pos).widthBytes; } return pos; } diff --git a/src/Document.h b/src/Document.h index d31465f62..c0a0bb808 100644 --- a/src/Document.h +++ b/src/Document.h @@ -238,6 +238,18 @@ private: public: + struct CharacterExtracted { + unsigned int character; + unsigned int widthBytes; + CharacterExtracted(unsigned int character_, unsigned int widthBytes_) : + character(character_), widthBytes(widthBytes_) { + } + // For DBCS characters turn 2 bytes into an int + static CharacterExtracted DBCS(unsigned char lead, unsigned char trail) { + return CharacterExtracted((lead << 8) | trail, 2); + } + }; + LexInterface *pli; int eolMode; @@ -284,6 +296,8 @@ public: int MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd=true) const; int NextPosition(int pos, int moveDir) const; bool NextCharacter(int &pos, int moveDir) const; // Returns true if pos changed + Document::CharacterExtracted CharacterAfter(int position) const; + Document::CharacterExtracted CharacterBefore(int position) const; Sci_Position SCI_METHOD GetRelativePosition(Sci_Position positionStart, Sci_Position characterOffset) const; int GetRelativePositionUTF16(int positionStart, int characterOffset) const; int SCI_METHOD GetCharacterAndWidth(Sci_Position position, Sci_Position *pWidth) const; @@ -373,19 +387,12 @@ public: void GetHighlightDelimiters(HighlightDelimiter &hDelimiter, int line, int lastLine); void Indent(bool forwards); - int ExtendWordSelect(int pos, int delta, bool onlyWordCharacters=false); - int NextWordStart(int pos, int delta); - int NextWordEnd(int pos, int delta); + int ExtendWordSelect(int pos, int delta, bool onlyWordCharacters=false) const; + int NextWordStart(int pos, int delta) const; + int NextWordEnd(int pos, int delta) const; Sci_Position SCI_METHOD Length() const { return cb.Length(); } void Allocate(int newSize) { cb.Allocate(newSize); } - struct CharacterExtracted { - unsigned int character; - unsigned int widthBytes; - CharacterExtracted(unsigned int character_, unsigned int widthBytes_) : - character(character_), widthBytes(widthBytes_) { - } - }; CharacterExtracted ExtractCharacter(int position) const; bool IsWordStartAt(int pos) const; @@ -437,10 +444,11 @@ public: bool AddWatcher(DocWatcher *watcher, void *userData); bool RemoveWatcher(DocWatcher *watcher, void *userData); - CharClassify::cc WordCharClass(unsigned char ch) const; - bool IsWordPartSeparator(char ch) const; - int WordPartLeft(int pos); - int WordPartRight(int pos); + bool IsASCIIWordByte(unsigned char ch) const; + CharClassify::cc WordCharacterClass(unsigned int ch) const; + bool IsWordPartSeparator(unsigned int ch) const; + int WordPartLeft(int pos) const; + int WordPartRight(int pos) const; int ExtendStyleRange(int pos, int delta, bool singleLine = false); bool IsWhiteLine(int line) const; int ParaUp(int pos) const; diff --git a/src/EditView.cxx b/src/EditView.cxx index 92c341d8f..e6cd8fcfe 100644 --- a/src/EditView.cxx +++ b/src/EditView.cxx @@ -25,6 +25,7 @@ #include "Scintilla.h" #include "StringCopy.h" +#include "CharacterSet.h" #include "Position.h" #include "SplitVector.h" #include "Partitioning.h" @@ -389,16 +390,16 @@ void EditView::LayoutLine(const EditModel &model, int line, Surface *surface, co (ll->chars[numCharsInLine] == chDoc); else if (vstyle.styles[ll->styles[numCharsInLine]].caseForce == Style::caseLower) allSame = allSame && - (ll->chars[numCharsInLine] == static_cast<char>(tolower(chDoc))); + (ll->chars[numCharsInLine] == MakeLowerCase(chDoc)); else if (vstyle.styles[ll->styles[numCharsInLine]].caseForce == Style::caseUpper) allSame = allSame && - (ll->chars[numCharsInLine] == static_cast<char>(toupper(chDoc))); + (ll->chars[numCharsInLine] == MakeUpperCase(chDoc)); else { // Style::caseCamel - if ((model.pdoc->WordCharClass(ll->chars[numCharsInLine]) == CharClassify::ccWord) && - ((numCharsInLine == 0) || (model.pdoc->WordCharClass(ll->chars[numCharsInLine - 1]) != CharClassify::ccWord))) { - allSame = allSame && (ll->chars[numCharsInLine] == static_cast<char>(toupper(chDoc))); + if ((model.pdoc->IsASCIIWordByte(ll->chars[numCharsInLine])) && + ((numCharsInLine == 0) || (!model.pdoc->IsASCIIWordByte(ll->chars[numCharsInLine - 1])))) { + allSame = allSame && (ll->chars[numCharsInLine] == MakeUpperCase(chDoc)); } else { - allSame = allSame && (ll->chars[numCharsInLine] == static_cast<char>(tolower(chDoc))); + allSame = allSame && (ll->chars[numCharsInLine] == MakeLowerCase(chDoc)); } } numCharsInLine++; @@ -440,15 +441,15 @@ void EditView::LayoutLine(const EditModel &model, int line, Surface *surface, co for (int charInLine = 0; charInLine<lineLength; charInLine++) { char chDoc = ll->chars[charInLine]; if (vstyle.styles[ll->styles[charInLine]].caseForce == Style::caseUpper) - ll->chars[charInLine] = static_cast<char>(toupper(chDoc)); + ll->chars[charInLine] = static_cast<char>(MakeUpperCase(chDoc)); else if (vstyle.styles[ll->styles[charInLine]].caseForce == Style::caseLower) - ll->chars[charInLine] = static_cast<char>(tolower(chDoc)); + ll->chars[charInLine] = static_cast<char>(MakeLowerCase(chDoc)); else if (vstyle.styles[ll->styles[charInLine]].caseForce == Style::caseCamel) { - if ((model.pdoc->WordCharClass(ll->chars[charInLine]) == CharClassify::ccWord) && - ((charInLine == 0) || (model.pdoc->WordCharClass(ll->chars[charInLine - 1]) != CharClassify::ccWord))) { - ll->chars[charInLine] = static_cast<char>(toupper(chDoc)); + if ((model.pdoc->IsASCIIWordByte(ll->chars[charInLine])) && + ((charInLine == 0) || (!model.pdoc->IsASCIIWordByte(ll->chars[charInLine - 1])))) { + ll->chars[charInLine] = static_cast<char>(MakeUpperCase(chDoc)); } else { - ll->chars[charInLine] = static_cast<char>(tolower(chDoc)); + ll->chars[charInLine] = static_cast<char>(MakeLowerCase(chDoc)); } } } |