aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--doc/ScintillaDoc.html13
-rw-r--r--doc/ScintillaHistory.html7
-rw-r--r--src/Document.cxx462
-rw-r--r--src/Document.h36
-rw-r--r--src/EditView.cxx25
5 files changed, 396 insertions, 147 deletions
diff --git a/doc/ScintillaDoc.html b/doc/ScintillaDoc.html
index 3b2c480d0..adf3b9907 100644
--- a/doc/ScintillaDoc.html
+++ b/doc/ScintillaDoc.html
@@ -2322,8 +2322,13 @@ struct Sci_TextToFind {
Line ends are not selected by double clicking but do act as word separators.
</p>
- <p>Words are defined in terms of bytes, not characters so there are some issues with
- UTF-8 and DCBS documents.</p>
+ <p>Words are defined in terms of characters and the sets of characters in each category can be customized to an extent.
+ The NUL character (0) is always a space as the APIs to set categories use NUL-terminated strings.
+ For single-byte encodings a category may be assigned to any character (1 to 0xFF).
+ For multi-byte encodings a category may be assigned to characters from 1 to 0x7F with fixed, non-customizable behaviour from 0x80.
+ For UTF-8, characters from 0x80 will use a category based on their Unicode general category.
+ For Asian encodings, code pages 932, 936, 949, 950, and 1361, characters from 0x80 are treated as word characters.
+ </p>
<p>Identifiers in programming languages are often sequences of words with capitalisation
(aCamelCaseIdentifier) or underscores (an_under_bar_ident) used to mark word boundaries.
@@ -2437,7 +2442,7 @@ struct Sci_TextToFind {
</table>
<p><b id="SCI_SETWORDCHARS">SCI_SETWORDCHARS(&lt;unused&gt;, const char *characters)</b><br />
- This message defines which characters (bytes) are members of the word category.
+ This message defines which characters are members of the word category.
The character categories are set to default values before processing this function.
For example, if you don't allow '_' in your set of characters
use:<br />
@@ -2449,6 +2454,8 @@ struct Sci_TextToFind {
If the characters parameter is 0 then the length that should be allocated
to store the entire set is returned.</p>
+ <p>For multi-byte encodings, this API will not return meaningful values for 0x80 and above.</p>
+
<p><b id="SCI_SETWHITESPACECHARS">SCI_SETWHITESPACECHARS(&lt;unused&gt;, const char *characters)</b><br />
<b id="SCI_GETWHITESPACECHARS">SCI_GETWHITESPACECHARS(&lt;unused&gt;, char *characters) &rarr; int</b><br />
Similar to <code>SCI_SETWORDCHARS</code>, this message allows the user to define which chars Scintilla considers
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html
index fb004776b..a134035dc 100644
--- a/doc/ScintillaHistory.html
+++ b/doc/ScintillaHistory.html
@@ -523,6 +523,13 @@
Released 4 September 2016.
</li>
<li>
+ Word selection, navigation, and manipulation are now performed on characters instead of bytes
+ leading to more natural behaviour for multi-byte encodings like UTF-8.
+ For UTF-8 characters 0x80 and above, classification into word, punctuation, space, or line-end
+ is based on the Unicode general category of the character and is not customizable.
+ <a href="http://sourceforge.net/p/scintilla/bugs/1832/">Bug #1832</a>.
+ </li>
+ <li>
Two enums changed in Scintilla.iface which may lead to changed bindings.
There were 2 FontQuality enums and the first is now PhasesDraw.
The prefix for FoldAction was SC_FOLDACTION and is now SC_FOLDACTION_
diff --git a/src/Document.cxx b/src/Document.cxx
index f10e40aad..58f663376 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -26,6 +26,7 @@
#include "Scintilla.h"
#include "CharacterSet.h"
+#include "CharacterCategory.h"
#include "Position.h"
#include "SplitVector.h"
#include "Partitioning.h"
@@ -44,10 +45,6 @@
using namespace Scintilla;
#endif
-static inline bool IsPunctuation(char ch) {
- return IsASCII(ch) && ispunct(ch);
-}
-
void LexInterface::Colourise(int start, int end) {
if (pdoc && instance && !performingStyle) {
// Protect against reentrance, which may occur, for example, when
@@ -771,6 +768,77 @@ bool Document::NextCharacter(int &pos, int moveDir) const {
}
}
+Document::CharacterExtracted Document::CharacterAfter(int position) const { // Extract the character that starts at position together with its width in bytes
+ if (position >= Length()) {
+ return CharacterExtracted(unicodeReplacementChar, 0); // At or past the end: nothing to read, so the width is 0
+ }
+ const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(position));
+ if (!dbcsCodePage || UTF8IsAscii(leadByte)) {
+ // Common case: single-byte encoding or ASCII character
+ return CharacterExtracted(leadByte, 1);
+ }
+ if (SC_CP_UTF8 == dbcsCodePage) {
+ const int widthCharBytes = UTF8BytesOfLead[leadByte]; // Byte count looked up from the lead byte
+ unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 };
+ for (int b = 1; b<widthCharBytes; b++) // Gather the bytes that follow the lead byte
+ charBytes[b] = static_cast<unsigned char>(cb.CharAt(position + b));
+ int utf8status = UTF8Classify(charBytes, widthCharBytes);
+ if (utf8status & UTF8MaskInvalid) {
+ // Treat as invalid and use up just one byte
+ return CharacterExtracted(unicodeReplacementChar, 1);
+ } else {
+ return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth); // Width is taken from the classification result
+ }
+ } else { // Any other multi-byte code page is handled as DBCS
+ if (IsDBCSLeadByte(leadByte) && ((position + 1) < Length())) { // A two-byte character needs its trail byte to be inside the document
+ return CharacterExtracted::DBCS(leadByte, static_cast<unsigned char>(cb.CharAt(position + 1)));
+ } else {
+ return CharacterExtracted(leadByte, 1); // Not a lead byte, or no byte follows it: return the single byte
+ }
+ }
+}
+
+Document::CharacterExtracted Document::CharacterBefore(int position) const { // Extract the character that ends just before position together with its width in bytes
+ if (position <= 0) {
+ return CharacterExtracted(unicodeReplacementChar, 0); // At or before the start: nothing to read, so the width is 0
+ }
+ const unsigned char previousByte = static_cast<unsigned char>(cb.CharAt(position - 1));
+ if (0 == dbcsCodePage) { // Single-byte encoding: the previous byte is the whole character
+ return CharacterExtracted(previousByte, 1);
+ }
+ if (SC_CP_UTF8 == dbcsCodePage) {
+ if (UTF8IsAscii(previousByte)) {
+ return CharacterExtracted(previousByte, 1);
+ }
+ position--; // position now refers to previousByte
+ // If previousByte is not a trail byte then it's invalid
+ if (UTF8IsTrailByte(previousByte)) {
+ // If previousByte is a trail byte in a valid UTF-8 character then find start of character
+ int startUTF = position;
+ int endUTF = position;
+ if (InGoodUTF8(position, startUTF, endUTF)) {
+ const int widthCharBytes = endUTF - startUTF;
+ unsigned char charBytes[UTF8MaxBytes] = { 0, 0, 0, 0 };
+ for (int b = 0; b<widthCharBytes; b++) // Copy the bytes of the character, beginning at startUTF
+ charBytes[b] = static_cast<unsigned char>(cb.CharAt(startUTF + b));
+ int utf8status = UTF8Classify(charBytes, widthCharBytes);
+ if (utf8status & UTF8MaskInvalid) {
+ // Treat as invalid and use up just one byte
+ return CharacterExtracted(unicodeReplacementChar, 1);
+ } else {
+ return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth);
+ }
+ }
+ // Else invalid UTF-8 so fall through and report the isolated trail byte as invalid
+ }
+ return CharacterExtracted(unicodeReplacementChar, 1); // Invalid byte: replacement character, one byte wide
+ } else {
+ // Moving backwards in DBCS is complex so use NextPosition
+ const int posStartCharacter = NextPosition(position, -1);
+ return CharacterAfter(posStartCharacter); // Read forwards from the start of the preceding character
+ }
+}
+
// Return -1 on out-of-bounds
Sci_Position SCI_METHOD Document::GetRelativePosition(Sci_Position positionStart, Sci_Position characterOffset) const {
int pos = positionStart;
@@ -1485,28 +1553,104 @@ int Document::ParaDown(int pos) const {
return LineEnd(line-1);
}
-CharClassify::cc Document::WordCharClass(unsigned char ch) const {
- if ((SC_CP_UTF8 == dbcsCodePage) && (!UTF8IsAscii(ch)))
- return CharClassify::ccWord;
- return charClass.GetClass(ch);
+bool Document::IsASCIIWordByte(unsigned char ch) const { // True only for an ASCII byte that charClass places in the word category
+ if (IsASCII(ch)) {
+ return charClass.GetClass(ch) == CharClassify::ccWord;
+ } else {
+ return false; // A byte that is not ASCII is never reported as a word byte here
+ }
+}
+
+CharClassify::cc Document::WordCharacterClass(unsigned int ch) const { // Classify a whole character value (not just a byte) for word operations
+ if (dbcsCodePage && (!UTF8IsAscii(ch))) { // Non-ASCII character in a multi-byte encoding
+ if (SC_CP_UTF8 == dbcsCodePage) {
+ // Use hard coded Unicode class
+ const CharacterCategory cc = CategoriseCharacter(ch);
+ switch (cc) {
+
+ // Separator, Line/Paragraph
+ case ccZl:
+ case ccZp:
+ return CharClassify::ccNewLine;
+
+ // Separator, Space
+ case ccZs:
+ // Other
+ case ccCc:
+ case ccCf:
+ case ccCs:
+ case ccCo:
+ case ccCn:
+ return CharClassify::ccSpace;
+
+ // Letter
+ case ccLu:
+ case ccLl:
+ case ccLt:
+ case ccLm:
+ case ccLo:
+ // Number
+ case ccNd:
+ case ccNl:
+ case ccNo:
+ // Mark - includes combining diacritics
+ case ccMn:
+ case ccMc:
+ case ccMe:
+ return CharClassify::ccWord;
+
+ // Punctuation
+ case ccPc:
+ case ccPd:
+ case ccPs:
+ case ccPe:
+ case ccPi:
+ case ccPf:
+ case ccPo:
+ // Symbol
+ case ccSm:
+ case ccSc:
+ case ccSk:
+ case ccSo:
+ return CharClassify::ccPunctuation;
+
+ }
+ } else {
+ // Asian DBCS
+ return CharClassify::ccWord;
+ }
+ }
+ return charClass.GetClass(static_cast<unsigned char>(ch)); // ASCII or single-byte encoding: look up charClass; also reached if no case above matched
}
/**
* Used by commmands that want to select whole words.
* Finds the start of word at pos when delta < 0 or the end of the word when delta >= 0.
*/
-int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) {
+int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) const {
CharClassify::cc ccStart = CharClassify::ccWord;
if (delta < 0) {
- if (!onlyWordCharacters)
- ccStart = WordCharClass(cb.CharAt(pos-1));
- while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == ccStart))
- pos--;
+ if (!onlyWordCharacters) {
+ const CharacterExtracted ce = CharacterBefore(pos);
+ ccStart = WordCharacterClass(ce.character);
+ }
+ while (pos > 0) {
+ const CharacterExtracted ce = CharacterBefore(pos);
+ if (WordCharacterClass(ce.character) != ccStart)
+ break;
+ pos -= ce.widthBytes;
+ }
} else {
- if (!onlyWordCharacters && pos < Length())
- ccStart = WordCharClass(cb.CharAt(pos));
- while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == ccStart))
- pos++;
+ if (!onlyWordCharacters && pos < Length()) {
+ const CharacterExtracted ce = CharacterAfter(pos);
+ ccStart = WordCharacterClass(ce.character);
+ }
+ while (pos < Length()) {
+ const CharacterExtracted ce = CharacterAfter(pos);
+ if (WordCharacterClass(ce.character) != ccStart)
+ break;
+ pos += ce.widthBytes;
+ }
}
return MovePositionOutsideChar(pos, delta, true);
}
@@ -1518,22 +1662,39 @@ int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) {
* additional movement to transit white space.
* Used by cursor movement by word commands.
*/
-int Document::NextWordStart(int pos, int delta) {
+int Document::NextWordStart(int pos, int delta) const {
if (delta < 0) {
- while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == CharClassify::ccSpace))
- pos--;
+ while (pos > 0) {
+ const CharacterExtracted ce = CharacterBefore(pos);
+ if (WordCharacterClass(ce.character) != CharClassify::ccSpace)
+ break;
+ pos -= ce.widthBytes;
+ }
if (pos > 0) {
- CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos-1));
- while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == ccStart)) {
- pos--;
+ CharacterExtracted ce = CharacterBefore(pos);
+ const CharClassify::cc ccStart = WordCharacterClass(ce.character);
+ while (pos > 0) {
+ ce = CharacterBefore(pos);
+ if (WordCharacterClass(ce.character) != ccStart)
+ break;
+ pos -= ce.widthBytes;
}
}
} else {
- CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos));
- while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == ccStart))
- pos++;
- while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == CharClassify::ccSpace))
- pos++;
+ CharacterExtracted ce = CharacterAfter(pos);
+ const CharClassify::cc ccStart = WordCharacterClass(ce.character);
+ while (pos < Length()) {
+ ce = CharacterAfter(pos);
+ if (WordCharacterClass(ce.character) != ccStart)
+ break;
+ pos += ce.widthBytes;
+ }
+ while (pos < Length()) {
+ ce = CharacterAfter(pos);
+ if (WordCharacterClass(ce.character) != CharClassify::ccSpace)
+ break;
+ pos += ce.widthBytes;
+ }
}
return pos;
}
@@ -1545,27 +1706,41 @@ int Document::NextWordStart(int pos, int delta) {
* additional movement to transit white space.
* Used by cursor movement by word commands.
*/
-int Document::NextWordEnd(int pos, int delta) {
+int Document::NextWordEnd(int pos, int delta) const {
if (delta < 0) {
if (pos > 0) {
- CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos-1));
+ CharacterExtracted ce = CharacterBefore(pos);
+ CharClassify::cc ccStart = WordCharacterClass(ce.character);
if (ccStart != CharClassify::ccSpace) {
- while (pos > 0 && WordCharClass(cb.CharAt(pos - 1)) == ccStart) {
- pos--;
+ while (pos > 0) {
+ ce = CharacterBefore(pos);
+ if (WordCharacterClass(ce.character) != ccStart)
+ break;
+ pos -= ce.widthBytes;
}
}
- while (pos > 0 && WordCharClass(cb.CharAt(pos - 1)) == CharClassify::ccSpace) {
- pos--;
+ while (pos > 0) {
+ ce = CharacterBefore(pos);
+ if (WordCharacterClass(ce.character) != CharClassify::ccSpace)
+ break;
+ pos -= ce.widthBytes;
}
}
} else {
- while (pos < Length() && WordCharClass(cb.CharAt(pos)) == CharClassify::ccSpace) {
- pos++;
+ while (pos < Length()) {
+ CharacterExtracted ce = CharacterAfter(pos);
+ if (WordCharacterClass(ce.character) != CharClassify::ccSpace)
+ break;
+ pos += ce.widthBytes;
}
if (pos < Length()) {
- CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos));
- while (pos < Length() && WordCharClass(cb.CharAt(pos)) == ccStart) {
- pos++;
+ CharacterExtracted ce = CharacterAfter(pos);
+ CharClassify::cc ccStart = WordCharacterClass(ce.character);
+ while (pos < Length()) {
+ ce = CharacterAfter(pos);
+ if (WordCharacterClass(ce.character) != ccStart)
+ break;
+ pos += ce.widthBytes;
}
}
}
@@ -1577,10 +1752,15 @@ int Document::NextWordEnd(int pos, int delta) {
* the previous character is of a different character class.
*/
bool Document::IsWordStartAt(int pos) const {
+ if (pos >= Length())
+ return false;
if (pos > 0) {
- CharClassify::cc ccPos = WordCharClass(CharAt(pos));
+ const CharacterExtracted cePos = CharacterAfter(pos);
+ const CharClassify::cc ccPos = WordCharacterClass(cePos.character);
+ const CharacterExtracted cePrev = CharacterBefore(pos);
+ const CharClassify::cc ccPrev = WordCharacterClass(cePrev.character);
return (ccPos == CharClassify::ccWord || ccPos == CharClassify::ccPunctuation) &&
- (ccPos != WordCharClass(CharAt(pos - 1)));
+ (ccPos != ccPrev);
}
return true;
}
@@ -1590,10 +1770,15 @@ bool Document::IsWordStartAt(int pos) const {
* the next character is of a different character class.
*/
bool Document::IsWordEndAt(int pos) const {
+ if (pos <= 0)
+ return false;
if (pos < Length()) {
- CharClassify::cc ccPrev = WordCharClass(CharAt(pos-1));
+ const CharacterExtracted cePos = CharacterAfter(pos);
+ const CharClassify::cc ccPos = WordCharacterClass(cePos.character);
+ const CharacterExtracted cePrev = CharacterBefore(pos);
+ const CharClassify::cc ccPrev = WordCharacterClass(cePrev.character);
return (ccPrev == CharClassify::ccWord || ccPrev == CharClassify::ccPunctuation) &&
- (ccPrev != WordCharClass(CharAt(pos)));
+ (ccPrev != ccPos);
}
return true;
}
@@ -2075,96 +2260,137 @@ void Document::NotifyModified(DocModification mh) {
}
}
-bool Document::IsWordPartSeparator(char ch) const {
- return (WordCharClass(ch) == CharClassify::ccWord) && IsPunctuation(ch);
+// Used for word part navigation: true for the 32 ASCII punctuation characters listed below, false otherwise.
+static bool IsASCIIPunctuationCharacter(unsigned int ch) {
+ switch (ch) {
+ case '!':
+ case '"':
+ case '#':
+ case '$':
+ case '%':
+ case '&':
+ case '\'':
+ case '(':
+ case ')':
+ case '*':
+ case '+':
+ case ',':
+ case '-':
+ case '.':
+ case '/':
+ case ':':
+ case ';':
+ case '<':
+ case '=':
+ case '>':
+ case '?':
+ case '@':
+ case '[':
+ case '\\':
+ case ']':
+ case '^':
+ case '_':
+ case '`':
+ case '{':
+ case '|':
+ case '}':
+ case '~':
+ return true;
+ default: // Letters, digits, control characters, space and every value above '~'
+ return false;
+ }
+}
+
+bool Document::IsWordPartSeparator(unsigned int ch) const { // True for ASCII punctuation that is currently classified as a word character
+ return (WordCharacterClass(ch) == CharClassify::ccWord) && IsASCIIPunctuationCharacter(ch);
}
-int Document::WordPartLeft(int pos) {
+int Document::WordPartLeft(int pos) const {
if (pos > 0) {
- --pos;
- char startChar = cb.CharAt(pos);
- if (IsWordPartSeparator(startChar)) {
- while (pos > 0 && IsWordPartSeparator(cb.CharAt(pos))) {
- --pos;
+ pos -= CharacterBefore(pos).widthBytes;
+ CharacterExtracted ceStart = CharacterAfter(pos);
+ if (IsWordPartSeparator(ceStart.character)) {
+ while (pos > 0 && IsWordPartSeparator(CharacterAfter(pos).character)) {
+ pos -= CharacterBefore(pos).widthBytes;
}
}
if (pos > 0) {
- startChar = cb.CharAt(pos);
- --pos;
- if (IsLowerCase(startChar)) {
- while (pos > 0 && IsLowerCase(cb.CharAt(pos)))
- --pos;
- if (!IsUpperCase(cb.CharAt(pos)) && !IsLowerCase(cb.CharAt(pos)))
- ++pos;
- } else if (IsUpperCase(startChar)) {
- while (pos > 0 && IsUpperCase(cb.CharAt(pos)))
- --pos;
- if (!IsUpperCase(cb.CharAt(pos)))
- ++pos;
- } else if (IsADigit(startChar)) {
- while (pos > 0 && IsADigit(cb.CharAt(pos)))
- --pos;
- if (!IsADigit(cb.CharAt(pos)))
- ++pos;
- } else if (IsPunctuation(startChar)) {
- while (pos > 0 && IsPunctuation(cb.CharAt(pos)))
- --pos;
- if (!IsPunctuation(cb.CharAt(pos)))
- ++pos;
- } else if (isspacechar(startChar)) {
- while (pos > 0 && isspacechar(cb.CharAt(pos)))
- --pos;
- if (!isspacechar(cb.CharAt(pos)))
- ++pos;
- } else if (!IsASCII(startChar)) {
- while (pos > 0 && !IsASCII(cb.CharAt(pos)))
- --pos;
- if (IsASCII(cb.CharAt(pos)))
- ++pos;
+ ceStart = CharacterAfter(pos);
+ pos -= CharacterBefore(pos).widthBytes;
+ if (IsLowerCase(ceStart.character)) {
+ while (pos > 0 && IsLowerCase(CharacterAfter(pos).character))
+ pos -= CharacterBefore(pos).widthBytes;
+ if (!IsUpperCase(CharacterAfter(pos).character) && !IsLowerCase(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (IsUpperCase(ceStart.character)) {
+ while (pos > 0 && IsUpperCase(CharacterAfter(pos).character))
+ pos -= CharacterBefore(pos).widthBytes;
+ if (!IsUpperCase(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (IsADigit(ceStart.character)) {
+ while (pos > 0 && IsADigit(CharacterAfter(pos).character))
+ pos -= CharacterBefore(pos).widthBytes;
+ if (!IsADigit(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (IsASCIIPunctuationCharacter(ceStart.character)) {
+ while (pos > 0 && IsASCIIPunctuationCharacter(CharacterAfter(pos).character))
+ pos -= CharacterBefore(pos).widthBytes;
+ if (!IsASCIIPunctuationCharacter(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (isspacechar(ceStart.character)) {
+ while (pos > 0 && isspacechar(CharacterAfter(pos).character))
+ pos -= CharacterBefore(pos).widthBytes;
+ if (!isspacechar(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (!IsASCII(ceStart.character)) {
+ while (pos > 0 && !IsASCII(CharacterAfter(pos).character))
+ pos -= CharacterBefore(pos).widthBytes;
+ if (IsASCII(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
} else {
- ++pos;
+ pos += CharacterAfter(pos).widthBytes;
}
}
}
return pos;
}
-int Document::WordPartRight(int pos) {
- char startChar = cb.CharAt(pos);
- int length = Length();
- if (IsWordPartSeparator(startChar)) {
- while (pos < length && IsWordPartSeparator(cb.CharAt(pos)))
- ++pos;
- startChar = cb.CharAt(pos);
- }
- if (!IsASCII(startChar)) {
- while (pos < length && !IsASCII(cb.CharAt(pos)))
- ++pos;
- } else if (IsLowerCase(startChar)) {
- while (pos < length && IsLowerCase(cb.CharAt(pos)))
- ++pos;
- } else if (IsUpperCase(startChar)) {
- if (IsLowerCase(cb.CharAt(pos + 1))) {
- ++pos;
- while (pos < length && IsLowerCase(cb.CharAt(pos)))
- ++pos;
+int Document::WordPartRight(int pos) const {
+ CharacterExtracted ceStart = CharacterAfter(pos);
+ const int length = Length();
+ if (IsWordPartSeparator(ceStart.character)) {
+ while (pos < length && IsWordPartSeparator(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ ceStart = CharacterAfter(pos);
+ }
+ if (!IsASCII(ceStart.character)) {
+ while (pos < length && !IsASCII(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (IsLowerCase(ceStart.character)) {
+ while (pos < length && IsLowerCase(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (IsUpperCase(ceStart.character)) {
+ if (IsLowerCase(CharacterAfter(pos + ceStart.widthBytes).character)) {
+ pos += CharacterAfter(pos).widthBytes;
+ while (pos < length && IsLowerCase(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
} else {
- while (pos < length && IsUpperCase(cb.CharAt(pos)))
- ++pos;
- }
- if (IsLowerCase(cb.CharAt(pos)) && IsUpperCase(cb.CharAt(pos - 1)))
- --pos;
- } else if (IsADigit(startChar)) {
- while (pos < length && IsADigit(cb.CharAt(pos)))
- ++pos;
- } else if (IsPunctuation(startChar)) {
- while (pos < length && IsPunctuation(cb.CharAt(pos)))
- ++pos;
- } else if (isspacechar(startChar)) {
- while (pos < length && isspacechar(cb.CharAt(pos)))
- ++pos;
+ while (pos < length && IsUpperCase(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ }
+ if (IsLowerCase(CharacterAfter(pos).character) && IsUpperCase(CharacterBefore(pos).character))
+ pos -= CharacterBefore(pos).widthBytes;
+ } else if (IsADigit(ceStart.character)) {
+ while (pos < length && IsADigit(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (IsASCIIPunctuationCharacter(ceStart.character)) {
+ while (pos < length && IsASCIIPunctuationCharacter(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (isspacechar(ceStart.character)) {
+ while (pos < length && isspacechar(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
} else {
- ++pos;
+ pos += CharacterAfter(pos).widthBytes;
}
return pos;
}
diff --git a/src/Document.h b/src/Document.h
index d31465f62..c0a0bb808 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -238,6 +238,18 @@ private:
public:
+ struct CharacterExtracted { // A character read from the document and the number of bytes it occupied
+ unsigned int character; // Single byte value, decoded Unicode value, or packed DBCS pair
+ unsigned int widthBytes; // Bytes the character occupies in the document
+ CharacterExtracted(unsigned int character_, unsigned int widthBytes_) :
+ character(character_), widthBytes(widthBytes_) {
+ }
+ // For DBCS characters turn 2 bytes into an int
+ static CharacterExtracted DBCS(unsigned char lead, unsigned char trail) {
+ return CharacterExtracted((lead << 8) | trail, 2); // Lead byte in bits 8-15, trail byte in bits 0-7
+ }
+ };
+
LexInterface *pli;
int eolMode;
@@ -284,6 +296,8 @@ public:
int MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd=true) const;
int NextPosition(int pos, int moveDir) const;
bool NextCharacter(int &pos, int moveDir) const; // Returns true if pos changed
+ Document::CharacterExtracted CharacterAfter(int position) const;
+ Document::CharacterExtracted CharacterBefore(int position) const;
Sci_Position SCI_METHOD GetRelativePosition(Sci_Position positionStart, Sci_Position characterOffset) const;
int GetRelativePositionUTF16(int positionStart, int characterOffset) const;
int SCI_METHOD GetCharacterAndWidth(Sci_Position position, Sci_Position *pWidth) const;
@@ -373,19 +387,12 @@ public:
void GetHighlightDelimiters(HighlightDelimiter &hDelimiter, int line, int lastLine);
void Indent(bool forwards);
- int ExtendWordSelect(int pos, int delta, bool onlyWordCharacters=false);
- int NextWordStart(int pos, int delta);
- int NextWordEnd(int pos, int delta);
+ int ExtendWordSelect(int pos, int delta, bool onlyWordCharacters=false) const;
+ int NextWordStart(int pos, int delta) const;
+ int NextWordEnd(int pos, int delta) const;
Sci_Position SCI_METHOD Length() const { return cb.Length(); }
void Allocate(int newSize) { cb.Allocate(newSize); }
- struct CharacterExtracted {
- unsigned int character;
- unsigned int widthBytes;
- CharacterExtracted(unsigned int character_, unsigned int widthBytes_) :
- character(character_), widthBytes(widthBytes_) {
- }
- };
CharacterExtracted ExtractCharacter(int position) const;
bool IsWordStartAt(int pos) const;
@@ -437,10 +444,11 @@ public:
bool AddWatcher(DocWatcher *watcher, void *userData);
bool RemoveWatcher(DocWatcher *watcher, void *userData);
- CharClassify::cc WordCharClass(unsigned char ch) const;
- bool IsWordPartSeparator(char ch) const;
- int WordPartLeft(int pos);
- int WordPartRight(int pos);
+ bool IsASCIIWordByte(unsigned char ch) const;
+ CharClassify::cc WordCharacterClass(unsigned int ch) const;
+ bool IsWordPartSeparator(unsigned int ch) const;
+ int WordPartLeft(int pos) const;
+ int WordPartRight(int pos) const;
int ExtendStyleRange(int pos, int delta, bool singleLine = false);
bool IsWhiteLine(int line) const;
int ParaUp(int pos) const;
diff --git a/src/EditView.cxx b/src/EditView.cxx
index 92c341d8f..e6cd8fcfe 100644
--- a/src/EditView.cxx
+++ b/src/EditView.cxx
@@ -25,6 +25,7 @@
#include "Scintilla.h"
#include "StringCopy.h"
+#include "CharacterSet.h"
#include "Position.h"
#include "SplitVector.h"
#include "Partitioning.h"
@@ -389,16 +390,16 @@ void EditView::LayoutLine(const EditModel &model, int line, Surface *surface, co
(ll->chars[numCharsInLine] == chDoc);
else if (vstyle.styles[ll->styles[numCharsInLine]].caseForce == Style::caseLower)
allSame = allSame &&
- (ll->chars[numCharsInLine] == static_cast<char>(tolower(chDoc)));
+ (ll->chars[numCharsInLine] == MakeLowerCase(chDoc));
else if (vstyle.styles[ll->styles[numCharsInLine]].caseForce == Style::caseUpper)
allSame = allSame &&
- (ll->chars[numCharsInLine] == static_cast<char>(toupper(chDoc)));
+ (ll->chars[numCharsInLine] == MakeUpperCase(chDoc));
else { // Style::caseCamel
- if ((model.pdoc->WordCharClass(ll->chars[numCharsInLine]) == CharClassify::ccWord) &&
- ((numCharsInLine == 0) || (model.pdoc->WordCharClass(ll->chars[numCharsInLine - 1]) != CharClassify::ccWord))) {
- allSame = allSame && (ll->chars[numCharsInLine] == static_cast<char>(toupper(chDoc)));
+ if ((model.pdoc->IsASCIIWordByte(ll->chars[numCharsInLine])) &&
+ ((numCharsInLine == 0) || (!model.pdoc->IsASCIIWordByte(ll->chars[numCharsInLine - 1])))) {
+ allSame = allSame && (ll->chars[numCharsInLine] == MakeUpperCase(chDoc));
} else {
- allSame = allSame && (ll->chars[numCharsInLine] == static_cast<char>(tolower(chDoc)));
+ allSame = allSame && (ll->chars[numCharsInLine] == MakeLowerCase(chDoc));
}
}
numCharsInLine++;
@@ -440,15 +441,15 @@ void EditView::LayoutLine(const EditModel &model, int line, Surface *surface, co
for (int charInLine = 0; charInLine<lineLength; charInLine++) {
char chDoc = ll->chars[charInLine];
if (vstyle.styles[ll->styles[charInLine]].caseForce == Style::caseUpper)
- ll->chars[charInLine] = static_cast<char>(toupper(chDoc));
+ ll->chars[charInLine] = static_cast<char>(MakeUpperCase(chDoc));
else if (vstyle.styles[ll->styles[charInLine]].caseForce == Style::caseLower)
- ll->chars[charInLine] = static_cast<char>(tolower(chDoc));
+ ll->chars[charInLine] = static_cast<char>(MakeLowerCase(chDoc));
else if (vstyle.styles[ll->styles[charInLine]].caseForce == Style::caseCamel) {
- if ((model.pdoc->WordCharClass(ll->chars[charInLine]) == CharClassify::ccWord) &&
- ((charInLine == 0) || (model.pdoc->WordCharClass(ll->chars[charInLine - 1]) != CharClassify::ccWord))) {
- ll->chars[charInLine] = static_cast<char>(toupper(chDoc));
+ if ((model.pdoc->IsASCIIWordByte(ll->chars[charInLine])) &&
+ ((charInLine == 0) || (!model.pdoc->IsASCIIWordByte(ll->chars[charInLine - 1])))) {
+ ll->chars[charInLine] = static_cast<char>(MakeUpperCase(chDoc));
} else {
- ll->chars[charInLine] = static_cast<char>(tolower(chDoc));
+ ll->chars[charInLine] = static_cast<char>(MakeLowerCase(chDoc));
}
}
}