aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/Document.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'src/Document.cxx')
-rw-r--r--src/Document.cxx462
1 files changed, 344 insertions, 118 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index f10e40aad..58f663376 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -26,6 +26,7 @@
#include "Scintilla.h"
#include "CharacterSet.h"
+#include "CharacterCategory.h"
#include "Position.h"
#include "SplitVector.h"
#include "Partitioning.h"
@@ -44,10 +45,6 @@
using namespace Scintilla;
#endif
-static inline bool IsPunctuation(char ch) {
- return IsASCII(ch) && ispunct(ch);
-}
-
void LexInterface::Colourise(int start, int end) {
if (pdoc && instance && !performingStyle) {
// Protect against reentrance, which may occur, for example, when
@@ -771,6 +768,77 @@ bool Document::NextCharacter(int &pos, int moveDir) const {
}
}
+Document::CharacterExtracted Document::CharacterAfter(int position) const { // Extract the character that starts at byte offset position
+ if (position >= Length()) { // Nothing at or past the end of the document
+ return CharacterExtracted(unicodeReplacementChar, 0);
+ }
+ const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(position));
+ if (!dbcsCodePage || UTF8IsAscii(leadByte)) {
+ // Single-byte encoding (dbcsCodePage is 0) or an ASCII byte: the byte is the whole character
+ return CharacterExtracted(leadByte, 1);
+ }
+ if (SC_CP_UTF8 == dbcsCodePage) {
+ const int widthCharBytes = UTF8BytesOfLead[leadByte]; // Sequence length looked up from the lead byte
+ unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 };
+ for (int b = 1; b<widthCharBytes; b++) // Copy the bytes that follow the lead byte
+ charBytes[b] = static_cast<unsigned char>(cb.CharAt(position + b));
+ int utf8status = UTF8Classify(charBytes, widthCharBytes);
+ if (utf8status & UTF8MaskInvalid) {
+ // Invalid sequence: report the replacement character and consume only the lead byte
+ return CharacterExtracted(unicodeReplacementChar, 1);
+ } else {
+ return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth);
+ }
+ } else {
+ if (IsDBCSLeadByte(leadByte) && ((position + 1) < Length())) { // Pair with the next byte only when that byte is inside the document
+ return CharacterExtracted::DBCS(leadByte, static_cast<unsigned char>(cb.CharAt(position + 1)));
+ } else {
+ return CharacterExtracted(leadByte, 1);
+ }
+ }
+}
+
+Document::CharacterExtracted Document::CharacterBefore(int position) const { // Extract the character that ends immediately before byte offset position
+ if (position <= 0) { // Nothing precedes the start of the document
+ return CharacterExtracted(unicodeReplacementChar, 0);
+ }
+ const unsigned char previousByte = static_cast<unsigned char>(cb.CharAt(position - 1));
+ if (0 == dbcsCodePage) { // Single-byte encoding: the previous byte is the whole character
+ return CharacterExtracted(previousByte, 1);
+ }
+ if (SC_CP_UTF8 == dbcsCodePage) {
+ if (UTF8IsAscii(previousByte)) {
+ return CharacterExtracted(previousByte, 1);
+ }
+ position--; // position now indexes previousByte
+ // If previousByte is not a trail byte then it's invalid
+ if (UTF8IsTrailByte(previousByte)) {
+ // If previousByte is a trail byte in a valid UTF-8 character then find start of character
+ int startUTF = position;
+ int endUTF = position;
+ if (InGoodUTF8(position, startUTF, endUTF)) {
+ const int widthCharBytes = endUTF - startUTF;
+ unsigned char charBytes[UTF8MaxBytes] = { 0, 0, 0, 0 };
+ for (int b = 0; b<widthCharBytes; b++) // Copy the whole sequence starting at startUTF
+ charBytes[b] = static_cast<unsigned char>(cb.CharAt(startUTF + b));
+ int utf8status = UTF8Classify(charBytes, widthCharBytes);
+ if (utf8status & UTF8MaskInvalid) {
+ // Treat as invalid and use up just one byte
+ return CharacterExtracted(unicodeReplacementChar, 1);
+ } else {
+ return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth);
+ }
+ }
+ // Else invalid UTF-8 so fall through and report the isolated trail byte as a one-byte replacement character
+ }
+ return CharacterExtracted(unicodeReplacementChar, 1);
+ } else {
+ // Moving backwards in DBCS is complex so use NextPosition
+ const int posStartCharacter = NextPosition(position, -1);
+ return CharacterAfter(posStartCharacter);
+ }
+}
+
// Return -1 on out-of-bounds
Sci_Position SCI_METHOD Document::GetRelativePosition(Sci_Position positionStart, Sci_Position characterOffset) const {
int pos = positionStart;
@@ -1485,28 +1553,104 @@ int Document::ParaDown(int pos) const {
return LineEnd(line-1);
}
-CharClassify::cc Document::WordCharClass(unsigned char ch) const {
- if ((SC_CP_UTF8 == dbcsCodePage) && (!UTF8IsAscii(ch)))
- return CharClassify::ccWord;
- return charClass.GetClass(ch);
+bool Document::IsASCIIWordByte(unsigned char ch) const { // True only for an ASCII byte that charClass classifies as ccWord
+ if (IsASCII(ch)) {
+ return charClass.GetClass(ch) == CharClassify::ccWord;
+ } else {
+ return false; // Bytes outside ASCII are never reported as word bytes here
+ }
+}
+
+CharClassify::cc Document::WordCharacterClass(unsigned int ch) const { // Map a character value to a CharClassify class for word navigation
+ if (dbcsCodePage && (!UTF8IsAscii(ch))) { // Non-ASCII value in a multi-byte code page
+ if (SC_CP_UTF8 == dbcsCodePage) {
+ // Use hard coded Unicode class
+ const CharacterCategory cc = CategoriseCharacter(ch);
+ switch (cc) {
+
+ // Separator, Line/Paragraph
+ case ccZl:
+ case ccZp:
+ return CharClassify::ccNewLine;
+
+ // Separator, Space
+ case ccZs:
+ // Other
+ case ccCc:
+ case ccCf:
+ case ccCs:
+ case ccCo:
+ case ccCn:
+ return CharClassify::ccSpace;
+
+ // Letter
+ case ccLu:
+ case ccLl:
+ case ccLt:
+ case ccLm:
+ case ccLo:
+ // Number
+ case ccNd:
+ case ccNl:
+ case ccNo:
+ // Mark - includes combining diacritics
+ case ccMn:
+ case ccMc:
+ case ccMe:
+ return CharClassify::ccWord;
+
+ // Punctuation
+ case ccPc:
+ case ccPd:
+ case ccPs:
+ case ccPe:
+ case ccPi:
+ case ccPf:
+ case ccPo:
+ // Symbol
+ case ccSm:
+ case ccSc:
+ case ccSk:
+ case ccSo:
+ return CharClassify::ccPunctuation;
+
+ }
+ } else {
+ // Asian DBCS
+ return CharClassify::ccWord;
+ }
+ }
+ return charClass.GetClass(static_cast<unsigned char>(ch)); // Otherwise look up the value, narrowed to one byte, in the document's charClass table
}
/**
* Used by commands that want to select whole words.
* Finds the start of word at pos when delta < 0 or the end of the word when delta >= 0.
*/
-int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) {
+int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) const {
CharClassify::cc ccStart = CharClassify::ccWord;
if (delta < 0) {
- if (!onlyWordCharacters)
- ccStart = WordCharClass(cb.CharAt(pos-1));
- while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == ccStart))
- pos--;
+ if (!onlyWordCharacters) { // Match the class of the character before pos instead of requiring ccWord
+ const CharacterExtracted ce = CharacterBefore(pos);
+ ccStart = WordCharacterClass(ce.character);
+ }
+ while (pos > 0) {
+ const CharacterExtracted ce = CharacterBefore(pos);
+ if (WordCharacterClass(ce.character) != ccStart)
+ break;
+ pos -= ce.widthBytes; // Step back over the whole character rather than a single byte
+ }
} else {
- if (!onlyWordCharacters && pos < Length())
- ccStart = WordCharClass(cb.CharAt(pos));
- while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == ccStart))
- pos++;
+ if (!onlyWordCharacters && pos < Length()) { // Match the class of the character at pos instead of requiring ccWord
+ const CharacterExtracted ce = CharacterAfter(pos);
+ ccStart = WordCharacterClass(ce.character);
+ }
+ while (pos < Length()) {
+ const CharacterExtracted ce = CharacterAfter(pos);
+ if (WordCharacterClass(ce.character) != ccStart)
+ break;
+ pos += ce.widthBytes; // Step forward over the whole character rather than a single byte
+ }
}
return MovePositionOutsideChar(pos, delta, true);
}
@@ -1518,22 +1662,39 @@ int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) {
* additional movement to transit white space.
* Used by cursor movement by word commands.
*/
-int Document::NextWordStart(int pos, int delta) {
+int Document::NextWordStart(int pos, int delta) const {
if (delta < 0) {
- while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == CharClassify::ccSpace))
- pos--;
+ while (pos > 0) { // Skip backwards over characters classed as space
+ const CharacterExtracted ce = CharacterBefore(pos);
+ if (WordCharacterClass(ce.character) != CharClassify::ccSpace)
+ break;
+ pos -= ce.widthBytes;
+ }
if (pos > 0) {
- CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos-1));
- while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == ccStart)) {
- pos--;
+ CharacterExtracted ce = CharacterBefore(pos); // Then skip backwards over the run sharing this character's class
+ const CharClassify::cc ccStart = WordCharacterClass(ce.character);
+ while (pos > 0) {
+ ce = CharacterBefore(pos);
+ if (WordCharacterClass(ce.character) != ccStart)
+ break;
+ pos -= ce.widthBytes;
}
}
} else {
- CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos));
- while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == ccStart))
- pos++;
- while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == CharClassify::ccSpace))
- pos++;
+ CharacterExtracted ce = CharacterAfter(pos); // Skip forwards over the run sharing the class of the character at pos
+ const CharClassify::cc ccStart = WordCharacterClass(ce.character);
+ while (pos < Length()) {
+ ce = CharacterAfter(pos);
+ if (WordCharacterClass(ce.character) != ccStart)
+ break;
+ pos += ce.widthBytes;
+ }
+ while (pos < Length()) { // Then skip forwards over any characters classed as space
+ ce = CharacterAfter(pos);
+ if (WordCharacterClass(ce.character) != CharClassify::ccSpace)
+ break;
+ pos += ce.widthBytes;
+ }
}
return pos;
}
@@ -1545,27 +1706,41 @@ int Document::NextWordStart(int pos, int delta) {
* additional movement to transit white space.
* Used by cursor movement by word commands.
*/
-int Document::NextWordEnd(int pos, int delta) {
+int Document::NextWordEnd(int pos, int delta) const {
if (delta < 0) {
if (pos > 0) {
- CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos-1));
+ CharacterExtracted ce = CharacterBefore(pos);
+ CharClassify::cc ccStart = WordCharacterClass(ce.character);
if (ccStart != CharClassify::ccSpace) {
- while (pos > 0 && WordCharClass(cb.CharAt(pos - 1)) == ccStart) {
- pos--;
+ while (pos > 0) {
+ ce = CharacterBefore(pos);
+ if (WordCharacterClass(ce.character) != ccStart)
+ break;
+ pos -= ce.widthBytes;
}
}
- while (pos > 0 && WordCharClass(cb.CharAt(pos - 1)) == CharClassify::ccSpace) {
- pos--;
+ while (pos > 0) {
+ ce = CharacterBefore(pos);
+ if (WordCharacterClass(ce.character) != CharClassify::ccSpace)
+ break;
+ pos -= ce.widthBytes;
}
}
} else {
- while (pos < Length() && WordCharClass(cb.CharAt(pos)) == CharClassify::ccSpace) {
- pos++;
+ while (pos < Length()) {
+ CharacterExtracted ce = CharacterAfter(pos);
+ if (WordCharacterClass(ce.character) != CharClassify::ccSpace)
+ break;
+ pos += ce.widthBytes;
}
if (pos < Length()) {
- CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos));
- while (pos < Length() && WordCharClass(cb.CharAt(pos)) == ccStart) {
- pos++;
+ CharacterExtracted ce = CharacterAfter(pos);
+ CharClassify::cc ccStart = WordCharacterClass(ce.character);
+ while (pos < Length()) {
+ ce = CharacterAfter(pos);
+ if (WordCharacterClass(ce.character) != ccStart)
+ break;
+ pos += ce.widthBytes;
}
}
}
@@ -1577,10 +1752,15 @@ int Document::NextWordEnd(int pos, int delta) {
* the previous character is of a different character class.
*/
bool Document::IsWordStartAt(int pos) const {
+ if (pos >= Length()) // No character starts at or after the end of the document
+ return false;
if (pos > 0) {
- CharClassify::cc ccPos = WordCharClass(CharAt(pos));
+ const CharacterExtracted cePos = CharacterAfter(pos); // Whole character starting at pos
+ const CharClassify::cc ccPos = WordCharacterClass(cePos.character);
+ const CharacterExtracted cePrev = CharacterBefore(pos); // Whole character ending just before pos
+ const CharClassify::cc ccPrev = WordCharacterClass(cePrev.character);
return (ccPos == CharClassify::ccWord || ccPos == CharClassify::ccPunctuation) &&
- (ccPos != WordCharClass(CharAt(pos - 1)));
+ (ccPos != ccPrev);
}
return true;
}
@@ -1590,10 +1770,15 @@ bool Document::IsWordStartAt(int pos) const {
* the next character is of a different character class.
*/
bool Document::IsWordEndAt(int pos) const {
+ if (pos <= 0) // No character precedes the start of the document
+ return false;
if (pos < Length()) {
- CharClassify::cc ccPrev = WordCharClass(CharAt(pos-1));
+ const CharacterExtracted cePos = CharacterAfter(pos); // Whole character starting at pos
+ const CharClassify::cc ccPos = WordCharacterClass(cePos.character);
+ const CharacterExtracted cePrev = CharacterBefore(pos); // Whole character ending just before pos
+ const CharClassify::cc ccPrev = WordCharacterClass(cePrev.character);
return (ccPrev == CharClassify::ccWord || ccPrev == CharClassify::ccPunctuation) &&
- (ccPrev != WordCharClass(CharAt(pos)));
+ (ccPrev != ccPos);
}
return true;
}
@@ -2075,96 +2260,137 @@ void Document::NotifyModified(DocModification mh) {
}
}
-bool Document::IsWordPartSeparator(char ch) const {
- return (WordCharClass(ch) == CharClassify::ccWord) && IsPunctuation(ch);
+// Used for word part navigation: true when ch is one of the 32 ASCII punctuation characters listed below; every other value, including all non-ASCII characters, yields false.
+static bool IsASCIIPunctuationCharacter(unsigned int ch) {
+ switch (ch) {
+ case '!':
+ case '"':
+ case '#':
+ case '$':
+ case '%':
+ case '&':
+ case '\'':
+ case '(':
+ case ')':
+ case '*':
+ case '+':
+ case ',':
+ case '-':
+ case '.':
+ case '/':
+ case ':':
+ case ';':
+ case '<':
+ case '=':
+ case '>':
+ case '?':
+ case '@':
+ case '[':
+ case '\\':
+ case ']':
+ case '^':
+ case '_':
+ case '`':
+ case '{':
+ case '|':
+ case '}':
+ case '~':
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool Document::IsWordPartSeparator(unsigned int ch) const { // True when ch is ASCII punctuation that WordCharacterClass nevertheless classes as ccWord
+ return (WordCharacterClass(ch) == CharClassify::ccWord) && IsASCIIPunctuationCharacter(ch);
}
-int Document::WordPartLeft(int pos) {
+int Document::WordPartLeft(int pos) const {
if (pos > 0) {
- --pos;
- char startChar = cb.CharAt(pos);
- if (IsWordPartSeparator(startChar)) {
- while (pos > 0 && IsWordPartSeparator(cb.CharAt(pos))) {
- --pos;
+ pos -= CharacterBefore(pos).widthBytes; // Move to the start of the character before pos
+ CharacterExtracted ceStart = CharacterAfter(pos);
+ if (IsWordPartSeparator(ceStart.character)) {
+ while (pos > 0 && IsWordPartSeparator(CharacterAfter(pos).character)) { // Skip leftwards across a run of word-part separators
+ pos -= CharacterBefore(pos).widthBytes;
}
}
if (pos > 0) {
- startChar = cb.CharAt(pos);
- --pos;
- if (IsLowerCase(startChar)) {
- while (pos > 0 && IsLowerCase(cb.CharAt(pos)))
- --pos;
- if (!IsUpperCase(cb.CharAt(pos)) && !IsLowerCase(cb.CharAt(pos)))
- ++pos;
- } else if (IsUpperCase(startChar)) {
- while (pos > 0 && IsUpperCase(cb.CharAt(pos)))
- --pos;
- if (!IsUpperCase(cb.CharAt(pos)))
- ++pos;
- } else if (IsADigit(startChar)) {
- while (pos > 0 && IsADigit(cb.CharAt(pos)))
- --pos;
- if (!IsADigit(cb.CharAt(pos)))
- ++pos;
- } else if (IsPunctuation(startChar)) {
- while (pos > 0 && IsPunctuation(cb.CharAt(pos)))
- --pos;
- if (!IsPunctuation(cb.CharAt(pos)))
- ++pos;
- } else if (isspacechar(startChar)) {
- while (pos > 0 && isspacechar(cb.CharAt(pos)))
- --pos;
- if (!isspacechar(cb.CharAt(pos)))
- ++pos;
- } else if (!IsASCII(startChar)) {
- while (pos > 0 && !IsASCII(cb.CharAt(pos)))
- --pos;
- if (IsASCII(cb.CharAt(pos)))
- ++pos;
+ ceStart = CharacterAfter(pos); // This character's kind selects which run is skipped below
+ pos -= CharacterBefore(pos).widthBytes;
+ if (IsLowerCase(ceStart.character)) {
+ while (pos > 0 && IsLowerCase(CharacterAfter(pos).character))
+ pos -= CharacterBefore(pos).widthBytes;
+ if (!IsUpperCase(CharacterAfter(pos).character) && !IsLowerCase(CharacterAfter(pos).character)) // Keep a leading letter of either case; otherwise step forward past the non-letter that ended the run
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (IsUpperCase(ceStart.character)) {
+ while (pos > 0 && IsUpperCase(CharacterAfter(pos).character))
+ pos -= CharacterBefore(pos).widthBytes;
+ if (!IsUpperCase(CharacterAfter(pos).character)) // Overshot the run by one character: step forward again
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (IsADigit(ceStart.character)) {
+ while (pos > 0 && IsADigit(CharacterAfter(pos).character))
+ pos -= CharacterBefore(pos).widthBytes;
+ if (!IsADigit(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (IsASCIIPunctuationCharacter(ceStart.character)) {
+ while (pos > 0 && IsASCIIPunctuationCharacter(CharacterAfter(pos).character))
+ pos -= CharacterBefore(pos).widthBytes;
+ if (!IsASCIIPunctuationCharacter(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (isspacechar(ceStart.character)) {
+ while (pos > 0 && isspacechar(CharacterAfter(pos).character))
+ pos -= CharacterBefore(pos).widthBytes;
+ if (!isspacechar(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (!IsASCII(ceStart.character)) {
+ while (pos > 0 && !IsASCII(CharacterAfter(pos).character)) // Treat consecutive non-ASCII characters as a single run
+ pos -= CharacterBefore(pos).widthBytes;
+ if (IsASCII(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
} else {
- ++pos;
+ pos += CharacterAfter(pos).widthBytes; // No run recognised: undo the single-character step back
}
}
}
return pos;
}
-int Document::WordPartRight(int pos) {
- char startChar = cb.CharAt(pos);
- int length = Length();
- if (IsWordPartSeparator(startChar)) {
- while (pos < length && IsWordPartSeparator(cb.CharAt(pos)))
- ++pos;
- startChar = cb.CharAt(pos);
- }
- if (!IsASCII(startChar)) {
- while (pos < length && !IsASCII(cb.CharAt(pos)))
- ++pos;
- } else if (IsLowerCase(startChar)) {
- while (pos < length && IsLowerCase(cb.CharAt(pos)))
- ++pos;
- } else if (IsUpperCase(startChar)) {
- if (IsLowerCase(cb.CharAt(pos + 1))) {
- ++pos;
- while (pos < length && IsLowerCase(cb.CharAt(pos)))
- ++pos;
+int Document::WordPartRight(int pos) const {
+ CharacterExtracted ceStart = CharacterAfter(pos); // This character's kind selects which run is skipped below
+ const int length = Length();
+ if (IsWordPartSeparator(ceStart.character)) {
+ while (pos < length && IsWordPartSeparator(CharacterAfter(pos).character)) // Skip rightwards across a run of word-part separators
+ pos += CharacterAfter(pos).widthBytes;
+ ceStart = CharacterAfter(pos);
+ }
+ if (!IsASCII(ceStart.character)) {
+ while (pos < length && !IsASCII(CharacterAfter(pos).character)) // Treat consecutive non-ASCII characters as a single run
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (IsLowerCase(ceStart.character)) {
+ while (pos < length && IsLowerCase(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (IsUpperCase(ceStart.character)) {
+ if (IsLowerCase(CharacterAfter(pos + ceStart.widthBytes).character)) { // Capital followed by lower case: consume the capital and its lower case run
+ pos += CharacterAfter(pos).widthBytes;
+ while (pos < length && IsLowerCase(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
} else {
- while (pos < length && IsUpperCase(cb.CharAt(pos)))
- ++pos;
- }
- if (IsLowerCase(cb.CharAt(pos)) && IsUpperCase(cb.CharAt(pos - 1)))
- --pos;
- } else if (IsADigit(startChar)) {
- while (pos < length && IsADigit(cb.CharAt(pos)))
- ++pos;
- } else if (IsPunctuation(startChar)) {
- while (pos < length && IsPunctuation(cb.CharAt(pos)))
- ++pos;
- } else if (isspacechar(startChar)) {
- while (pos < length && isspacechar(cb.CharAt(pos)))
- ++pos;
+ while (pos < length && IsUpperCase(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ }
+ if (IsLowerCase(CharacterAfter(pos).character) && IsUpperCase(CharacterBefore(pos).character)) // Stopped between a capital and a lower case letter: step back so the capital begins the next part
+ pos -= CharacterBefore(pos).widthBytes;
+ } else if (IsADigit(ceStart.character)) {
+ while (pos < length && IsADigit(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (IsASCIIPunctuationCharacter(ceStart.character)) {
+ while (pos < length && IsASCIIPunctuationCharacter(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
+ } else if (isspacechar(ceStart.character)) {
+ while (pos < length && isspacechar(CharacterAfter(pos).character))
+ pos += CharacterAfter(pos).widthBytes;
} else {
- ++pos;
+ pos += CharacterAfter(pos).widthBytes; // No run recognised: advance by one whole character
}
return pos;
}