about summary refs log tree commit diff homepage
diff options
context:
space:
mode:
author nyamatongwe <unknown> 2007-10-10 11:30:14 +0000
committer nyamatongwe <unknown> 2007-10-10 11:30:14 +0000
commit 36b586e7dfc2b782f89ff9e73e1a48ccd94af033 (patch)
tree fdbdb1340f059a882a4bdf3b0f734b8b72793ebb
parent c341b2b24cb94fd4173d2baff0efb7d603c901dd (diff)
download scintilla-mirror-36b586e7dfc2b782f89ff9e73e1a48ccd94af033.tar.gz
Detect and handle invalid byte sequences in UTF-8 mode by displaying each
individual invalid byte as a hex blob.
-rw-r--r-- src/Document.cxx 60
-rw-r--r-- src/Document.h 1
-rw-r--r-- src/Editor.cxx 118
-rw-r--r-- src/PositionCache.cxx 47
-rw-r--r-- src/PositionCache.h 7
5 files changed, 201 insertions, 32 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index e2ca7a32a..ff8d0fbcf 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -287,6 +287,55 @@ int Document::LenChar(int pos) {
}
}
+static bool IsTrailByte(int ch) {
+ return (ch >= 0x80) && (ch < (0x80 + 0x40));
+}
+
+static int BytesFromLead(int leadByte) {
+ if (leadByte > 0xF4) {
+ // Characters longer than 4 bytes not possible in current UTF-8
+ return 0;
+ } else if (leadByte >= 0xF0) {
+ return 4;
+ } else if (leadByte >= 0xE0) {
+ return 3;
+ } else if (leadByte >= 0xC2) {
+ return 2;
+ }
+ return 0;
+}
+
+bool Document::InGoodUTF8(int pos, int &start, int &end) {
+ int lead = pos;
+ while ((lead>0) && (pos-lead < 4) && IsTrailByte(static_cast<unsigned char>(cb.CharAt(lead-1))))
+ lead--;
+ start = 0;
+ if (lead > 0) {
+ start = lead-1;
+ }
+ int leadByte = static_cast<unsigned char>(cb.CharAt(start));
+ int bytes = BytesFromLead(leadByte);
+ if (bytes == 0) {
+ return false;
+ } else {
+ int trailBytes = bytes - 1;
+ int len = pos - lead + 1;
+ if (len > trailBytes)
+ // pos too far from lead
+ return false;
+ // Check that there are enough trails for this lead
+ int trail = pos + 1;
+ while ((trail-lead<trailBytes) && (trail < Length())) {
+ if (!IsTrailByte(static_cast<unsigned char>(cb.CharAt(trail)))) {
+ return false;
+ }
+ trail++;
+ }
+ end = start + bytes;
+ return true;
+ }
+}
+
// Normalise a position so that it is not halfway through a two byte character.
// This can occur in two situations -
// When lines are terminated with \r\n pairs which should be treated as one character.
@@ -313,13 +362,14 @@ int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) {
if (dbcsCodePage) {
if (SC_CP_UTF8 == dbcsCodePage) {
unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos));
- while ((pos > 0) && (pos < Length()) && (ch >= 0x80) && (ch < (0x80 + 0x40))) {
- // ch is a trail byte
+ int startUTF = pos;
+ int endUTF = pos;
+ if (IsTrailByte(ch) && InGoodUTF8(pos, startUTF, endUTF)) {
+ // ch is a trail byte within a UTF-8 character
if (moveDir > 0)
- pos++;
+ pos = endUTF;
else
- pos--;
- ch = static_cast<unsigned char>(cb.CharAt(pos));
+ pos = startUTF;
}
} else {
// Anchor DBCS calculations at start of line because start of line can
diff --git a/src/Document.h b/src/Document.h
index 9143ec6e4..a36c4aafe 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -138,6 +138,7 @@ public:
int ClampPositionIntoDocument(int pos);
bool IsCrLf(int pos);
int LenChar(int pos);
+ bool InGoodUTF8(int pos, int &start, int &end);
int MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd=true);
// Gateways to modifying document
diff --git a/src/Editor.cxx b/src/Editor.cxx
index 372ba0809..a1a48a08b 100644
--- a/src/Editor.cxx
+++ b/src/Editor.cxx
@@ -1684,6 +1684,61 @@ LineLayout *Editor::RetrieveLineLayout(int lineNumber) {
LinesOnScreen() + 1, pdoc->LinesTotal());
}
+static bool GoodTrailByte(int v) {
+ return (v >= 0x80) && (v < 0xc0);
+}
+
+bool BadUTF(const char *s, int len, int &trailBytes) {
+ if (trailBytes) {
+ trailBytes--;
+ return false;
+ }
+ const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
+ if (*us < 0x80) {
+ // Single bytes easy
+ return false;
+ } else if (*us > 0xF4) {
+ // Characters longer than 4 bytes not possible in current UTF-8
+ return true;
+ } else if (*us >= 0xF0) {
+ // 4 bytes
+ if (len < 4)
+ return true;
+ if (GoodTrailByte(us[1]) && GoodTrailByte(us[2]) && GoodTrailByte(us[3])) {
+ trailBytes = 3;
+ return false;
+ } else {
+ return true;
+ }
+ } else if (*us >= 0xE0) {
+ // 3 bytes
+ if (len < 3)
+ return true;
+ if (GoodTrailByte(us[1]) && GoodTrailByte(us[2])) {
+ trailBytes = 2;
+ return false;
+ } else {
+ return true;
+ }
+ } else if (*us >= 0xC2) {
+ // 2 bytes
+ if (len < 2)
+ return true;
+ if (GoodTrailByte(us[1])) {
+ trailBytes = 1;
+ return false;
+ } else {
+ return true;
+ }
+ } else if (*us >= 0xC0) {
+ // Overlong encoding
+ return true;
+ } else {
+ // Trail byte
+ return true;
+ }
+}
+
/**
* Fill in the LineLayout data for the given line.
* Copy the given @a line and its styles from the document into local arrays.
@@ -1795,11 +1850,15 @@ void Editor::LayoutLine(int line, Surface *surface, ViewStyle &vstyle, LineLayou
int ctrlCharWidth[32] = {0};
bool isControlNext = IsControlCharacter(ll->chars[0]);
+ int trailBytes = 0;
+ bool isBadUTFNext = IsUnicodeMode() && BadUTF(ll->chars, numCharsInLine, trailBytes);
for (int charInLine = 0; charInLine < numCharsInLine; charInLine++) {
bool isControl = isControlNext;
isControlNext = IsControlCharacter(ll->chars[charInLine + 1]);
+ bool isBadUTF = isBadUTFNext;
+ isBadUTFNext = IsUnicodeMode() && BadUTF(ll->chars + charInLine + 1, numCharsInLine - charInLine - 1, trailBytes);
if ((ll->styles[charInLine] != ll->styles[charInLine + 1]) ||
- isControl || isControlNext) {
+ isControl || isControlNext || isBadUTF || isBadUTFNext) {
ll->positions[startseg] = 0;
if (vstyle.styles[ll->styles[charInLine]].visible) {
if (isControl) {
@@ -1820,6 +1879,11 @@ void Editor::LayoutLine(int line, Surface *surface, ViewStyle &vstyle, LineLayou
ll->positions + startseg + 1);
}
lastSegItalics = false;
+ } else if (isBadUTF) {
+ char hexits[3];
+ sprintf(hexits, "%2X", ll->chars[charInLine] & 0xff);
+ ll->positions[charInLine + 1] =
+ surface->WidthText(ctrlCharsFont, hexits, istrlen(hexits)) + 3;
} else { // Regular character
int lenSeg = charInLine - startseg + 1;
if ((lenSeg == 1) && (' ' == ll->chars[startseg])) {
@@ -2133,6 +2197,30 @@ void Editor::DrawIndicators(Surface *surface, ViewStyle &vsDraw, int line, int x
}
}
+void DrawTextBlob(Surface *surface, ViewStyle &vsDraw, PRectangle rcSegment,
+ const char *s, ColourAllocated textBack, ColourAllocated textFore, bool twoPhaseDraw) {
+ if (!twoPhaseDraw) {
+ surface->FillRectangle(rcSegment, textBack);
+ }
+ Font &ctrlCharsFont = vsDraw.styles[STYLE_CONTROLCHAR].font;
+ int normalCharHeight = surface->Ascent(ctrlCharsFont) -
+ surface->InternalLeading(ctrlCharsFont);
+ PRectangle rcCChar = rcSegment;
+ rcCChar.left = rcCChar.left + 1;
+ rcCChar.top = rcSegment.top + vsDraw.maxAscent - normalCharHeight;
+ rcCChar.bottom = rcSegment.top + vsDraw.maxAscent + 1;
+ PRectangle rcCentral = rcCChar;
+ rcCentral.top++;
+ rcCentral.bottom--;
+ surface->FillRectangle(rcCentral, textFore);
+ PRectangle rcChar = rcCChar;
+ rcChar.left++;
+ rcChar.right--;
+ surface->DrawTextClipped(rcChar, ctrlCharsFont,
+ rcSegment.top + vsDraw.maxAscent, s, istrlen(s),
+ textBack, textFore);
+}
+
void Editor::DrawLine(Surface *surface, ViewStyle &vsDraw, int line, int lineVisible, int xStart,
PRectangle rcLine, LineLayout *ll, int subLine) {
@@ -2251,7 +2339,7 @@ void Editor::DrawLine(Surface *surface, ViewStyle &vsDraw, int line, int lineVis
// Does not take margin into account but not significant
int xStartVisible = subLineStart - xStart;
- BreakFinder bfBack(ll, lineStart, lineEnd, posLineStart, xStartVisible);
+ BreakFinder bfBack(ll, lineStart, lineEnd, posLineStart, IsUnicodeMode(), xStartVisible);
int next = bfBack.First();
// Background drawing loop
@@ -2326,7 +2414,7 @@ void Editor::DrawLine(Surface *surface, ViewStyle &vsDraw, int line, int lineVis
inIndentation = subLine == 0; // Do not handle indentation except on first subline.
// Foreground drawing loop
- BreakFinder bfFore(ll, lineStart, lineEnd, posLineStart, xStartVisible);
+ BreakFinder bfFore(ll, lineStart, lineEnd, posLineStart, IsUnicodeMode(), xStartVisible);
next = bfFore.First();
while (next < lineEnd) {
@@ -2391,31 +2479,17 @@ void Editor::DrawLine(Surface *surface, ViewStyle &vsDraw, int line, int lineVis
if (controlCharSymbol < 32) {
// Draw the character
const char *ctrlChar = ControlCharacterString(ll->chars[i]);
- if (!twoPhaseDraw) {
- surface->FillRectangle(rcSegment, textBack);
- }
- int normalCharHeight = surface->Ascent(ctrlCharsFont) -
- surface->InternalLeading(ctrlCharsFont);
- PRectangle rcCChar = rcSegment;
- rcCChar.left = rcCChar.left + 1;
- rcCChar.top = rcSegment.top + vsDraw.maxAscent - normalCharHeight;
- rcCChar.bottom = rcSegment.top + vsDraw.maxAscent + 1;
- PRectangle rcCentral = rcCChar;
- rcCentral.top++;
- rcCentral.bottom--;
- surface->FillRectangle(rcCentral, textFore);
- PRectangle rcChar = rcCChar;
- rcChar.left++;
- rcChar.right--;
- surface->DrawTextClipped(rcChar, ctrlCharsFont,
- rcSegment.top + vsDraw.maxAscent, ctrlChar, istrlen(ctrlChar),
- textBack, textFore);
+ DrawTextBlob(surface, vsDraw, rcSegment, ctrlChar, textBack, textFore, twoPhaseDraw);
} else {
char cc[2] = { static_cast<char>(controlCharSymbol), '\0' };
surface->DrawTextNoClip(rcSegment, ctrlCharsFont,
rcSegment.top + vsDraw.maxAscent,
cc, 1, textBack, textFore);
}
+ } else if ((i == startseg) && (static_cast<unsigned char>(ll->chars[i]) >= 0x80) && IsUnicodeMode()) {
+ char hexits[3];
+ sprintf(hexits, "%2X", ll->chars[i] & 0xff);
+ DrawTextBlob(surface, vsDraw, rcSegment, hexits, textBack, textFore, twoPhaseDraw);
} else {
// Normal text display
if (vsDraw.styles[styleMain].visible) {
diff --git a/src/PositionCache.cxx b/src/PositionCache.cxx
index 1763b6530..f40a15378 100644
--- a/src/PositionCache.cxx
+++ b/src/PositionCache.cxx
@@ -345,12 +345,23 @@ void LineLayoutCache::Dispose(LineLayout *ll) {
}
void BreakFinder::Insert(int val) {
+ // Expand if needed
+ if (saeLen >= saeSize) {
+ saeSize *= 2;
+ int *selAndEdgeNew = new int[saeSize];
+ for (unsigned int j = 0; j<saeLen; j++) {
+ selAndEdgeNew[j] = selAndEdge[j];
+ }
+ delete []selAndEdge;
+ selAndEdge = selAndEdgeNew;
+ }
+
if (val >= nextBreak) {
for (unsigned int j = 0; j<saeLen; j++) {
if (val == selAndEdge[j]) {
return;
} if (val < selAndEdge[j]) {
- for (unsigned int k = saeLen; j>k; k--) {
+ for (unsigned int k = saeLen; k>j; k--) {
selAndEdge[k] = selAndEdge[k-1];
}
saeLen++;
@@ -363,17 +374,32 @@ void BreakFinder::Insert(int val) {
}
}
-BreakFinder::BreakFinder(LineLayout *ll_, int lineStart_, int lineEnd_, int posLineStart_, int xStart) :
+extern bool BadUTF(const char *s, int len, int &trailBytes);
+
+static int NextBadU(const char *s, int p, int len, int &trailBytes) {
+ while (p < len) {
+ p++;
+ if (BadUTF(s + p, len - p, trailBytes))
+ return p;
+ }
+ return -1;
+}
+
+BreakFinder::BreakFinder(LineLayout *ll_, int lineStart_, int lineEnd_, int posLineStart_, bool utf8_, int xStart) :
ll(ll_),
lineStart(lineStart_),
lineEnd(lineEnd_),
posLineStart(posLineStart_),
+ utf8(utf8_),
nextBreak(lineStart_),
+ saeSize(0),
saeLen(0),
saeCurrentPos(0),
saeNext(0),
subBreak(-1) {
- for (unsigned int j=0; j < sizeof(selAndEdge) / sizeof(selAndEdge[0]); j++) {
+ saeSize = 8;
+ selAndEdge = new int[saeSize];
+ for (unsigned int j=0; j < saeSize; j++) {
selAndEdge[j] = 0;
}
@@ -392,9 +418,24 @@ BreakFinder::BreakFinder(LineLayout *ll_, int lineStart_, int lineEnd_, int posL
Insert(ll->edgeColumn - 1);
Insert(lineEnd - 1);
+
+ if (utf8) {
+ int trailBytes=0;
+ for (int pos = -1;;) {
+ pos = NextBadU(ll->chars, pos, lineEnd, trailBytes);
+ if (pos < 0)
+ break;
+ Insert(pos-1);
+ Insert(pos);
+ }
+ }
saeNext = (saeLen > 0) ? selAndEdge[0] : -1;
}
+BreakFinder::~BreakFinder() {
+ delete []selAndEdge;
+}
+
int BreakFinder::First() {
return nextBreak;
}
diff --git a/src/PositionCache.h b/src/PositionCache.h
index 764702fce..5d486cb60 100644
--- a/src/PositionCache.h
+++ b/src/PositionCache.h
@@ -124,15 +124,18 @@ class BreakFinder {
int lineStart;
int lineEnd;
int posLineStart;
+ bool utf8;
int nextBreak;
- int selAndEdge[5];
+ int *selAndEdge;
+ unsigned int saeSize;
unsigned int saeLen;
unsigned int saeCurrentPos;
int saeNext;
int subBreak;
void Insert(int val);
public:
- BreakFinder(LineLayout *ll_, int lineStart_, int lineEnd_, int posLineStart_, int xStart);
+ BreakFinder(LineLayout *ll_, int lineStart_, int lineEnd_, int posLineStart_, bool utf8_, int xStart);
+ ~BreakFinder();
int First();
int Next();
};