diff options
Diffstat (limited to 'src/UniConversion.cxx')
| -rw-r--r-- | src/UniConversion.cxx | 23 | 
1 files changed, 23 insertions, 0 deletions
| diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 4da9e102a..d0028d65e 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -8,6 +8,7 @@  #include <stdlib.h>  #include <stdexcept> +#include <string>  #include "UniConversion.h" @@ -304,6 +305,28 @@ int UTF8DrawBytes(const unsigned char *us, int len) {  	return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);  } +// Replace invalid bytes in UTF-8 with the replacement character +std::string FixInvalidUTF8(const std::string &text) { +	std::string result; +	const unsigned char *us = reinterpret_cast<const unsigned char *>(text.c_str()); +	size_t remaining = text.size(); +	while (remaining > 0) { +		const int utf8Status = UTF8Classify(us, static_cast<int>(remaining)); +		if (utf8Status & UTF8MaskInvalid) { +			// Replacement character 0xFFFD = UTF8:"efbfbd". +			result.append("\xef\xbf\xbd"); +			us++; +			remaining--; +		} else { +			const int len = utf8Status&UTF8MaskWidth; +			result.append(reinterpret_cast<const char *>(us), len); +			us += len; +			remaining -= len; +		} +	} +	return result; +} +  #ifdef SCI_NAMESPACE  }  #endif | 
