From 2a050759ab621b87d0782cc8235907a1757b46cc Mon Sep 17 00:00:00 2001 From: Robin Haberkorn Date: Wed, 11 Sep 2024 14:30:24 +0200 Subject: fixed searches in single-byte encoded documents * while code is guaranteed to be in valid UTF-8, this cannot be said about the result of string building. * The search pattern can end up with invalid Unicode bytes even when searching on UTF-8 buffers, e.g. if ^EQq inserts garbage. There are currently no checks. * When searching on a raw buffer, it must be possible to search for arbitrary bytes (^EUq). Since teco_pattern2regexp() was always expecting clean UTF-8 input, this would sometimes skip over too many bytes and could even crash. * Instead, teco_pattern2regexp() now takes the target codepage into account. --- tests/testsuite.at | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'tests') diff --git a/tests/testsuite.at b/tests/testsuite.at index 0733d2a..0c7612a 100644 --- a/tests/testsuite.at +++ b/tests/testsuite.at @@ -85,6 +85,7 @@ AT_CHECK([$SCITECO -e "@EQa//0EE 1U*0EE 0:@EUa/f^@^@/ :Qa-4\"N(0/0)' Ga Z-4\"N(0 AT_CHECK([$SCITECO -e "0EE 129@I// -A-129\"N(0/0)' HXa @EQa// EE\"N(0/0)'"], 0, ignore, ignore) AT_CHECK([$SCITECO -8e "129@:^Ua// 0Qa-129\"N(0/0)'"], 0, ignore, ignore) AT_CHECK([$SCITECO -e "1EE 167Ua @I/^EUa/ .-1\"N(0/0)'"], 0, ignore, ignore) +AT_CHECK([$SCITECO -8e "194Ua Qa@I//J :@S/^EUa/\"F(0/0)'"], 0, ignore, ignore) AT_CLEANUP AT_SETUP([Unicode]) @@ -203,11 +204,3 @@ AT_SKIP_IF([case $host in *-*-*bsd* | *-*-darwin*) true;; *) false;; esac]) AT_CHECK([$SCITECO -e "@^Um{U.a Q.a-100000\"<%.aMm'} 0Mm"], 0, ignore, ignore) AT_XFAIL_IF(true) AT_CLEANUP - -AT_SETUP([Unicode glitches]) -# While TECO code must always be UTF-8, strings after string building -# can be in single-byte encodings as well. -# It must be possible to search for single bytes in single-byte encodings. -AT_CHECK([$SCITECO -8e "164Ua Ga@I//J :@S/^EUa/\"F(0/0)'"], 0, ignore, ignore) -AT_XFAIL_IF(true) -AT_CLEANUP -- cgit v1.2.3