From 867d22e419afe769f05ad26b61c6ea5ea1432c3c Mon Sep 17 00:00:00 2001
From: Robin Haberkorn <robin.haberkorn@googlemail.com>
Date: Sat, 22 Mar 2025 13:45:28 +0300
Subject: harmonized all word-movement and deletion commands: they move/delete
 until the beginning of words now

* All commands and their documentation were inconsistent.
  * ^W rubbed out to the beginning of words.
  * Shift+Right (fnkeys.tes) moved to the beginning of the next word if
    invoked at the beginning of a word and to the end of the next word otherwise.
  * <W> (and <V> and <Y> by extension) moved to the end of the next word.
  * The cheat sheet would claim that <W> moves to the beginning of the next word.
* Video TECO's <W> command would differ again from everything else.
  With positive arguments, it moved to the beginning of words, while
  with negative it moved to end of words.
  I decided not to copy this behavior.
* It has been decided to adopt a consistent beginning-of-words policy.
  -W therefore differs from Video TECO in moving to the beginning of the
  current or previous word.
* teco_find_words() is now based on parsing the document pointer, instead
  of relying on SCI_WORDENDPOSITION, since the latter cannot actually be
  used to skip strictly non-word characters.
  This requires a constant number of Scintilla messages, but it needs fewer
  messages than before only when moving by more than 3 words.
* The semantics of <W> are therefore now consistent with Vim and Emacs as well.
* Shift+Right/Left is still based on SCI_WORDENDPOSITION, so its behavior
  differs slightly from <W> for instance at the end of lines, as it will
  stop at linebreaks.
* Unfortunately, these changes will break lots of macros, among others
  the M#rf, M#sp and git.blame macros ("Useful macros" from the wiki).
---
 src/cmdline.c           |   4 ++
 src/core-commands.c     | 109 +++++++++++++++++++++++++++++++++++++++---------
 src/symbols-extract.tes |   4 +-
 3 files changed, 95 insertions(+), 22 deletions(-)

(limited to 'src')
diff --git a/src/cmdline.c b/src/cmdline.c
index dde096d..b3da887 100644
--- a/src/cmdline.c
+++ b/src/cmdline.c
@@ -509,6 +509,10 @@ teco_state_stringbuilding_start_process_edit_cmd(teco_machine_stringbuilding_t *
 	case TECO_CTL_KEY('W'): { /* rubout/reinsert word */
 		teco_interface_popup_clear();
 
+		/*
+		 * NOTE: This must be consistent with teco_find_words():
+		 * Always delete to the beginning of the previous word.
+		 */
 		g_auto(teco_string_t) wchars;
 		wchars.len = teco_interface_ssm(SCI_GETWORDCHARS, 0, 0);
 		wchars.data = g_malloc(wchars.len + 1);
diff --git a/src/core-commands.c b/src/core-commands.c
index 979095b..8cbb4be 100644
--- a/src/core-commands.c
+++ b/src/core-commands.c
@@ -700,31 +700,97 @@ teco_state_start_back(teco_machine_main_t *ctx, GError **error)
 }
 
 /*
- * FIXME: would be nice to do this with constant amount of
- * editor messages. E.g. by using custom algorithm accessing
- * the internal document buffer.
+ * NOTE: This implementation has a constant/maximum number of Scintilla
+ * messages, compared to using SCI_WORDENDPOSITION.
+ * This pays off only for n > 3, though.
+ * But most importantly SCI_WORDENDPOSITION(p, FALSE) does not actually skip
+ * over all non-word characters.
  */
 static gboolean
 teco_find_words(gsize *pos, teco_int_t n)
 {
+	if (!n)
+		return TRUE;
+
+	g_auto(teco_string_t) wchars;
+	wchars.len = teco_interface_ssm(SCI_GETWORDCHARS, 0, 0);
+	wchars.data = g_malloc(wchars.len + 1);
+	teco_interface_ssm(SCI_GETWORDCHARS, 0, (sptr_t)wchars.data);
+	wchars.data[wchars.len] = '\0';
+
+	sptr_t gap = teco_interface_ssm(SCI_GETGAPPOSITION, 0, 0);
+
 	if (n > 0) {
+		/* scan forward */
+		gsize len = teco_interface_ssm(SCI_GETLENGTH, 0, 0);
+		gsize range_len = gap > *pos ? gap - *pos : len - *pos;
+		if (!range_len)
+			return FALSE;
+		const gchar *buffer, *p;
+		p = buffer = (const gchar *)teco_interface_ssm(SCI_GETRANGEPOINTER, *pos, range_len);
+
 		while (n--) {
-			sptr_t old_pos = *pos;
-			*pos = teco_interface_ssm(SCI_WORDENDPOSITION, *pos, FALSE);
-			*pos = teco_interface_ssm(SCI_WORDENDPOSITION, *pos, TRUE);
-			if (*pos == old_pos)
-				return FALSE;
+			gboolean skip_word = TRUE;
+
+			for (;;) {
+				if (*pos == len)
+					/* end of document */
+					return n == 0;
+				if (p-buffer >= range_len) {
+					g_assert(*pos == gap);
+					range_len = len - gap;
+					p = buffer = (const gchar *)teco_interface_ssm(SCI_GETRANGEPOINTER, gap, range_len);
+				}
+				/*
+				 * FIXME: Is this safe or do we have to look up Unicode code points?
+				 */
+				if ((!teco_string_contains(&wchars, *p)) == skip_word) {
+					if (!skip_word)
+						break;
+					skip_word = !skip_word;
+					continue;
+				}
+				(*pos)++;
+				p++;
+			}
 		}
 
 		return TRUE;
 	}
 
+	/* scan backwards */
+	gsize range_len = gap < *pos ? *pos - gap : *pos;
+	if (!range_len)
+		return FALSE;
+	const gchar *buffer, *p;
+	buffer = (const gchar *)teco_interface_ssm(SCI_GETRANGEPOINTER, *pos - range_len, range_len);
+	p = buffer+range_len;
+
 	while (n++) {
-		sptr_t old_pos = *pos;
-		*pos = teco_interface_ssm(SCI_WORDSTARTPOSITION, *pos, TRUE);
-		*pos = teco_interface_ssm(SCI_WORDSTARTPOSITION, *pos, FALSE);
-		if (*pos == old_pos)
-			return FALSE;
+		gboolean skip_word = FALSE;
+
+		for (;;) {
+			if (*pos == 0)
+				/* beginning of document */
+				return n == 0;
+			if (p == buffer) {
+				g_assert(*pos == gap);
+				range_len = *pos;
+				buffer = (const gchar *)teco_interface_ssm(SCI_GETRANGEPOINTER, 0, range_len);
+				p = buffer+range_len;
+			}
+			/*
+			 * FIXME: Is this safe or do we have to look up Unicode code points?
+			 */
+			if ((!teco_string_contains(&wchars, p[-1])) == skip_word) {
+				if (skip_word)
+					break;
+				skip_word = !skip_word;
+				continue;
+			}
+			(*pos)--;
+			p--;
+		}
 	}
 
 	return TRUE;
@@ -738,8 +804,8 @@ teco_find_words(gsize *pos, teco_int_t n)
  * Move dot <n> words forward.
  *   - If <n> is positive, dot is positioned at the beginning
  *     of the word <n> words after the current one.
- *   - If <n> is negative, dot is positioned at the end
- *     of the word <n> words before the current one.
+ *   - If <n> is negative, dot is positioned at the beginning
+ *     of the word, <-n> words before the current one.
  *   - If <n> is zero, dot is not moved.
  *
  * \(lqW\(rq uses Scintilla's definition of a word as
@@ -747,8 +813,9 @@ teco_find_words(gsize *pos, teco_int_t n)
  * .B SCI_SETWORDCHARS
  * message.
  *
- * Otherwise, the command's behaviour is analogous to
- * the \(lqC\(rq command.
+ * If the requested word would lie beyond the range of the
+ * buffer, the command yields an error.
+ * If colon-modified it instead returns a condition code.
  */
 static void
 teco_state_start_word(teco_machine_main_t *ctx, GError **error)
@@ -805,10 +872,12 @@ teco_delete_words(teco_int_t n)
  * -V
  * [n]:V -> Success|Failure
  *
- * Deletes the next <n> words until the end of the
+ * Deletes the next <n> words until the beginning of the
  * n'th word after the current one.
- * If <n> is negative, deletes up to end of the
- * n'th word before the current one.
+ * If <n> is negative, deletes up to the beginning of the
+ * word, <-n> words before the current one.
+ * \(lq-V\(rq in the middle of a word deletes until the beginning
+ * of the word.
  * If <n> is omitted, 1 or -1 is implied depending on the
  * sign prefix.
  *
diff --git a/src/symbols-extract.tes b/src/symbols-extract.tes
index 1ab6667..9a8a270 100755
--- a/src/symbols-extract.tes
+++ b/src/symbols-extract.tes
@@ -15,7 +15,7 @@ EMQ[$SCITECOPATH]/string.tes
 LR 0X#ou 2LR 0X#in HK
 
 !* copy all defines in input file beginning with prefix *!
-EBN#in <S#defineS[Q[getopt.p]]; -SS :Xa> EF
+EBN#in <S#defineS[[Q[getopt.p]]M ]; 1:Xa 10:a> EF
 
 !* sort all defines *!
 Ga ZJB 0,.M[qsort] J
@@ -37,7 +37,7 @@ I/*
 
 static const teco_symbol_entry_t entries[] = {^J
 <
-  .,W.Xa 0KK
+  .,LR.Xa 0KK
   I#ifdef Qa^J^I{"Qa", Qa},^J#endif^J
 .-Z;>
 I};
-- 
cgit v1.2.3