changeset 20678:71216b5f4bc9 draft

-Add: Caret movement by words for CJK languages.
author Michael Lutz <michi@icosahedron.de>
date Sun, 28 Jul 2013 02:21:43 +0200
parents 9795e5780df4
children 390a9a434896
files src/string.cpp src/string_base.h src/string_func.h src/textbuf.cpp src/textbuf_type.h
diffstat 5 files changed, 216 insertions(+), 125 deletions(-) [+]
line wrap: on
line diff
--- a/src/string.cpp
+++ b/src/string.cpp
@@ -661,50 +661,132 @@
 class IcuStringIterator : public StringIterator
 {
 	icu::BreakIterator *char_itr; ///< ICU iterator for characters.
+	icu::BreakIterator *word_itr; ///< ICU iterator for words.
 	const char *string;           ///< Iteration string in UTF-8.
 
+	SmallVector<UChar, 32> utf16_str;      ///< UTF-16 copy of the string.
+	SmallVector<size_t, 32> utf16_to_utf8; ///< Mapping from UTF-16 code point position to index in the UTF-8 source string.
+
 public:
-	IcuStringIterator() : char_itr(NULL)
+	IcuStringIterator() : char_itr(NULL), word_itr(NULL)
 	{
 		UErrorCode status = U_ZERO_ERROR;
 		this->char_itr = icu::BreakIterator::createCharacterInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status);
+		this->word_itr = icu::BreakIterator::createWordInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status);
+
+		*this->utf16_str.Append() = '\0';
+		*this->utf16_to_utf8.Append() = 0;
 	}
 
 	virtual ~IcuStringIterator()
 	{
 		delete this->char_itr;
+		delete this->word_itr;
 	}
 
 	virtual void SetString(const char *s)
 	{
 		this->string = s;
 
+		/* Unfortunately current ICU versions only provide rudimentary support
+		 * for word break iterators (especially for CJK languages) in combination
+		 * with UTF-8 input. As a workaround we have to convert the input to
+		 * UTF-16 and create a mapping back to UTF-8 character indices. */
+		this->utf16_str.Clear();
+		this->utf16_to_utf8.Clear();
+
+		while (*s != '\0') {
+			size_t idx = s - this->string;
+
+			WChar c = Utf8Consume(&s);
+			if (c <	0x10000) {
+				*this->utf16_str.Append() = (UChar)c;
+			} else {
+				/* Make a surrogate pair. */
+				*this->utf16_str.Append() = (UChar)(0xD800 + ((c - 0x10000) >> 10));
+				*this->utf16_str.Append() = (UChar)(0xDC00 + ((c - 0x10000) & 0x3FF));
+				*this->utf16_to_utf8.Append() = idx;
+			}
+			*this->utf16_to_utf8.Append() = idx;
+		}
+		*this->utf16_str.Append() = '\0';
+		*this->utf16_to_utf8.Append() = s - this->string;
+
 		UText text = UTEXT_INITIALIZER;
 		UErrorCode status = U_ZERO_ERROR;
-		utext_openUTF8(&text, s, -1, &status);
+		utext_openUChars(&text, this->utf16_str.Begin(), this->utf16_str.Length() - 1, &status);
 		this->char_itr->setText(&text, status);
+		this->word_itr->setText(&text, status);
 		this->char_itr->first();
+		this->word_itr->first();
 	}
 
 	virtual size_t SetCurPosition(size_t pos)
 	{
+		/* Convert incoming position to a UTF-16 string index. */
+		uint utf16_pos = 0;
+		for (uint i = 0; i < this->utf16_to_utf8.Length(); i++) {
+			if (this->utf16_to_utf8[i] == pos) {
+				utf16_pos = i;
+				break;
+			}
+		}
+
 		/* isBoundary has the documented side-effect of setting the current
 		 * position to the first valid boundary equal to or greater than
 		 * the passed value. */
-		this->char_itr->isBoundary((int32_t)pos);
-		return this->char_itr->current();
+		this->char_itr->isBoundary(utf16_pos);
+		return this->utf16_to_utf8[this->char_itr->current()];
 	}
 
-	virtual size_t Next()
+	virtual size_t Next(IterType what)
 	{
-		int32_t pos = this->char_itr->next();
-		return pos == icu::BreakIterator::DONE ? END : pos;
+		int32_t pos;
+		switch (what) {
+			case ITER_CHARACTER:
+				pos = this->char_itr->next();
+				break;
+
+			case ITER_WORD:
+				pos = this->word_itr->following(this->char_itr->current());
+				/* The ICU word iterator considers both the start and the end of a word a valid
+				 * break point, but we only want word starts. Move to the next location in
+				 * case the new position points to whitespace. */
+				while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->next();
+
+				this->char_itr->isBoundary(pos);
+				break;
+
+			default:
+				return END;
+		}
+
+		return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
 	}
 
-	virtual size_t Prev()
+	virtual size_t Prev(IterType what)
 	{
-		int32_t pos = this->char_itr->previous();
-		return pos == icu::BreakIterator::DONE ? END : pos;
+		int32_t pos;
+		switch (what) {
+			case ITER_CHARACTER:
+				pos = this->char_itr->previous();
+				break;
+
+			case ITER_WORD:
+				pos = this->word_itr->preceding(this->char_itr->current());
+				/* The ICU word iterator considers both the start and the end of a word a valid
+				 * break point, but we only want word starts. Move to the previous location in
+				 * case the new position points to whitespace. */
+				while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->previous();
+
+				this->char_itr->isBoundary(pos);
+				break;
+
+			default:
+				return END;
+		}
+
+		return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
 	}
 };
 
@@ -730,38 +812,85 @@
 	virtual void SetString(const char *s)
 	{
 		this->string = s;
-		this->len = strlen(s) + 1;
+		this->len = strlen(s);
 		this->cur_pos = 0;
 	}
 
 	virtual size_t SetCurPosition(size_t pos)
 	{
-		assert(this->string != NULL && pos < this->len);
+		assert(this->string != NULL && pos <= this->len);
 		/* Sanitize in case we get a position inside an UTF-8 sequence. */
 		while (pos > 0 && IsUtf8Part(this->string[pos])) pos--;
 		return this->cur_pos = pos;
 	}
 
-	virtual size_t Next()
+	virtual size_t Next(IterType what)
 	{
 		assert(this->string != NULL);
 
 		/* Already at the end? */
 		if (this->cur_pos >= this->len) return END;
 
-		WChar c;
-		this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
-		return this->cur_pos;
+		switch (what) {
+			case ITER_CHARACTER: {
+				WChar c;
+				this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
+				return this->cur_pos;
+			}
+
+			case ITER_WORD: {
+				WChar c;
+				/* Consume current word. */
+				size_t offs = Utf8Decode(&c, this->string + this->cur_pos);
+				while (this->cur_pos < this->len && !IsWhitespace(c)) {
+					this->cur_pos += offs;
+					offs = Utf8Decode(&c, this->string + this->cur_pos);
+				}
+				/* Consume whitespace to the next word. */
+				while (this->cur_pos < this->len && IsWhitespace(c)) {
+					this->cur_pos += offs;
+					offs = Utf8Decode(&c, this->string + this->cur_pos);
+				}
+
+				return this->cur_pos;
+			}
+		}
+
+		return END;
 	}
 
-	virtual size_t Prev()
+	virtual size_t Prev(IterType what)
 	{
 		assert(this->string != NULL);
 
 		/* Already at the beginning? */
 		if (this->cur_pos == 0) return END;
 
-		return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
+		switch (what) {
+			case ITER_CHARACTER:
+				return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
+
+			case ITER_WORD: {
+				const char *s = this->string + this->cur_pos;
+				WChar c;
+				/* Consume preceding whitespace. */
+				do {
+					s = Utf8PrevChar(s);
+					Utf8Decode(&c, s);
+				} while (s > this->string && IsWhitespace(c));
+				/* Consume preceding word. */
+				while (s > this->string && !IsWhitespace(c)) {
+					s = Utf8PrevChar(s);
+					Utf8Decode(&c, s);
+				}
+				/* Move caret back to the beginning of the word. */
+				if (IsWhitespace(c)) Utf8Consume(&s);
+
+				return this->cur_pos = s - this->string;
+			}
+		}
+
+		return END;
 	}
 };
 
--- a/src/string_base.h
+++ b/src/string_base.h
@@ -15,6 +15,12 @@
 /** Class for iterating over different kind of parts of a string. */
 class StringIterator {
 public:
+	/** Type of the iterator. */
+	enum IterType {
+		ITER_CHARACTER, ///< Iterate over characters (or more exactly grapheme clusters).
+		ITER_WORD,      ///< Iterate over words.
+	};
+
 	/** Sentinel to indicate end-of-iteration. */
 	static const size_t END = SIZE_MAX;
 
@@ -45,13 +51,13 @@
 	 * Advance the cursor by one iteration unit.
 	 * @return New cursor position (in bytes) or #END if the cursor is already at the end of the string.
 	 */
-	virtual size_t Next() = 0;
+	virtual size_t Next(IterType what = ITER_CHARACTER) = 0;
 
 	/**
 	 * Move the cursor back by one iteration unit.
 	 * @return New cursor position (in bytes) or #END if the cursor is already at the start of the string.
 	 */
-	virtual size_t Prev() = 0;
+	virtual size_t Prev(IterType what = ITER_CHARACTER) = 0;
 
 protected:
 	StringIterator() {}
--- a/src/string_func.h
+++ b/src/string_func.h
@@ -90,7 +90,6 @@
 	return c;
 }
 
-
 /**
  * Return the length of a UTF-8 encoded character.
  * @param c Unicode character.
@@ -157,6 +156,51 @@
 size_t Utf8StringLength(const char *s);
 
 /**
+ * Is the given character a lead surrogate code point?
+ * @param c The character to test.
+ * @return True if the character is a lead surrogate code point.
+ */
+static inline bool Utf16IsLeadSurrogate(uint c)
+{
+	return c >= 0xD800 && c <= 0xDBFF;
+}
+
+/**
+ * Is the given character a trail surrogate code point?
+ * @param c The character to test.
+ * @return True if the character is a trail surrogate code point.
+ */
+static inline bool Utf16IsTrailSurrogate(uint c)
+{
+	return c >= 0xDC00 && c <= 0xDFFF;
+}
+
+/**
+ * Convert a UTF-16 surrogate pair to the corresponding Unicode character.
+ * @param lead Lead surrogate code point.
+ * @param trail Trail surrogate code point.
+ * @return Decoded Unicode character.
+ */
+static inline WChar Utf16DecodeSurrogate(uint lead, uint trail)
+{
+	return 0x10000 + (((lead - 0xD800) << 10) | (trail - 0xDC00));
+}
+
+/**
+ * Decode a UTF-16 character.
+ * @param c Pointer to one or two UTF-16 code units.
+ * @return Decoded Unicode character.
+ */
+static inline WChar Utf16DecodeChar(const uint16 *c)
+{
+	if (Utf16IsLeadSurrogate(c[0])) {
+		return Utf16DecodeSurrogate(c[0], c[1]);
+	} else {
+		return *c;
+	}
+}
+
+/**
  * Is the given character a text direction character.
  * @param c The character to test.
  * @return true iff the character is used to influence
--- a/src/textbuf.cpp
+++ b/src/textbuf.cpp
@@ -219,70 +219,12 @@
 	return true;
 }
 
-/**
- * Checks if it is possible to move caret to the left
- * @return true if the caret can be moved to the left, otherwise false.
- */
-bool Textbuf::CanMoveCaretLeft()
-{
-	return this->caretpos != 0;
-}
-
-/**
- * Moves the caret to the left.
- * @pre Ensure that Textbuf::CanMoveCaretLeft returns true
- * @return The character under the caret.
- */
-WChar Textbuf::MoveCaretLeft()
-{
-	assert(this->CanMoveCaretLeft());
-
-	size_t pos = this->char_iter->Prev();
-	if (pos == StringIterator::END) pos = 0;
-
-	this->caretpos = (uint16)pos;
-	this->UpdateCaretPosition();
-
-	WChar c;
-	Utf8Decode(&c, this->buf + this->caretpos);
-
-	return c;
-}
-
-/**
- * Checks if it is possible to move caret to the right
- * @return true if the caret can be moved to the right, otherwise false.
- */
-bool Textbuf::CanMoveCaretRight()
-{
-	return this->caretpos < this->bytes - 1;
-}
-
-/**
- * Moves the caret to the right.
- * @pre Ensure that Textbuf::CanMoveCaretRight returns true
- * @return The character under the caret.
- */
-WChar Textbuf::MoveCaretRight()
-{
-	assert(this->CanMoveCaretRight());
-
-	size_t pos = this->char_iter->Next();
-	if (pos == StringIterator::END) pos = this->bytes - 1;
-
-	this->caretpos = (uint16)pos;
-	this->UpdateCaretPosition();
-
-	WChar c;
-	Utf8Decode(&c, this->buf + this->caretpos);
-	return c;
-}
-
 /** Update the character iter after the text has changed. */
 void Textbuf::UpdateStringIter()
 {
 	this->char_iter->SetString(this->buf);
-	this->caretpos = (uint16)this->char_iter->SetCurPosition(this->caretpos);
+	size_t pos = this->char_iter->SetCurPosition(this->caretpos);
+	this->caretpos = pos == StringIterator::END ? 0 : (uint16)pos;
 }
 
 /** Update pixel width of the text. */
@@ -307,64 +249,38 @@
 {
 	switch (keycode) {
 		case WKC_LEFT:
-			if (this->CanMoveCaretLeft()) {
-				this->MoveCaretLeft();
-				return true;
-			}
-			break;
-
 		case WKC_CTRL | WKC_LEFT: {
-			if (!this->CanMoveCaretLeft()) break;
+			if (this->caretpos == 0) break;
 
-			/* Unconditionally move one char to the left. */
-			WChar c = this->MoveCaretLeft();
-			/* Consume left whitespaces. */
-			while (IsWhitespace(c)) {
-				if (!this->CanMoveCaretLeft()) return true;
-				c = this->MoveCaretLeft();
-			}
-			/* Consume left word. */
-			while (!IsWhitespace(c)) {
-				if (!this->CanMoveCaretLeft()) return true;
-				c = this->MoveCaretLeft();
-			}
-			/* Place caret at the beginning of the left word. */
-			this->MoveCaretRight();
+			size_t pos = this->char_iter->Prev(keycode & WKC_CTRL ? StringIterator::ITER_WORD : StringIterator::ITER_CHARACTER);
+			if (pos == StringIterator::END) return true;
+
+			this->caretpos = (uint16)pos;
+			this->UpdateCaretPosition();
 			return true;
 		}
 
 		case WKC_RIGHT:
-			if (this->CanMoveCaretRight()) {
-				this->MoveCaretRight();
-				return true;
-			}
-			break;
-
 		case WKC_CTRL | WKC_RIGHT: {
-			if (!this->CanMoveCaretRight()) break;
+			if (this->caretpos >= this->bytes - 1) break;
 
-			/* Unconditionally move one char to the right. */
-			WChar c = this->MoveCaretRight();
-			/* Continue to consume current word. */
-			while (!IsWhitespace(c)) {
-				if (!this->CanMoveCaretRight()) return true;
-				c = this->MoveCaretRight();
-			}
-			/* Consume right whitespaces. */
-			while (IsWhitespace(c)) {
-				if (!this->CanMoveCaretRight()) return true;
-				c = this->MoveCaretRight();
-			}
+			size_t pos = this->char_iter->Next(keycode & WKC_CTRL ? StringIterator::ITER_WORD : StringIterator::ITER_CHARACTER);
+			if (pos == StringIterator::END) return true;
+
+			this->caretpos = (uint16)pos;
+			this->UpdateCaretPosition();
 			return true;
 		}
 
 		case WKC_HOME:
 			this->caretpos = 0;
+			this->char_iter->SetCurPosition(this->caretpos);
 			this->UpdateCaretPosition();
 			return true;
 
 		case WKC_END:
 			this->caretpos = this->bytes - 1;
+			this->char_iter->SetCurPosition(this->caretpos);
 			this->UpdateCaretPosition();
 			return true;
 
--- a/src/textbuf_type.h
+++ b/src/textbuf_type.h
@@ -67,10 +67,6 @@
 	bool CanDelChar(bool backspace);
 	WChar GetNextDelChar(bool backspace);
 	void DelChar(bool backspace);
-	bool CanMoveCaretLeft();
-	WChar MoveCaretLeft();
-	bool CanMoveCaretRight();
-	WChar MoveCaretRight();
 
 	void UpdateStringIter();
 	void UpdateWidth();