From bc8764eaa72bbaed3b097e766671757c01c66cdb Mon Sep 17 00:00:00 2001
From: Adrian Pemsel <apemsel@gmail.com>
Date: Thu, 3 Mar 2016 10:35:55 +0100
Subject: [PATCH] fix token length for multibyte (UTF-8) tokens in TokenizedAttributedString, with test

---
 src/TokenizedAttributedString.php           | 10 +++++-----
 test/unit/TokenizedAttributedStringTest.php |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/TokenizedAttributedString.php b/src/TokenizedAttributedString.php
index 1576bde..0eacc70 100644
--- a/src/TokenizedAttributedString.php
+++ b/src/TokenizedAttributedString.php
@@ -90,7 +90,7 @@ public function getTokenOffset($i) {
   public function setTokenAttribute($i, $attribute, $state = true) {
     $token = $this->tokens[$i];
     $offset = $this->tokenOffsets[$i];
-    $length = strlen($token);
+    $length = mb_strlen($token, "utf-8");
     
     return $this->setLength($offset, $length, $attribute, $state);
   }
@@ -105,7 +105,7 @@ public function setTokenAttribute($i, $attribute, $state = true) {
    */
   public function setTokenRangeAttribute($from, $to, $attribute, $state = true) {
     $fromOffset = $this->tokenOffsets[$from];
-    $toOffset = $this->tokenOffsets[$to] + strlen($this->tokens[$to]);
+    $toOffset = $this->tokenOffsets[$to] + mb_strlen($this->tokens[$to], "utf-8");
     
     return $this->setRange($fromOffset, $toOffset, $attribute, $state);
   }
@@ -148,7 +148,7 @@ public function lowercaseTokens() {
    * Tokenize a string on whitespace
    *
    * @param string $string string to be tokenized
-   * @return string[] tokens
+   * @return array array of two arrays, with tokens at index 0 and their offsets at index 1
    */
   public static function tokenizeOnWhitespace($string) {
     // Matches pontential whitespace in front of the token and the token itself.
@@ -160,7 +160,7 @@ public static function tokenizeOnWhitespace($string) {
    * Tokenize a string on words
    *
    * @param string $string string to be tokenized
-   * @return string[] tokens
+   * @return array array of two arrays, with tokens at index 0 and their offsets at index 1
    */
   public static function tokenizeOnWords($string) {
     return self::tokenizeOnRegex($string, '/([\w]+)/u');
@@ -171,7 +171,7 @@ public static function tokenizeOnWords($string) {
    *
    * @param string $string string to be tokenized
    * @param string $pattern regex. The token must be captured in the first subgroup.
-   * @return string[] tokens
+   * @return array array of two arrays, with tokens at index 0 and their offsets at index 1
    */
   public static function tokenizeOnRegex($string, $pattern)
   {
diff --git a/test/unit/TokenizedAttributedStringTest.php b/test/unit/TokenizedAttributedStringTest.php
index 66c62ad..c79a6c3 100644
--- a/test/unit/TokenizedAttributedStringTest.php
+++ b/test/unit/TokenizedAttributedStringTest.php
@@ -39,7 +39,7 @@ public function testGetTokenCount() {
   }
   
   public function testSetTokenAttribute() {
-    $as = new TokenizedAttributedString("foo bar baz");
+    $as = new TokenizedAttributedString("foo bär baz");
     $as->setTokenAttribute(1, "bold");
     $this->assertEquals(true, $as->is("bold", 5));
     $this->assertEquals(false, $as->is("bold", 3));