From bc8764eaa72bbaed3b097e766671757c01c66cdb Mon Sep 17 00:00:00 2001 From: Adrian Pemsel Date: Thu, 3 Mar 2016 10:35:55 +0100 Subject: [PATCH] test and fix for a bug in TokenizedAttributedString --- src/TokenizedAttributedString.php | 10 +++++----- test/unit/TokenizedAttributedStringTest.php | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/TokenizedAttributedString.php b/src/TokenizedAttributedString.php index 1576bde..0eacc70 100644 --- a/src/TokenizedAttributedString.php +++ b/src/TokenizedAttributedString.php @@ -90,7 +90,7 @@ public function getTokenOffset($i) { public function setTokenAttribute($i, $attribute, $state = true) { $token = $this->tokens[$i]; $offset = $this->tokenOffsets[$i]; - $length = strlen($token); + $length = mb_strlen($token, "utf-8"); return $this->setLength($offset, $length, $attribute, $state); } @@ -105,7 +105,7 @@ public function setTokenAttribute($i, $attribute, $state = true) { */ public function setTokenRangeAttribute($from, $to, $attribute, $state = true) { $fromOffset = $this->tokenOffsets[$from]; - $toOffset = $this->tokenOffsets[$to] + strlen($this->tokens[$to]); + $toOffset = $this->tokenOffsets[$to] + mb_strlen($this->tokens[$to], "utf-8"); return $this->setRange($fromOffset, $toOffset, $attribute, $state); } @@ -148,7 +148,7 @@ public function lowercaseTokens() { * Tokenize a string on whitespace * * @param string $string string to be tokenized - * @return string[] tokens + * @return array array of two arrays, with tokens at index 0 and their offsets at index 1 */ public static function tokenizeOnWhitespace($string) { // Matches pontential whitespace in front of the token and the token itself. @@ -160,7 +160,7 @@ public static function tokenizeOnWhitespace($string) { * Tokenize a string on words * * @param string $string string to be tokenized - * @return string[] tokens + * @return array array of two arrays, with tokens at index 0 and their offsets at index 1 */ public static function tokenizeOnWords($string) { return self::tokenizeOnRegex($string, '/([\w]+)/u'); @@ -171,7 +171,7 @@ public static function tokenizeOnWords($string) { * * @param string $string string to be tokenized * @param string $pattern regex. The token must be captured in the first subgroup. - * @return string[] tokens + * @return array array of two arrays, with tokens at index 0 and their offsets at index 1 */ public static function tokenizeOnRegex($string, $pattern) { diff --git a/test/unit/TokenizedAttributedStringTest.php b/test/unit/TokenizedAttributedStringTest.php index 66c62ad..c79a6c3 100644 --- a/test/unit/TokenizedAttributedStringTest.php +++ b/test/unit/TokenizedAttributedStringTest.php @@ -39,7 +39,7 @@ public function testGetTokenCount() { } public function testSetTokenAttribute() { - $as = new TokenizedAttributedString("foo bar baz"); + $as = new TokenizedAttributedString("foo bär baz"); $as->setTokenAttribute(1, "bold"); $this->assertEquals(true, $as->is("bold", 5)); $this->assertEquals(false, $as->is("bold", 3));