diff --git a/src/AttributedString.php b/src/AttributedString.php index 33b8200..7f0e2a8 100644 --- a/src/AttributedString.php +++ b/src/AttributedString.php @@ -401,7 +401,7 @@ public function attributeToString($attribute, $true = "-", $false = " ") { * * May improve performance if setPattern is used extensively */ - public function enablebyteToCharCache() { + public function enableByteToCharCache() { $this->byteToChar = []; $char = 0; for ($i = 0; $i < strlen($this->string); ) { diff --git a/src/TokenizedAttributedString.php b/src/TokenizedAttributedString.php index 9301a86..773d5df 100644 --- a/src/TokenizedAttributedString.php +++ b/src/TokenizedAttributedString.php @@ -19,6 +19,8 @@ class TokenizedAttributedString extends AttributedString * @param string $tokenizer Tokenizer to use, either "whitespace", "word" or a custom regex */ public function __construct($string, $tokenizer = "whitespace") { + parent::__construct($string); + $tokenizerFunction = "tokenizeOn".ucfirst($tokenizer); if ($tokenizer[0] == "/") { @@ -30,7 +32,11 @@ public function __construct($string, $tokenizer = "whitespace") { list($this->tokens, $this->tokenOffsets) = self::$tokenizerFunction($string); } - parent::__construct($string); + // convert byte to char offsets + $this->enableByteToCharCache(); + $this->tokenOffsets = array_map(function($o) { + return $this->byteToCharOffset($o); + }, $this->tokenOffsets); } /** @@ -148,7 +154,7 @@ public function lowercaseTokens() { * Tokenize a string on whitespace * * @param string $string string to be tokenized - * @return array array of two arrays, with tokens at index 0 and their offsets at index 1 + * @return array array of two arrays, with tokens at index 0 and their byte offsets at index 1 */ public static function tokenizeOnWhitespace($string) { // Matches pontential whitespace in front of the token and the token itself. @@ -160,7 +166,7 @@ public static function tokenizeOnWhitespace($string) { * Tokenize a string on words * * @param string $string string to be tokenized - * @return array array of two arrays, with tokens at index 0 and their offsets at index 1 + * @return array array of two arrays, with tokens at index 0 and their byte offsets at index 1 */ public static function tokenizeOnWords($string) { return self::tokenizeOnRegex($string, '/([\p{L}\p{S}\p{N}]+)/u'); @@ -171,7 +177,7 @@ public static function tokenizeOnWords($string) { * * @param string $string string to be tokenized * @param string $pattern regex. The token must be captured in the first subgroup. - * @return array array of two arrays, with tokens at index 0 and their offsets at index 1 + * @return array array of two arrays, with tokens at index 0 and their byte offsets at index 1 */ public static function tokenizeOnRegex($string, $pattern) { diff --git a/test/unit/TokenizedAttributedStringTest.php b/test/unit/TokenizedAttributedStringTest.php index c195e16..dc69546 100644 --- a/test/unit/TokenizedAttributedStringTest.php +++ b/test/unit/TokenizedAttributedStringTest.php @@ -31,6 +31,10 @@ public function testGetToken() { public function testGetTokenOffset() { $as = new TokenizedAttributedString(" one two\nthree\rfour\n\r five "); $this->assertEquals(9, $as->getTokenOffset(2)); + + $as = new TokenizedAttributedString("ä ö ü"); + $this->assertEquals(2, $as->getTokenOffset(1)); + } public function testGetTokenCount() {