fix bug in token offsets

apemsel · Mar 10, 2016 · 4d8ce1f · 4d8ce1f
1 parent 88bec4e
commit 4d8ce1f
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 5 deletions.
diff --git a/src/AttributedString.php b/src/AttributedString.php
@@ -401,7 +401,7 @@ public function attributeToString($attribute, $true = "-", $false = " ") {
    *
    * May improve performance if setPattern is used extensively
    */
-  public function enablebyteToCharCache() {
+  public function enableByteToCharCache() {
     $this->byteToChar = [];
     $char = 0;
     for ($i = 0; $i < strlen($this->string); ) {

diff --git a/src/TokenizedAttributedString.php b/src/TokenizedAttributedString.php
@@ -19,6 +19,8 @@ class TokenizedAttributedString extends AttributedString
    * @param string $tokenizer Tokenizer to use, either "whitespace", "word" or a custom regex
    */
   public function __construct($string, $tokenizer = "whitespace") {
+    parent::__construct($string);
+
     $tokenizerFunction = "tokenizeOn".ucfirst($tokenizer);
 
     if ($tokenizer[0] == "/") {
@@ -30,7 +32,11 @@ public function __construct($string, $tokenizer = "whitespace") {
       list($this->tokens, $this->tokenOffsets) = self::$tokenizerFunction($string);
     }
 
-    parent::__construct($string);
+    // convert byte to char offsets
+    $this->enableByteToCharCache();
+    $this->tokenOffsets = array_map(function($o) {
+      return $this->byteToCharOffset($o);
+    }, $this->tokenOffsets);
   }
 
   /**
@@ -148,7 +154,7 @@ public function lowercaseTokens() {
    * Tokenize a string on whitespace
    *
    * @param string $string string to be tokenized
-   * @return array array of two arrays, with tokens at index 0 and their offsets at index 1
+   * @return array array of two arrays, with tokens at index 0 and their byte offsets at index 1
    */
   public static function tokenizeOnWhitespace($string) {
     // Matches potential whitespace in front of the token and the token itself.
@@ -160,7 +166,7 @@ public static function tokenizeOnWhitespace($string) {
    * Tokenize a string on words
    *
    * @param string $string string to be tokenized
-   * @return array array of two arrays, with tokens at index 0 and their offsets at index 1
+   * @return array array of two arrays, with tokens at index 0 and their byte offsets at index 1
    */
   public static function tokenizeOnWords($string) {
     return self::tokenizeOnRegex($string, '/([\p{L}\p{S}\p{N}]+)/u');
@@ -171,7 +177,7 @@ public static function tokenizeOnWords($string) {
    *
    * @param string $string string to be tokenized
    * @param string $pattern regex. The token must be captured in the first subgroup.
-   * @return array array of two arrays, with tokens at index 0 and their offsets at index 1
+   * @return array array of two arrays, with tokens at index 0 and their byte offsets at index 1
    */
   public static function tokenizeOnRegex($string, $pattern)
   {

diff --git a/test/unit/TokenizedAttributedStringTest.php b/test/unit/TokenizedAttributedStringTest.php
@@ -31,6 +31,10 @@ public function testGetToken() {
   public function testGetTokenOffset() {
     $as = new TokenizedAttributedString(" one two\nthree\rfour\n\r five  ");
     $this->assertEquals(9, $as->getTokenOffset(2));
+
+    $as = new TokenizedAttributedString("ä ö ü");
+    $this->assertEquals(2, $as->getTokenOffset(1));
+
   }
 
   public function testGetTokenCount() {