Skip to content

Commit

Permalink
fix bug in token offsets
Browse files Browse the repository at this point in the history
  • Loading branch information
apemsel committed Mar 10, 2016
1 parent 88bec4e commit 4d8ce1f
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 5 deletions.
2 changes: 1 addition & 1 deletion src/AttributedString.php
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ public function attributeToString($attribute, $true = "-", $false = " ") {
*
* May improve performance if setPattern is used extensively
*/
public function enablebyteToCharCache() {
public function enableByteToCharCache() {
$this->byteToChar = [];
$char = 0;
for ($i = 0; $i < strlen($this->string); ) {
Expand Down
14 changes: 10 additions & 4 deletions src/TokenizedAttributedString.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ class TokenizedAttributedString extends AttributedString
* @param string $tokenizer Tokenizer to use, either "whitespace", "word" or a custom regex
*/
public function __construct($string, $tokenizer = "whitespace") {
parent::__construct($string);

$tokenizerFunction = "tokenizeOn".ucfirst($tokenizer);

if ($tokenizer[0] == "/") {
Expand All @@ -30,7 +32,11 @@ public function __construct($string, $tokenizer = "whitespace") {
list($this->tokens, $this->tokenOffsets) = self::$tokenizerFunction($string);
}

parent::__construct($string);
// convert byte to char offsets
$this->enableByteToCharCache();
$this->tokenOffsets = array_map(function($o) {
return $this->byteToCharOffset($o);
}, $this->tokenOffsets);
}

/**
Expand Down Expand Up @@ -148,7 +154,7 @@ public function lowercaseTokens() {
* Tokenize a string on whitespace
*
* @param string $string string to be tokenized
* @return array array of two arrays, with tokens at index 0 and their offsets at index 1
* @return array array of two arrays, with tokens at index 0 and their byte offsets at index 1
*/
public static function tokenizeOnWhitespace($string) {
// Matches potential whitespace in front of the token and the token itself.
Expand All @@ -160,7 +166,7 @@ public static function tokenizeOnWhitespace($string) {
* Tokenize a string on words
*
* @param string $string string to be tokenized
* @return array array of two arrays, with tokens at index 0 and their offsets at index 1
* @return array array of two arrays, with tokens at index 0 and their byte offsets at index 1
*/
public static function tokenizeOnWords($string) {
return self::tokenizeOnRegex($string, '/([\p{L}\p{S}\p{N}]+)/u');
Expand All @@ -171,7 +177,7 @@ public static function tokenizeOnWords($string) {
*
* @param string $string string to be tokenized
* @param string $pattern regex. The token must be captured in the first subgroup.
* @return array array of two arrays, with tokens at index 0 and their offsets at index 1
* @return array array of two arrays, with tokens at index 0 and their byte offsets at index 1
*/
public static function tokenizeOnRegex($string, $pattern)
{
Expand Down
4 changes: 4 additions & 0 deletions test/unit/TokenizedAttributedStringTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ public function testGetToken() {
/**
 * Checks the offsets reported by getTokenOffset() for the default tokenizer.
 */
public function testGetTokenOffset() {
    // ASCII input with mixed whitespace: the token at index 2 ("three")
    // starts at position 9 of the string.
    $asciiTokens = new TokenizedAttributedString(" one two\nthree\rfour\n\r five ");
    $this->assertEquals(9, $asciiTokens->getTokenOffset(2));

    // Multibyte input: the token at index 1 ("ö") is expected at offset 2,
    // i.e. counted in characters. Assuming UTF-8 encoding, its byte offset
    // would be 3, so this guards the byte-to-char offset conversion.
    $multibyteTokens = new TokenizedAttributedString("ä ö ü");
    $this->assertEquals(2, $multibyteTokens->getTokenOffset(1));
}

public function testGetTokenCount() {
Expand Down

0 comments on commit 4d8ce1f

Please sign in to comment.