Skip to content

Commit

Permalink
implementation for TokenizedAttributedString getTokenCount() and make…
Browse files Browse the repository at this point in the history
… tokenizers public
  • Loading branch information
apemsel committed Mar 3, 2016
1 parent 491d35a commit 7e7a771
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 8 deletions.
2 changes: 1 addition & 1 deletion src/AttributedString.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

/**
* Basic class to work with attributed strings.
*
*
* Attributed strings are strings that can have multiple attributes per character of the string
*
* @author Adrian Pemsel <[email protected]>
Expand Down
1 change: 0 additions & 1 deletion src/MutableAttributedString.php
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
<?php

namespace apemsel\AttributedString;

/**
Expand Down
39 changes: 33 additions & 6 deletions src/TokenizedAttributedString.php
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
<?php

namespace apemsel\AttributedString;

/**
Expand Down Expand Up @@ -46,11 +45,20 @@ public function getTokens() {
/**
* Return all tokens' offsets
*
* @return in[] offsets
* @return int[] offsets
*/
public function getTokenOffsets()
{
    // Expose the stored token offsets unchanged.
    $offsets = $this->tokenOffsets;

    return $offsets;
}

/**
* Return the number of tokens
*
* @return int count
*/
public function getTokenCount()
{
    // The token count is simply the size of the stored token list.
    $tokens = $this->tokens;

    return count($tokens);
}

/**
* Get indicated token
Expand Down Expand Up @@ -127,7 +135,7 @@ public function attributesAtToken($i) {
return $this->attributesAt($this->tokenOffsets[$i]);
}

/*
/**
* Convert all tokens to lower case
*/
public function lowercaseTokens() {
Expand All @@ -136,17 +144,36 @@ public function lowercaseTokens() {
}, $this->tokens);
}

protected static function tokenizeOnWhitespace($string) {
/**
* Tokenize a string on whitespace
*
* @param string $string string to be tokenized
* @return string[] tokens
*/
public static function tokenizeOnWhitespace($string)
{
    // The pattern consumes potential whitespace in front of a token and then
    // captures the token itself (a run of non-whitespace) in the first subgroup.
    // Consuming the leading whitespace is optional for correctness; per the
    // original author's note, leaving it out results in slower execution.
    $pattern = '/[\s\n\r]*([^\s\n\r]+)/u';

    return self::tokenizeOnRegex($string, $pattern);
}

protected static function tokenizeOnWords($string) {
/**
* Tokenize a string on words
*
* @param string $string string to be tokenized
* @return string[] tokens
*/
public static function tokenizeOnWords($string)
{
    // Each token is a run of one or more word characters, captured in the
    // first subgroup as tokenizeOnRegex() is handed the token there.
    $wordPattern = '/([\w]+)/u';

    return self::tokenizeOnRegex($string, $wordPattern);
}

protected static function tokenizeOnRegex($string, $pattern)
/**
* Tokenize a string with a given regex
*
* @param string $string string to be tokenized
* @param string $pattern regex. The token must be captured in the first subgroup.
* @return string[] tokens
*/
public static function tokenizeOnRegex($string, $pattern)
{
// Fastest way to get both tokens and their offsets, but not easy to understand.
preg_match_all($pattern, $string, $matches, PREG_OFFSET_CAPTURE);
Expand Down

0 comments on commit 7e7a771

Please sign in to comment.