diff --git a/doc/class-apemsel.AttributedString.AttributedString.html b/doc/class-apemsel.AttributedString.AttributedString.html
index 97bc5ac..f7c9716 100644
--- a/doc/class-apemsel.AttributedString.AttributedString.html
+++ b/doc/class-apemsel.AttributedString.AttributedString.html
@@ -807,7 +807,7 @@
public
@@ -817,8 +817,8 @@ Parameters
|
- #
- enablebyteToCharCache( )
+ #
+ enableByteToCharCache( )
Enable and fill cache for byte to char offset conversion
diff --git a/doc/class-apemsel.AttributedString.MutableAttributedString.html b/doc/class-apemsel.AttributedString.MutableAttributedString.html
index c069729..3541b37 100644
--- a/doc/class-apemsel.AttributedString.MutableAttributedString.html
+++ b/doc/class-apemsel.AttributedString.MutableAttributedString.html
@@ -354,7 +354,7 @@ Overrides
count() ,
createAttribute() ,
deleteAttribute() ,
- enablebyteToCharCache() ,
+ enableByteToCharCache() ,
filter() ,
hasAttribute() ,
is() ,
diff --git a/doc/class-apemsel.AttributedString.TokenizedAttributedString.html b/doc/class-apemsel.AttributedString.TokenizedAttributedString.html
index da7efb1..982d363 100644
--- a/doc/class-apemsel.AttributedString.TokenizedAttributedString.html
+++ b/doc/class-apemsel.AttributedString.TokenizedAttributedString.html
@@ -127,7 +127,7 @@ Class TokenizedAttributedString
Author:
Adrian Pemsel apemsel@gmail.com
- Located at TokenizedAttributedString.php
+ Located at TokenizedAttributedString.php
@@ -147,7 +147,7 @@ Class TokenizedAttributedString
#
- __construct( string|apemsel\AttributedString\AttributedString $string, string $tokenizer = "whitespace" )
+ __construct( string|apemsel\AttributedString\AttributedString $string, string $tokenizer = "whitespace" )
@@ -185,7 +185,7 @@ Overrides
#
- getTokens( )
+ getTokens( )
Return all tokens
@@ -218,7 +218,7 @@ Returns
#
- getTokenOffsets( )
+ getTokenOffsets( )
Return all tokens' offsets
@@ -251,7 +251,7 @@ Returns
#
- getTokenCount( )
+ getTokenCount( )
Return the number of tokens
@@ -284,7 +284,7 @@ Returns
#
- getToken( integer $i )
+ getToken( integer $i )
Get indicated token
@@ -322,7 +322,7 @@ Returns
#
- getTokenOffset( integer $i )
+ getTokenOffset( integer $i )
Get indicated token offset
@@ -360,7 +360,7 @@ Returns
#
- setTokenAttribute( integer $i, string $attribute, boolean $state = true )
+ setTokenAttribute( integer $i, string $attribute, boolean $state = true )
Set a token to a given attribute and state
@@ -398,7 +398,7 @@ Parameters
#
- setTokenRangeAttribute( integer $from, integer $to, string $attribute, boolean $state = true )
+ setTokenRangeAttribute( integer $from, integer $to, string $attribute, boolean $state = true )
Set a range of tokens to a given attribute and state
@@ -438,7 +438,7 @@ Parameters
#
- setTokenDictionaryAttribute( string[] $dictionary, string $attribute, boolean $state = true )
+ setTokenDictionaryAttribute( string[] $dictionary, string $attribute, boolean $state = true )
Set all tokens matching given dictionary to attribute and state
@@ -476,7 +476,7 @@ Parameters
#
- attributesAtToken( integer $i )
+ attributesAtToken( integer $i )
Get all attribute of token at given index
@@ -514,7 +514,7 @@ Returns
#
- lowercaseTokens( )
+ lowercaseTokens( )
Convert all tokens to lower case
@@ -543,7 +543,7 @@ Returns
#
- tokenizeOnWhitespace( string $string )
+ tokenizeOnWhitespace( string $string )
Tokenize a string on whitespace
@@ -561,7 +561,7 @@ Parameters
Returns
- array array of two arrays, with tokens at index 0 and their offsets at index 1
+ array array of two arrays, with tokens at index 0 and their byte offsets at index 1
@@ -581,7 +581,7 @@ Returns
#
- tokenizeOnWords( string $string )
+ tokenizeOnWords( string $string )
Tokenize a string on words
@@ -599,7 +599,7 @@ Parameters
Returns
- array array of two arrays, with tokens at index 0 and their offsets at index 1
+ array array of two arrays, with tokens at index 0 and their byte offsets at index 1
@@ -619,7 +619,7 @@ Returns
#
- tokenizeOnRegex( string $string, string $pattern )
+ tokenizeOnRegex( string $string, string $pattern )
Tokenize a string with a given regex
@@ -639,7 +639,7 @@ Parameters
Returns
- array array of two arrays, with tokens at index 0 and their offsets at index 1
+ array array of two arrays, with tokens at index 0 and their byte offsets at index 1
@@ -659,7 +659,7 @@ Returns
#
- offsetExists( integer $i )
+ offsetExists( integer $i )
Check if the token at the given index exists
@@ -699,7 +699,7 @@ Overrides
#
- offsetGet( integer $i )
+ offsetGet( integer $i )
Get token at given index
@@ -746,7 +746,7 @@ Overrides
count() ,
createAttribute() ,
deleteAttribute() ,
- enablebyteToCharCache() ,
+ enableByteToCharCache() ,
filter() ,
hasAttribute() ,
is() ,
diff --git a/doc/source-class-apemsel.AttributedString.AttributedString.html b/doc/source-class-apemsel.AttributedString.AttributedString.html
index 700fb82..09c5eb2 100644
--- a/doc/source-class-apemsel.AttributedString.AttributedString.html
+++ b/doc/source-class-apemsel.AttributedString.AttributedString.html
@@ -485,7 +485,7 @@ Classes
401 402 403
-404 public function enablebyteToCharCache() {
+404 public function enableByteToCharCache() {
405 $this->byteToChar = [];
406 $char = 0;
407 for ($i = 0; $i < strlen($this->string); ) {
diff --git a/doc/source-class-apemsel.AttributedString.TokenizedAttributedString.html b/doc/source-class-apemsel.AttributedString.TokenizedAttributedString.html
index 2bd57be..e170a29 100644
--- a/doc/source-class-apemsel.AttributedString.TokenizedAttributedString.html
+++ b/doc/source-class-apemsel.AttributedString.TokenizedAttributedString.html
@@ -103,198 +103,204 @@ Classes
19 20
21 public function __construct($string, $tokenizer = "whitespace") {
- 22 $tokenizerFunction = "tokenizeOn".ucfirst($tokenizer);
- 23
- 24 if ($tokenizer[0] == "/") {
- 25 list($this->tokens, $this->tokenOffsets) = self::tokenizeOnRegex($string, $tokenizer);
- 26 } else {
- 27 if (!method_exists("apemsel\AttributedString\TokenizedAttributedString", $tokenizerFunction)) {
- 28 throw new \InvalidArgumentException("Unknown tokenizer $tokenizer");
- 29 }
- 30 list($this->tokens, $this->tokenOffsets) = self::$tokenizerFunction($string);
- 31 }
- 32
- 33 parent::__construct($string);
- 34 }
- 35
- 36 37 38 39 40
- 41 public function getTokens() {
- 42 return $this->tokens;
- 43 }
- 44
- 45 46 47 48 49
- 50 public function getTokenOffsets() {
- 51 return $this->tokenOffsets;
- 52 }
- 53
- 54 55 56 57 58
- 59 public function getTokenCount() {
- 60 return count($this->tokens);
- 61 }
- 62
- 63 64 65 66 67 68
- 69 public function getToken($i) {
- 70 return $this->tokens[$i];
- 71 }
- 72
- 73 74 75 76 77 78
- 79 public function getTokenOffset($i) {
- 80 return $this->tokenOffsets[$i];
- 81 }
- 82
- 83 84 85 86 87 88 89
- 90 public function setTokenAttribute($i, $attribute, $state = true) {
- 91 $token = $this->tokens[$i];
- 92 $offset = $this->tokenOffsets[$i];
- 93 $length = mb_strlen($token, "utf-8");
- 94
- 95 return $this->setLength($offset, $length, $attribute, $state);
- 96 }
- 97
- 98 99 100 101 102 103 104 105
-106 public function setTokenRangeAttribute($from, $to, $attribute, $state = true) {
-107 $fromOffset = $this->tokenOffsets[$from];
-108 $toOffset = $this->tokenOffsets[$to] + mb_strlen($this->tokens[$to], "utf-8") - 1;
-109
-110 return $this->setRange($fromOffset, $toOffset, $attribute, $state);
-111 }
-112
-113 114 115 116 117 118 119
-120 public function setTokenDictionaryAttribute($dictionary, $attribute, $state = true) {
-121 foreach($this->tokens as $i => $token) {
-122 if (in_array($token, $dictionary)) {
-123 $this->setTokenAttribute($i, $attribute, $state);
-124 }
-125 }
-126 }
-127
-128 129 130 131 132 133
-134 public function attributesAtToken($i) {
-135 return $this->attributesAt($this->tokenOffsets[$i]);
-136 }
-137
-138 139 140
-141 public function lowercaseTokens() {
-142 $this->tokens = array_map(function($token) {
-143 return mb_strtolower($token, "utf-8");
-144 }, $this->tokens);
-145 }
-146
-147 148 149 150 151 152
-153 public static function tokenizeOnWhitespace($string) {
-154
-155
-156 return self::tokenizeOnRegex($string, '/[\s\n\r]*([^\s\n\r]+)/u');
-157 }
-158
-159 160 161 162 163 164
-165 public static function tokenizeOnWords($string) {
-166 return self::tokenizeOnRegex($string, '/([\w]+)/u');
-167 }
-168
-169 170 171 172 173 174 175
-176 public static function tokenizeOnRegex($string, $pattern)
-177 {
-178
-179 preg_match_all($pattern, $string, $matches, PREG_OFFSET_CAPTURE);
-180
-181
-182
-183 $tokens = array_column($matches[1], 0);
-184 $tokenOffsets = array_column($matches[1], 1);
-185
-186 return [$tokens, $tokenOffsets];
-187 }
-188
-189
-190
-191 192 193 194 195 196
-197 public function offsetExists($i) {
-198 return $i < $this->getTokenCount();
-199 }
-200
-201 202 203 204 205 206 207 208
-209 public function offsetGet($i) {
-210 return $this->tokens[$i];
-211 }
-212 }
-213
+ 22 parent::__construct($string);
+ 23
+ 24 $tokenizerFunction = "tokenizeOn".ucfirst($tokenizer);
+ 25
+ 26 if ($tokenizer[0] == "/") {
+ 27 list($this->tokens, $this->tokenOffsets) = self::tokenizeOnRegex($string, $tokenizer);
+ 28 } else {
+ 29 if (!method_exists("apemsel\AttributedString\TokenizedAttributedString", $tokenizerFunction)) {
+ 30 throw new \InvalidArgumentException("Unknown tokenizer $tokenizer");
+ 31 }
+ 32 list($this->tokens, $this->tokenOffsets) = self::$tokenizerFunction($string);
+ 33 }
+ 34
+ 35
+ 36 $this->enableByteToCharCache();
+ 37 $this->tokenOffsets = array_map(function($o) {
+ 38 return $this->byteToCharOffset($o);
+ 39 }, $this->tokenOffsets);
+ 40 }
+ 41
+ 42 43 44 45 46
+ 47 public function getTokens() {
+ 48 return $this->tokens;
+ 49 }
+ 50
+ 51 52 53 54 55
+ 56 public function getTokenOffsets() {
+ 57 return $this->tokenOffsets;
+ 58 }
+ 59
+ 60 61 62 63 64
+ 65 public function getTokenCount() {
+ 66 return count($this->tokens);
+ 67 }
+ 68
+ 69 70 71 72 73 74
+ 75 public function getToken($i) {
+ 76 return $this->tokens[$i];
+ 77 }
+ 78
+ 79 80 81 82 83 84
+ 85 public function getTokenOffset($i) {
+ 86 return $this->tokenOffsets[$i];
+ 87 }
+ 88
+ 89 90 91 92 93 94 95
+ 96 public function setTokenAttribute($i, $attribute, $state = true) {
+ 97 $token = $this->tokens[$i];
+ 98 $offset = $this->tokenOffsets[$i];
+ 99 $length = mb_strlen($token, "utf-8");
+100
+101 return $this->setLength($offset, $length, $attribute, $state);
+102 }
+103
+104 105 106 107 108 109 110 111
+112 public function setTokenRangeAttribute($from, $to, $attribute, $state = true) {
+113 $fromOffset = $this->tokenOffsets[$from];
+114 $toOffset = $this->tokenOffsets[$to] + mb_strlen($this->tokens[$to], "utf-8") - 1;
+115
+116 return $this->setRange($fromOffset, $toOffset, $attribute, $state);
+117 }
+118
+119 120 121 122 123 124 125
+126 public function setTokenDictionaryAttribute($dictionary, $attribute, $state = true) {
+127 foreach($this->tokens as $i => $token) {
+128 if (in_array($token, $dictionary)) {
+129 $this->setTokenAttribute($i, $attribute, $state);
+130 }
+131 }
+132 }
+133
+134 135 136 137 138 139
+140 public function attributesAtToken($i) {
+141 return $this->attributesAt($this->tokenOffsets[$i]);
+142 }
+143
+144 145 146
+147 public function lowercaseTokens() {
+148 $this->tokens = array_map(function($token) {
+149 return mb_strtolower($token, "utf-8");
+150 }, $this->tokens);
+151 }
+152
+153 154 155 156 157 158
+159 public static function tokenizeOnWhitespace($string) {
+160
+161
+162 return self::tokenizeOnRegex($string, '/[\s\n\r]*([^\s\n\r]+)/u');
+163 }
+164
+165 166 167 168 169 170
+171 public static function tokenizeOnWords($string) {
+172 return self::tokenizeOnRegex($string, '/([\p{L}\p{S}\p{N}]+)/u');
+173 }
+174
+175 176 177 178 179 180 181
+182 public static function tokenizeOnRegex($string, $pattern)
+183 {
+184
+185 preg_match_all($pattern, $string, $matches, PREG_OFFSET_CAPTURE);
+186
+187
+188
+189 $tokens = array_column($matches[1], 0);
+190 $tokenOffsets = array_column($matches[1], 1);
+191
+192 return [$tokens, $tokenOffsets];
+193 }
+194
+195
+196
+197 198 199 200 201 202
+203 public function offsetExists($i) {
+204 return $i < $this->getTokenCount();
+205 }
+206
+207 208 209 210 211 212 213 214
+215 public function offsetGet($i) {
+216 return $this->tokens[$i];
+217 }
+218 }
+219
| | | | | | | | | | | | | | | | |