diff --git a/doc/class-apemsel.AttributedString.AttributedString.html b/doc/class-apemsel.AttributedString.AttributedString.html index 97bc5ac..f7c9716 100644 --- a/doc/class-apemsel.AttributedString.AttributedString.html +++ b/doc/class-apemsel.AttributedString.AttributedString.html @@ -807,7 +807,7 @@

Parameters

- + public @@ -817,8 +817,8 @@

Parameters

- # - enablebyteToCharCache( ) + # + enableByteToCharCache( )

Enable and fill cache for byte to char offset conversion

diff --git a/doc/class-apemsel.AttributedString.MutableAttributedString.html b/doc/class-apemsel.AttributedString.MutableAttributedString.html index c069729..3541b37 100644 --- a/doc/class-apemsel.AttributedString.MutableAttributedString.html +++ b/doc/class-apemsel.AttributedString.MutableAttributedString.html @@ -354,7 +354,7 @@

Overrides

count(), createAttribute(), deleteAttribute(), - enablebyteToCharCache(), + enableByteToCharCache(), filter(), hasAttribute(), is(), diff --git a/doc/class-apemsel.AttributedString.TokenizedAttributedString.html b/doc/class-apemsel.AttributedString.TokenizedAttributedString.html index da7efb1..982d363 100644 --- a/doc/class-apemsel.AttributedString.TokenizedAttributedString.html +++ b/doc/class-apemsel.AttributedString.TokenizedAttributedString.html @@ -127,7 +127,7 @@

Class TokenizedAttributedString

Author: Adrian Pemsel apemsel@gmail.com
- Located at TokenizedAttributedString.php + Located at TokenizedAttributedString.php
@@ -147,7 +147,7 @@

Class TokenizedAttributedString

# - __construct( string|apemsel\AttributedString\AttributedString $string, string $tokenizer = "whitespace" ) + __construct( string|apemsel\AttributedString\AttributedString $string, string $tokenizer = "whitespace" )
@@ -185,7 +185,7 @@

Overrides

# - getTokens( ) + getTokens( )

Return all tokens

@@ -218,7 +218,7 @@

Returns

# - getTokenOffsets( ) + getTokenOffsets( )

Return all tokens' offsets

@@ -251,7 +251,7 @@

Returns

# - getTokenCount( ) + getTokenCount( )

Return the number of tokens

@@ -284,7 +284,7 @@

Returns

# - getToken( integer $i ) + getToken( integer $i )

Get indicated token

@@ -322,7 +322,7 @@

Returns

# - getTokenOffset( integer $i ) + getTokenOffset( integer $i )

Get indicated token offset

@@ -360,7 +360,7 @@

Returns

# - setTokenAttribute( integer $i, string $attribute, boolean $state = true ) + setTokenAttribute( integer $i, string $attribute, boolean $state = true )

Set a token to a given attribute and state

@@ -398,7 +398,7 @@

Parameters

# - setTokenRangeAttribute( integer $from, integer $to, string $attribute, boolean $state = true ) + setTokenRangeAttribute( integer $from, integer $to, string $attribute, boolean $state = true )

Set a range of tokens to a given attribute and state

@@ -438,7 +438,7 @@

Parameters

# - setTokenDictionaryAttribute( string[] $dictionary, string $attribute, boolean $state = true ) + setTokenDictionaryAttribute( string[] $dictionary, string $attribute, boolean $state = true )

Set all tokens matching given dictionary to attribute and state

@@ -476,7 +476,7 @@

Parameters

# - attributesAtToken( integer $i ) + attributesAtToken( integer $i )

Get all attribute of token at given index

@@ -514,7 +514,7 @@

Returns

# - lowercaseTokens( ) + lowercaseTokens( )

Convert all tokens to lower case

@@ -543,7 +543,7 @@

Returns

# - tokenizeOnWhitespace( string $string ) + tokenizeOnWhitespace( string $string )

Tokenize a string on whitespace

@@ -561,7 +561,7 @@

Parameters

Returns

- array
array of two arrays, with tokens at index 0 and their offsets at index 1 + array
array of two arrays, with tokens at index 0 and their byte offsets at index 1
@@ -581,7 +581,7 @@

Returns

# - tokenizeOnWords( string $string ) + tokenizeOnWords( string $string )

Tokenize a string on words

@@ -599,7 +599,7 @@

Parameters

Returns

- array
array of two arrays, with tokens at index 0 and their offsets at index 1 + array
array of two arrays, with tokens at index 0 and their byte offsets at index 1
@@ -619,7 +619,7 @@

Returns

# - tokenizeOnRegex( string $string, string $pattern ) + tokenizeOnRegex( string $string, string $pattern )

Tokenize a string with a given regex

@@ -639,7 +639,7 @@

Parameters

Returns

- array
array of two arrays, with tokens at index 0 and their offsets at index 1 + array
array of two arrays, with tokens at index 0 and their byte offsets at index 1
@@ -659,7 +659,7 @@

Returns

# - offsetExists( integer $i ) + offsetExists( integer $i )

Check if the token at the given index exists

@@ -699,7 +699,7 @@

Overrides

# - offsetGet( integer $i ) + offsetGet( integer $i )

Get token at given index

@@ -746,7 +746,7 @@

Overrides

count(), createAttribute(), deleteAttribute(), - enablebyteToCharCache(), + enableByteToCharCache(), filter(), hasAttribute(), is(), diff --git a/doc/source-class-apemsel.AttributedString.AttributedString.html b/doc/source-class-apemsel.AttributedString.AttributedString.html index 700fb82..09c5eb2 100644 --- a/doc/source-class-apemsel.AttributedString.AttributedString.html +++ b/doc/source-class-apemsel.AttributedString.AttributedString.html @@ -485,7 +485,7 @@

Classes

401 * 402 * May improve performance if setPattern is used extensively 403 */ -404 public function enablebyteToCharCache() { +404 public function enableByteToCharCache() { 405 $this->byteToChar = []; 406 $char = 0; 407 for ($i = 0; $i < strlen($this->string); ) { diff --git a/doc/source-class-apemsel.AttributedString.TokenizedAttributedString.html b/doc/source-class-apemsel.AttributedString.TokenizedAttributedString.html index 2bd57be..e170a29 100644 --- a/doc/source-class-apemsel.AttributedString.TokenizedAttributedString.html +++ b/doc/source-class-apemsel.AttributedString.TokenizedAttributedString.html @@ -103,198 +103,204 @@

Classes

19 * @param string $tokenizer Tokenizer to use, either "whitespace", "word" or a custom regex 20 */ 21 public function __construct($string, $tokenizer = "whitespace") { - 22 $tokenizerFunction = "tokenizeOn".ucfirst($tokenizer); - 23 - 24 if ($tokenizer[0] == "/") { - 25 list($this->tokens, $this->tokenOffsets) = self::tokenizeOnRegex($string, $tokenizer); - 26 } else { - 27 if (!method_exists("apemsel\AttributedString\TokenizedAttributedString", $tokenizerFunction)) { - 28 throw new \InvalidArgumentException("Unknown tokenizer $tokenizer"); - 29 } - 30 list($this->tokens, $this->tokenOffsets) = self::$tokenizerFunction($string); - 31 } - 32 - 33 parent::__construct($string); - 34 } - 35 - 36 /** - 37 * Return all tokens - 38 * - 39 * @return string[] tokens - 40 */ - 41 public function getTokens() { - 42 return $this->tokens; - 43 } - 44 - 45 /** - 46 * Return all tokens' offsets - 47 * - 48 * @return int[] offsets - 49 */ - 50 public function getTokenOffsets() { - 51 return $this->tokenOffsets; - 52 } - 53 - 54 /** - 55 * Return the number of tokens - 56 * - 57 * @return int count - 58 */ - 59 public function getTokenCount() { - 60 return count($this->tokens); - 61 } - 62 - 63 /** - 64 * Get indicated token - 65 * - 66 * @param int $i token index - 67 * @return string token - 68 */ - 69 public function getToken($i) { - 70 return $this->tokens[$i]; - 71 } - 72 - 73 /** - 74 * Get indicated token offset - 75 * - 76 * @param int $i token index - 77 * @return int offset - 78 */ - 79 public function getTokenOffset($i) { - 80 return $this->tokenOffsets[$i]; - 81 } - 82 - 83 /** - 84 * Set a token to a given attribute and state - 85 * - 86 * @param int $i token index - 87 * @param string $attribute attribute name - 88 * @param bool $state attribute state - 89 */ - 90 public function setTokenAttribute($i, $attribute, $state = true) { - 91 $token = $this->tokens[$i]; - 92 $offset = $this->tokenOffsets[$i]; - 93 $length = mb_strlen($token, "utf-8"); - 94 - 95 return $this->setLength($offset, $length, $attribute, $state); - 96 } - 97 - 98 /** - 99 * Set a range of tokens to a given attribute and state -100 * -101 * @param int $from token start index -102 * @param int $to token end index -103 * @param string $attribute attribute name -104 * @param bool $state attribute state -105 */ -106 public function setTokenRangeAttribute($from, $to, $attribute, $state = true) { -107 $fromOffset = $this->tokenOffsets[$from]; -108 $toOffset = $this->tokenOffsets[$to] + mb_strlen($this->tokens[$to], "utf-8") - 1; -109 -110 return $this->setRange($fromOffset, $toOffset, $attribute, $state); -111 } -112 -113 /** -114 * Set all tokens matching given dictionary to attribute and state -115 * -116 * @param string[] $dictionary dictionary -117 * @param string $attribute attribute name -118 * @param bool $state attribute state -119 */ -120 public function setTokenDictionaryAttribute($dictionary, $attribute, $state = true) { -121 foreach($this->tokens as $i => $token) { -122 if (in_array($token, $dictionary)) { -123 $this->setTokenAttribute($i, $attribute, $state); -124 } -125 } -126 } -127 -128 /** -129 * Get all attribute of token at given index -130 * -131 * @param int token index -132 * @return string[] attributes -133 */ -134 public function attributesAtToken($i) { -135 return $this->attributesAt($this->tokenOffsets[$i]); -136 } -137 -138 /** -139 * Convert all tokens to lower case -140 */ -141 public function lowercaseTokens() { -142 $this->tokens = array_map(function($token) { -143 return mb_strtolower($token, "utf-8"); -144 }, $this->tokens); -145 } -146 -147 /** -148 * Tokenize a string on whitespace -149 * -150 * @param string $string string to be tokenized -151 * @return array array of two arrays, with tokens at index 0 and their offsets at index 1 -152 */ -153 public static function tokenizeOnWhitespace($string) { -154 // Matches pontential whitespace in front of the token and the token itself. -155 // Matching the whitespace could be omitted, but that results in slower execution ;-) -156 return self::tokenizeOnRegex($string, '/[\s\n\r]*([^\s\n\r]+)/u'); -157 } -158 -159 /** -160 * Tokenize a string on words -161 * -162 * @param string $string string to be tokenized -163 * @return array array of two arrays, with tokens at index 0 and their offsets at index 1 -164 */ -165 public static function tokenizeOnWords($string) { -166 return self::tokenizeOnRegex($string, '/([\w]+)/u'); -167 } -168 -169 /** -170 * Tokenize a string with a given regex -171 * -172 * @param string $string string to be tokenized -173 * @param string $pattern regex. The token must be captured in the first subgroup. -174 * @return array array of two arrays, with tokens at index 0 and their offsets at index 1 -175 */ -176 public static function tokenizeOnRegex($string, $pattern) -177 { -178 // Fastest way to get both tokens and their offsets, but not easy to understand. -179 preg_match_all($pattern, $string, $matches, PREG_OFFSET_CAPTURE); -180 -181 // $matches[1] contains an array of all matched subexpressions (= tokens) -182 // with their offset in column 1 and the matched token in column 0 -183 $tokens = array_column($matches[1], 0); -184 $tokenOffsets = array_column($matches[1], 1); -185 -186 return [$tokens, $tokenOffsets]; -187 } -188 -189 // Modified ArrayAccess interface -190 -191 /** -192 * Check if the token at the given index exists -193 * -194 * @param int $i token index -195 * @return bool does the offset exist -196 */ -197 public function offsetExists($i) { -198 return $i < $this->getTokenCount(); -199 } -200 -201 /** -202 * Get token at given index -203 * -204 * Note: TokenizedAttributedString uses the ArrayAccess interface to access tokens, not chars! -205 * -206 * @param int $i token index -207 * @return string token -208 */ -209 public function offsetGet($i) { -210 return $this->tokens[$i]; -211 } -212 } -213 + 22 parent::__construct($string); + 23 + 24 $tokenizerFunction = "tokenizeOn".ucfirst($tokenizer); + 25 + 26 if ($tokenizer[0] == "/") { + 27 list($this->tokens, $this->tokenOffsets) = self::tokenizeOnRegex($string, $tokenizer); + 28 } else { + 29 if (!method_exists("apemsel\AttributedString\TokenizedAttributedString", $tokenizerFunction)) { + 30 throw new \InvalidArgumentException("Unknown tokenizer $tokenizer"); + 31 } + 32 list($this->tokens, $this->tokenOffsets) = self::$tokenizerFunction($string); + 33 } + 34 + 35 // convert byte to char offsets + 36 $this->enableByteToCharCache(); + 37 $this->tokenOffsets = array_map(function($o) { + 38 return $this->byteToCharOffset($o); + 39 }, $this->tokenOffsets); + 40 } + 41 + 42 /** + 43 * Return all tokens + 44 * + 45 * @return string[] tokens + 46 */ + 47 public function getTokens() { + 48 return $this->tokens; + 49 } + 50 + 51 /** + 52 * Return all tokens' offsets + 53 * + 54 * @return int[] offsets + 55 */ + 56 public function getTokenOffsets() { + 57 return $this->tokenOffsets; + 58 } + 59 + 60 /** + 61 * Return the number of tokens + 62 * + 63 * @return int count + 64 */ + 65 public function getTokenCount() { + 66 return count($this->tokens); + 67 } + 68 + 69 /** + 70 * Get indicated token + 71 * + 72 * @param int $i token index + 73 * @return string token + 74 */ + 75 public function getToken($i) { + 76 return $this->tokens[$i]; + 77 } + 78 + 79 /** + 80 * Get indicated token offset + 81 * + 82 * @param int $i token index + 83 * @return int offset + 84 */ + 85 public function getTokenOffset($i) { + 86 return $this->tokenOffsets[$i]; + 87 } + 88 + 89 /** + 90 * Set a token to a given attribute and state + 91 * + 92 * @param int $i token index + 93 * @param string $attribute attribute name + 94 * @param bool $state attribute state + 95 */ + 96 public function setTokenAttribute($i, $attribute, $state = true) { + 97 $token = $this->tokens[$i]; + 98 $offset = $this->tokenOffsets[$i]; + 99 $length = mb_strlen($token, "utf-8"); +100 +101 return $this->setLength($offset, $length, $attribute, $state); +102 } +103 +104 /** +105 * Set a range of tokens to a given attribute and state +106 * +107 * @param int $from token start index +108 * @param int $to token end index +109 * @param string $attribute attribute name +110 * @param bool $state attribute state +111 */ +112 public function setTokenRangeAttribute($from, $to, $attribute, $state = true) { +113 $fromOffset = $this->tokenOffsets[$from]; +114 $toOffset = $this->tokenOffsets[$to] + mb_strlen($this->tokens[$to], "utf-8") - 1; +115 +116 return $this->setRange($fromOffset, $toOffset, $attribute, $state); +117 } +118 +119 /** +120 * Set all tokens matching given dictionary to attribute and state +121 * +122 * @param string[] $dictionary dictionary +123 * @param string $attribute attribute name +124 * @param bool $state attribute state +125 */ +126 public function setTokenDictionaryAttribute($dictionary, $attribute, $state = true) { +127 foreach($this->tokens as $i => $token) { +128 if (in_array($token, $dictionary)) { +129 $this->setTokenAttribute($i, $attribute, $state); +130 } +131 } +132 } +133 +134 /** +135 * Get all attribute of token at given index +136 * +137 * @param int token index +138 * @return string[] attributes +139 */ +140 public function attributesAtToken($i) { +141 return $this->attributesAt($this->tokenOffsets[$i]); +142 } +143 +144 /** +145 * Convert all tokens to lower case +146 */ +147 public function lowercaseTokens() { +148 $this->tokens = array_map(function($token) { +149 return mb_strtolower($token, "utf-8"); +150 }, $this->tokens); +151 } +152 +153 /** +154 * Tokenize a string on whitespace +155 * +156 * @param string $string string to be tokenized +157 * @return array array of two arrays, with tokens at index 0 and their byte offsets at index 1 +158 */ +159 public static function tokenizeOnWhitespace($string) { +160 // Matches pontential whitespace in front of the token and the token itself. +161 // Matching the whitespace could be omitted, but that results in slower execution ;-) +162 return self::tokenizeOnRegex($string, '/[\s\n\r]*([^\s\n\r]+)/u'); +163 } +164 +165 /** +166 * Tokenize a string on words +167 * +168 * @param string $string string to be tokenized +169 * @return array array of two arrays, with tokens at index 0 and their byte offsets at index 1 +170 */ +171 public static function tokenizeOnWords($string) { +172 return self::tokenizeOnRegex($string, '/([\p{L}\p{S}\p{N}]+)/u'); +173 } +174 +175 /** +176 * Tokenize a string with a given regex +177 * +178 * @param string $string string to be tokenized +179 * @param string $pattern regex. The token must be captured in the first subgroup. +180 * @return array array of two arrays, with tokens at index 0 and their byte offsets at index 1 +181 */ +182 public static function tokenizeOnRegex($string, $pattern) +183 { +184 // Fastest way to get both tokens and their offsets, but not easy to understand. +185 preg_match_all($pattern, $string, $matches, PREG_OFFSET_CAPTURE); +186 +187 // $matches[1] contains an array of all matched subexpressions (= tokens) +188 // with their offset in column 1 and the matched token in column 0 +189 $tokens = array_column($matches[1], 0); +190 $tokenOffsets = array_column($matches[1], 1); +191 +192 return [$tokens, $tokenOffsets]; +193 } +194 +195 // Modified ArrayAccess interface +196 +197 /** +198 * Check if the token at the given index exists +199 * +200 * @param int $i token index +201 * @return bool does the offset exist +202 */ +203 public function offsetExists($i) { +204 return $i < $this->getTokenCount(); +205 } +206 +207 /** +208 * Get token at given index +209 * +210 * Note: TokenizedAttributedString uses the ArrayAccess interface to access tokens, not chars! +211 * +212 * @param int $i token index +213 * @return string token +214 */ +215 public function offsetGet($i) { +216 return $this->tokens[$i]; +217 } +218 } +219