diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..57872d0 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/vendor/ diff --git a/README.md b/README.md index 86897cd..2206a1b 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,31 @@ than simply calling `strpos` many times, and it's much faster than calling I originally wrote this to use with [F5Bot](https://f5bot.com), since it's searching for the same set of a few thousand keywords over and over again. +# Install via Composer + +Add the following to your project's `composer.json`: + +```json + "repositories": [ + { + "type": "vcs", + "url": "https://github.com/codeplea/ahocorasickphp" + } + ], + "require": { + "codeplea/ahocorasickphp": "dev-master" + } +``` + +Then, install the package itself: + +```bash +$ composer update +``` + # Usage -It's designed to be really easy to use. You create the `ahocorasick` object, +It's designed to be really easy to use. You create the `Search` object, add your keywords, call `finalize()` to finish setup, and then search your text. It'll return an array of the keywords found and their position in the search text. @@ -24,22 +46,22 @@ search text. Create, add keywords, and `finalize()`: ```php -require('ahocorasick.php'); +use codeplea\AhoCorasick\Search; -$ac = new ahocorasick(); +$ac = new Search(); -$ac->add_needle('art'); -$ac->add_needle('cart'); -$ac->add_needle('ted'); +$ac->addNeedle('art'); +$ac->addNeedle('cart'); +$ac->addNeedle('ted'); $ac->finalize(); ``` -Call `search()` to preform the actual search. It'll return an array of matches. +Call `execute()` to preform the actual search. It'll return an array of matches. ```php -$found = $ac->search('a carted mart lot one blue ted'); +$found = $ac->execute('a carted mart lot one blue ted'); print_r($found); ``` @@ -97,10 +119,17 @@ time: 0.054709911346436 ``` -Note: the regex solutions are actually slightly broken. They won't work if you +**Note:** the regex solutions are actually slightly broken. They won't work if you have a keyword that is a prefix or suffix of another. But hey, who really uses regex when it's not slightly broken? -Also keep in mind that building the search tree (the `add_needle()` and +Also keep in mind that building the search tree (the `addNeedle()` and `finalize()` calls) takes time. So you'll get the best speed-up if you're -reusing the same keywords and calling `search()` many times. +reusing the same keywords and calling `execute()` many times. + +# Running tests + +```$php +$ composer install +$ ./vendor/bin/phpunit +``` \ No newline at end of file diff --git a/ahocorasick.php b/ahocorasick.php deleted file mode 100644 index cd3747c..0000000 --- a/ahocorasick.php +++ /dev/null @@ -1,117 +0,0 @@ -final) throw new Exception("Cannot add word to finalized ahocorasick."); - - $nodes = &$this->nodes; - $n = 0; - - for ($i = 0; $i < strlen($needle); ++$i) { - $c = $needle[$i]; - - if (!isset($nodes[$n][$c])) { - $nodes[$n][$c] = count($nodes); - $nodes[] = array(); - } - $n = $nodes[$n][$c]; - } - - $nodes[$n][0][] = $needle; - } - - - public function finalize() { - $nodes = &$this->nodes; - $queue = array(); - - foreach($nodes[0] as $j => $_) { - $nodes[$nodes[0][$j]][1] = 0; - $queue[] = $nodes[0][$j]; - } - - while (count($queue)) { - $r = $queue[0]; - $queue = array_slice($queue, 1); - - foreach($nodes[$r] as $j => $_) { - if ($j === 0 || $j === 1) continue; - $v = $nodes[$r][1]; - $u = $nodes[$r][$j]; - while ($v > 0 && !isset($nodes[$v][$j])) $v = $nodes[$v][1]; - $nodes[$u][1] = isset($nodes[$v][$j]) ? $nodes[$v][$j] : $v; - if (isset($nodes[$nodes[$u][1]][0])) { - if (!isset($nodes[$u][0])) $nodes[$u][0] = array(); - $nodes[$u][0] = array_merge($nodes[$u][0], $nodes[$nodes[$u][1]][0]); - } - $queue[] = $u; - } - } - - $this->final = 1; - } - - - public function search($haystack) { - if (!$this->final) throw new Exception("Must call finalize() before search."); - - $nodes = &$this->nodes; - $found = array(); - $n = 0; - - for ($i = 0; $i < strlen($haystack); ++$i) { - $c = $haystack[$i]; - - while(!in_array($c, array_keys($nodes[$n]), true) && $n) { - $n = $nodes[$n][1]; - if ($n === null) die(); - } - - if (isset($nodes[$n][$c])) - $n = $nodes[$n][$c]; - - if (isset($nodes[$n][0])) { - $z = $nodes[$n][0]; - foreach($z as $w) { - $found[] = array($w, $i - strlen($w) + 1); - } - } - } - - return $found; - } - -}; diff --git a/benchmark.php b/benchmark.php index ef08f6f..63dc94e 100644 --- a/benchmark.php +++ b/benchmark.php @@ -1,120 +1,11 @@ add_needle($n); -$ac->finalize(); - -$st = microtime(1); -for ($loop = 0; $loop < $loops; ++$loop) { - $found = array(); - $found = $ac->search($haystack); -} -$et = microtime(1); -print("time: " . ($et - $st) . "\n"); - - - - - - -//Check that the answers match. -//First sort the arrays. -$comp = function($a, $b) {return ($a[1] === $b[1]) ? ($a[0] > $b[0]) : ($a[1] > $b[1]);}; -usort($found, $comp); -usort($found_strpos, $comp); - -if ($found_strpos !== $found) { - print("ERROR - Aho Corasick got the wrong result.\n"); - - print("strpos size: " . count($found_strpos) . "\n"); - print("aho corasick size: " . count($found) . "\n"); - - for ($i = 0; $i < count($found); ++$i) { - if ($found_strpos[$i] !== $found[$i]) { - print("Mismatch $i\n"); - print_r($found_strpos[$i]); - print_r($found[$i]); - } - } -} - - +require 'vendor/autoload.php'; +/* keywords and text */ +require 'benchmark_setup.php'; +// Benchmark searching for 1,000 keywords in a 5,000 word text all at once. +$benchmark = new Benchmark(); +$benchmark->run($needles, $haystack); diff --git a/benchmark_setup.php b/benchmark_setup.php index 83aa0ba..2cddf5a 100644 --- a/benchmark_setup.php +++ b/benchmark_setup.php @@ -1,6 +1,6 @@ =7.0" + }, + "require-dev": { + "phpunit/phpunit": "^7.3" + }, + "autoload": { + "psr-4": { + "codeplea\\AhoCorasick\\": "src" + } + } +} diff --git a/composer.lock b/composer.lock new file mode 100644 index 0000000..b2e0985 --- /dev/null +++ b/composer.lock @@ -0,0 +1,1428 @@ +{ + "_readme": [ + "This file locks the dependencies of your project to a known state", + "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", + "This file is @generated automatically" + ], + "content-hash": "0288e7d5499423c5e2dc038a0eeabda1", + "packages": [], + "packages-dev": [ + { + "name": "doctrine/instantiator", + "version": "1.1.0", + "source": { + "type": "git", + "url": "https://github.com/doctrine/instantiator.git", + "reference": "185b8868aa9bf7159f5f953ed5afb2d7fcdc3bda" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/doctrine/instantiator/zipball/185b8868aa9bf7159f5f953ed5afb2d7fcdc3bda", + "reference": "185b8868aa9bf7159f5f953ed5afb2d7fcdc3bda", + "shasum": "" + }, + "require": { + "php": "^7.1" + }, + "require-dev": { + "athletic/athletic": "~0.1.8", + "ext-pdo": "*", + "ext-phar": "*", + "phpunit/phpunit": "^6.2.3", + "squizlabs/php_codesniffer": "^3.0.2" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.2.x-dev" + } + }, + "autoload": { + "psr-4": { + "Doctrine\\Instantiator\\": "src/Doctrine/Instantiator/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Marco Pivetta", + "email": "ocramius@gmail.com", + "homepage": "http://ocramius.github.com/" + } + ], + "description": "A small, lightweight utility to instantiate objects in PHP without invoking their constructors", + "homepage": "https://github.com/doctrine/instantiator", + "keywords": [ + "constructor", + "instantiate" + ], + "time": "2017-07-22T11:58:36+00:00" + }, + { + "name": "myclabs/deep-copy", + "version": "1.8.1", + "source": { + "type": "git", + "url": "https://github.com/myclabs/DeepCopy.git", + "reference": "3e01bdad3e18354c3dce54466b7fbe33a9f9f7f8" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/myclabs/DeepCopy/zipball/3e01bdad3e18354c3dce54466b7fbe33a9f9f7f8", + "reference": "3e01bdad3e18354c3dce54466b7fbe33a9f9f7f8", + "shasum": "" + }, + "require": { + "php": "^7.1" + }, + "replace": { + "myclabs/deep-copy": "self.version" + }, + "require-dev": { + "doctrine/collections": "^1.0", + "doctrine/common": "^2.6", + "phpunit/phpunit": "^7.1" + }, + "type": "library", + "autoload": { + "psr-4": { + "DeepCopy\\": "src/DeepCopy/" + }, + "files": [ + "src/DeepCopy/deep_copy.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "description": "Create deep copies (clones) of your objects", + "keywords": [ + "clone", + "copy", + "duplicate", + "object", + "object graph" + ], + "time": "2018-06-11T23:09:50+00:00" + }, + { + "name": "phar-io/manifest", + "version": "1.0.3", + "source": { + "type": "git", + "url": "https://github.com/phar-io/manifest.git", + "reference": "7761fcacf03b4d4f16e7ccb606d4879ca431fcf4" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/phar-io/manifest/zipball/7761fcacf03b4d4f16e7ccb606d4879ca431fcf4", + "reference": "7761fcacf03b4d4f16e7ccb606d4879ca431fcf4", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-phar": "*", + "phar-io/version": "^2.0", + "php": "^5.6 || ^7.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Arne Blankerts", + "email": "arne@blankerts.de", + "role": "Developer" + }, + { + "name": "Sebastian Heuer", + "email": "sebastian@phpeople.de", + "role": "Developer" + }, + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de", + "role": "Developer" + } + ], + "description": "Component for reading phar.io manifest information from a PHP Archive (PHAR)", + "time": "2018-07-08T19:23:20+00:00" + }, + { + "name": "phar-io/version", + "version": "2.0.1", + "source": { + "type": "git", + "url": "https://github.com/phar-io/version.git", + "reference": "45a2ec53a73c70ce41d55cedef9063630abaf1b6" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/phar-io/version/zipball/45a2ec53a73c70ce41d55cedef9063630abaf1b6", + "reference": "45a2ec53a73c70ce41d55cedef9063630abaf1b6", + "shasum": "" + }, + "require": { + "php": "^5.6 || ^7.0" + }, + "type": "library", + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Arne Blankerts", + "email": "arne@blankerts.de", + "role": "Developer" + }, + { + "name": "Sebastian Heuer", + "email": "sebastian@phpeople.de", + "role": "Developer" + }, + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de", + "role": "Developer" + } + ], + "description": "Library for handling version information and constraints", + "time": "2018-07-08T19:19:57+00:00" + }, + { + "name": "phpdocumentor/reflection-common", + "version": "1.0.1", + "source": { + "type": "git", + "url": "https://github.com/phpDocumentor/ReflectionCommon.git", + "reference": "21bdeb5f65d7ebf9f43b1b25d404f87deab5bfb6" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/phpDocumentor/ReflectionCommon/zipball/21bdeb5f65d7ebf9f43b1b25d404f87deab5bfb6", + "reference": "21bdeb5f65d7ebf9f43b1b25d404f87deab5bfb6", + "shasum": "" + }, + "require": { + "php": ">=5.5" + }, + "require-dev": { + "phpunit/phpunit": "^4.6" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "phpDocumentor\\Reflection\\": [ + "src" + ] + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Jaap van Otterdijk", + "email": "opensource@ijaap.nl" + } + ], + "description": "Common reflection classes used by phpdocumentor to reflect the code structure", + "homepage": "http://www.phpdoc.org", + "keywords": [ + "FQSEN", + "phpDocumentor", + "phpdoc", + "reflection", + "static analysis" + ], + "time": "2017-09-11T18:02:19+00:00" + }, + { + "name": "phpdocumentor/reflection-docblock", + "version": "4.3.0", + "source": { + "type": "git", + "url": "https://github.com/phpDocumentor/ReflectionDocBlock.git", + "reference": "94fd0001232e47129dd3504189fa1c7225010d08" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/94fd0001232e47129dd3504189fa1c7225010d08", + "reference": "94fd0001232e47129dd3504189fa1c7225010d08", + "shasum": "" + }, + "require": { + "php": "^7.0", + "phpdocumentor/reflection-common": "^1.0.0", + "phpdocumentor/type-resolver": "^0.4.0", + "webmozart/assert": "^1.0" + }, + "require-dev": { + "doctrine/instantiator": "~1.0.5", + "mockery/mockery": "^1.0", + "phpunit/phpunit": "^6.4" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "4.x-dev" + } + }, + "autoload": { + "psr-4": { + "phpDocumentor\\Reflection\\": [ + "src/" + ] + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Mike van Riel", + "email": "me@mikevanriel.com" + } + ], + "description": "With this component, a library can provide support for annotations via DocBlocks or otherwise retrieve information that is embedded in a DocBlock.", + "time": "2017-11-30T07:14:17+00:00" + }, + { + "name": "phpdocumentor/type-resolver", + "version": "0.4.0", + "source": { + "type": "git", + "url": "https://github.com/phpDocumentor/TypeResolver.git", + "reference": "9c977708995954784726e25d0cd1dddf4e65b0f7" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/phpDocumentor/TypeResolver/zipball/9c977708995954784726e25d0cd1dddf4e65b0f7", + "reference": "9c977708995954784726e25d0cd1dddf4e65b0f7", + "shasum": "" + }, + "require": { + "php": "^5.5 || ^7.0", + "phpdocumentor/reflection-common": "^1.0" + }, + "require-dev": { + "mockery/mockery": "^0.9.4", + "phpunit/phpunit": "^5.2||^4.8.24" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "phpDocumentor\\Reflection\\": [ + "src/" + ] + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Mike van Riel", + "email": "me@mikevanriel.com" + } + ], + "time": "2017-07-14T14:27:02+00:00" + }, + { + "name": "phpspec/prophecy", + "version": "1.8.0", + "source": { + "type": "git", + "url": "https://github.com/phpspec/prophecy.git", + "reference": "4ba436b55987b4bf311cb7c6ba82aa528aac0a06" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/phpspec/prophecy/zipball/4ba436b55987b4bf311cb7c6ba82aa528aac0a06", + "reference": "4ba436b55987b4bf311cb7c6ba82aa528aac0a06", + "shasum": "" + }, + "require": { + "doctrine/instantiator": "^1.0.2", + "php": "^5.3|^7.0", + "phpdocumentor/reflection-docblock": "^2.0|^3.0.2|^4.0", + "sebastian/comparator": "^1.1|^2.0|^3.0", + "sebastian/recursion-context": "^1.0|^2.0|^3.0" + }, + "require-dev": { + "phpspec/phpspec": "^2.5|^3.2", + "phpunit/phpunit": "^4.8.35 || ^5.7 || ^6.5 || ^7.1" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.8.x-dev" + } + }, + "autoload": { + "psr-0": { + "Prophecy\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Konstantin Kudryashov", + "email": "ever.zet@gmail.com", + "homepage": "http://everzet.com" + }, + { + "name": "Marcello Duarte", + "email": "marcello.duarte@gmail.com" + } + ], + "description": "Highly opinionated mocking framework for PHP 5.3+", + "homepage": "https://github.com/phpspec/prophecy", + "keywords": [ + "Double", + "Dummy", + "fake", + "mock", + "spy", + "stub" + ], + "time": "2018-08-05T17:53:17+00:00" + }, + { + "name": "phpunit/php-code-coverage", + "version": "6.0.7", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/php-code-coverage.git", + "reference": "865662550c384bc1db7e51d29aeda1c2c161d69a" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/865662550c384bc1db7e51d29aeda1c2c161d69a", + "reference": "865662550c384bc1db7e51d29aeda1c2c161d69a", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-xmlwriter": "*", + "php": "^7.1", + "phpunit/php-file-iterator": "^2.0", + "phpunit/php-text-template": "^1.2.1", + "phpunit/php-token-stream": "^3.0", + "sebastian/code-unit-reverse-lookup": "^1.0.1", + "sebastian/environment": "^3.1", + "sebastian/version": "^2.0.1", + "theseer/tokenizer": "^1.1" + }, + "require-dev": { + "phpunit/phpunit": "^7.0" + }, + "suggest": { + "ext-xdebug": "^2.6.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "6.0-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de", + "role": "lead" + } + ], + "description": "Library that provides collection, processing, and rendering functionality for PHP code coverage information.", + "homepage": "https://github.com/sebastianbergmann/php-code-coverage", + "keywords": [ + "coverage", + "testing", + "xunit" + ], + "time": "2018-06-01T07:51:50+00:00" + }, + { + "name": "phpunit/php-file-iterator", + "version": "2.0.2", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/php-file-iterator.git", + "reference": "050bedf145a257b1ff02746c31894800e5122946" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/php-file-iterator/zipball/050bedf145a257b1ff02746c31894800e5122946", + "reference": "050bedf145a257b1ff02746c31894800e5122946", + "shasum": "" + }, + "require": { + "php": "^7.1" + }, + "require-dev": { + "phpunit/phpunit": "^7.1" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de", + "role": "lead" + } + ], + "description": "FilterIterator implementation that filters files based on a list of suffixes.", + "homepage": "https://github.com/sebastianbergmann/php-file-iterator/", + "keywords": [ + "filesystem", + "iterator" + ], + "time": "2018-09-13T20:33:42+00:00" + }, + { + "name": "phpunit/php-text-template", + "version": "1.2.1", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/php-text-template.git", + "reference": "31f8b717e51d9a2afca6c9f046f5d69fc27c8686" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/php-text-template/zipball/31f8b717e51d9a2afca6c9f046f5d69fc27c8686", + "reference": "31f8b717e51d9a2afca6c9f046f5d69fc27c8686", + "shasum": "" + }, + "require": { + "php": ">=5.3.3" + }, + "type": "library", + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de", + "role": "lead" + } + ], + "description": "Simple template engine.", + "homepage": "https://github.com/sebastianbergmann/php-text-template/", + "keywords": [ + "template" + ], + "time": "2015-06-21T13:50:34+00:00" + }, + { + "name": "phpunit/php-timer", + "version": "2.0.0", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/php-timer.git", + "reference": "8b8454ea6958c3dee38453d3bd571e023108c91f" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/php-timer/zipball/8b8454ea6958c3dee38453d3bd571e023108c91f", + "reference": "8b8454ea6958c3dee38453d3bd571e023108c91f", + "shasum": "" + }, + "require": { + "php": "^7.1" + }, + "require-dev": { + "phpunit/phpunit": "^7.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.0-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de", + "role": "lead" + } + ], + "description": "Utility class for timing", + "homepage": "https://github.com/sebastianbergmann/php-timer/", + "keywords": [ + "timer" + ], + "time": "2018-02-01T13:07:23+00:00" + }, + { + "name": "phpunit/php-token-stream", + "version": "3.0.0", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/php-token-stream.git", + "reference": "21ad88bbba7c3d93530d93994e0a33cd45f02ace" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/php-token-stream/zipball/21ad88bbba7c3d93530d93994e0a33cd45f02ace", + "reference": "21ad88bbba7c3d93530d93994e0a33cd45f02ace", + "shasum": "" + }, + "require": { + "ext-tokenizer": "*", + "php": "^7.1" + }, + "require-dev": { + "phpunit/phpunit": "^7.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "3.0-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Wrapper around PHP's tokenizer extension.", + "homepage": "https://github.com/sebastianbergmann/php-token-stream/", + "keywords": [ + "tokenizer" + ], + "time": "2018-02-01T13:16:43+00:00" + }, + { + "name": "phpunit/phpunit", + "version": "7.3.5", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/phpunit.git", + "reference": "7b331efabbb628c518c408fdfcaf571156775de2" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/7b331efabbb628c518c408fdfcaf571156775de2", + "reference": "7b331efabbb628c518c408fdfcaf571156775de2", + "shasum": "" + }, + "require": { + "doctrine/instantiator": "^1.1", + "ext-dom": "*", + "ext-json": "*", + "ext-libxml": "*", + "ext-mbstring": "*", + "ext-xml": "*", + "myclabs/deep-copy": "^1.7", + "phar-io/manifest": "^1.0.2", + "phar-io/version": "^2.0", + "php": "^7.1", + "phpspec/prophecy": "^1.7", + "phpunit/php-code-coverage": "^6.0.7", + "phpunit/php-file-iterator": "^2.0.1", + "phpunit/php-text-template": "^1.2.1", + "phpunit/php-timer": "^2.0", + "sebastian/comparator": "^3.0", + "sebastian/diff": "^3.0", + "sebastian/environment": "^3.1", + "sebastian/exporter": "^3.1", + "sebastian/global-state": "^2.0", + "sebastian/object-enumerator": "^3.0.3", + "sebastian/resource-operations": "^1.0", + "sebastian/version": "^2.0.1" + }, + "conflict": { + "phpunit/phpunit-mock-objects": "*" + }, + "require-dev": { + "ext-pdo": "*" + }, + "suggest": { + "ext-soap": "*", + "ext-xdebug": "*", + "phpunit/php-invoker": "^2.0" + }, + "bin": [ + "phpunit" + ], + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "7.3-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de", + "role": "lead" + } + ], + "description": "The PHP Unit Testing framework.", + "homepage": "https://phpunit.de/", + "keywords": [ + "phpunit", + "testing", + "xunit" + ], + "time": "2018-09-08T15:14:29+00:00" + }, + { + "name": "sebastian/code-unit-reverse-lookup", + "version": "1.0.1", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/code-unit-reverse-lookup.git", + "reference": "4419fcdb5eabb9caa61a27c7a1db532a6b55dd18" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/code-unit-reverse-lookup/zipball/4419fcdb5eabb9caa61a27c7a1db532a6b55dd18", + "reference": "4419fcdb5eabb9caa61a27c7a1db532a6b55dd18", + "shasum": "" + }, + "require": { + "php": "^5.6 || ^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^5.7 || ^6.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Looks up which function or method a line of code belongs to", + "homepage": "https://github.com/sebastianbergmann/code-unit-reverse-lookup/", + "time": "2017-03-04T06:30:41+00:00" + }, + { + "name": "sebastian/comparator", + "version": "3.0.2", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/comparator.git", + "reference": "5de4fc177adf9bce8df98d8d141a7559d7ccf6da" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/comparator/zipball/5de4fc177adf9bce8df98d8d141a7559d7ccf6da", + "reference": "5de4fc177adf9bce8df98d8d141a7559d7ccf6da", + "shasum": "" + }, + "require": { + "php": "^7.1", + "sebastian/diff": "^3.0", + "sebastian/exporter": "^3.1" + }, + "require-dev": { + "phpunit/phpunit": "^7.1" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "3.0-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Jeff Welch", + "email": "whatthejeff@gmail.com" + }, + { + "name": "Volker Dusch", + "email": "github@wallbash.com" + }, + { + "name": "Bernhard Schussek", + "email": "bschussek@2bepublished.at" + }, + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Provides the functionality to compare PHP values for equality", + "homepage": "https://github.com/sebastianbergmann/comparator", + "keywords": [ + "comparator", + "compare", + "equality" + ], + "time": "2018-07-12T15:12:46+00:00" + }, + { + "name": "sebastian/diff", + "version": "3.0.1", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/diff.git", + "reference": "366541b989927187c4ca70490a35615d3fef2dce" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/diff/zipball/366541b989927187c4ca70490a35615d3fef2dce", + "reference": "366541b989927187c4ca70490a35615d3fef2dce", + "shasum": "" + }, + "require": { + "php": "^7.1" + }, + "require-dev": { + "phpunit/phpunit": "^7.0", + "symfony/process": "^2 || ^3.3 || ^4" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "3.0-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Kore Nordmann", + "email": "mail@kore-nordmann.de" + }, + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Diff implementation", + "homepage": "https://github.com/sebastianbergmann/diff", + "keywords": [ + "diff", + "udiff", + "unidiff", + "unified diff" + ], + "time": "2018-06-10T07:54:39+00:00" + }, + { + "name": "sebastian/environment", + "version": "3.1.0", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/environment.git", + "reference": "cd0871b3975fb7fc44d11314fd1ee20925fce4f5" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/environment/zipball/cd0871b3975fb7fc44d11314fd1ee20925fce4f5", + "reference": "cd0871b3975fb7fc44d11314fd1ee20925fce4f5", + "shasum": "" + }, + "require": { + "php": "^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^6.1" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "3.1.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Provides functionality to handle HHVM/PHP environments", + "homepage": "http://www.github.com/sebastianbergmann/environment", + "keywords": [ + "Xdebug", + "environment", + "hhvm" + ], + "time": "2017-07-01T08:51:00+00:00" + }, + { + "name": "sebastian/exporter", + "version": "3.1.0", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/exporter.git", + "reference": "234199f4528de6d12aaa58b612e98f7d36adb937" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/exporter/zipball/234199f4528de6d12aaa58b612e98f7d36adb937", + "reference": "234199f4528de6d12aaa58b612e98f7d36adb937", + "shasum": "" + }, + "require": { + "php": "^7.0", + "sebastian/recursion-context": "^3.0" + }, + "require-dev": { + "ext-mbstring": "*", + "phpunit/phpunit": "^6.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "3.1.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Jeff Welch", + "email": "whatthejeff@gmail.com" + }, + { + "name": "Volker Dusch", + "email": "github@wallbash.com" + }, + { + "name": "Bernhard Schussek", + "email": "bschussek@2bepublished.at" + }, + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + }, + { + "name": "Adam Harvey", + "email": "aharvey@php.net" + } + ], + "description": "Provides the functionality to export PHP variables for visualization", + "homepage": "http://www.github.com/sebastianbergmann/exporter", + "keywords": [ + "export", + "exporter" + ], + "time": "2017-04-03T13:19:02+00:00" + }, + { + "name": "sebastian/global-state", + "version": "2.0.0", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/global-state.git", + "reference": "e8ba02eed7bbbb9e59e43dedd3dddeff4a56b0c4" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/global-state/zipball/e8ba02eed7bbbb9e59e43dedd3dddeff4a56b0c4", + "reference": "e8ba02eed7bbbb9e59e43dedd3dddeff4a56b0c4", + "shasum": "" + }, + "require": { + "php": "^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^6.0" + }, + "suggest": { + "ext-uopz": "*" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.0-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Snapshotting of global state", + "homepage": "http://www.github.com/sebastianbergmann/global-state", + "keywords": [ + "global state" + ], + "time": "2017-04-27T15:39:26+00:00" + }, + { + "name": "sebastian/object-enumerator", + "version": "3.0.3", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/object-enumerator.git", + "reference": "7cfd9e65d11ffb5af41198476395774d4c8a84c5" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/object-enumerator/zipball/7cfd9e65d11ffb5af41198476395774d4c8a84c5", + "reference": "7cfd9e65d11ffb5af41198476395774d4c8a84c5", + "shasum": "" + }, + "require": { + "php": "^7.0", + "sebastian/object-reflector": "^1.1.1", + "sebastian/recursion-context": "^3.0" + }, + "require-dev": { + "phpunit/phpunit": "^6.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "3.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Traverses array structures and object graphs to enumerate all referenced objects", + "homepage": "https://github.com/sebastianbergmann/object-enumerator/", + "time": "2017-08-03T12:35:26+00:00" + }, + { + "name": "sebastian/object-reflector", + "version": "1.1.1", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/object-reflector.git", + "reference": "773f97c67f28de00d397be301821b06708fca0be" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/object-reflector/zipball/773f97c67f28de00d397be301821b06708fca0be", + "reference": "773f97c67f28de00d397be301821b06708fca0be", + "shasum": "" + }, + "require": { + "php": "^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^6.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.1-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Allows reflection of object attributes, including inherited and non-public ones", + "homepage": "https://github.com/sebastianbergmann/object-reflector/", + "time": "2017-03-29T09:07:27+00:00" + }, + { + "name": "sebastian/recursion-context", + "version": "3.0.0", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/recursion-context.git", + "reference": "5b0cd723502bac3b006cbf3dbf7a1e3fcefe4fa8" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/recursion-context/zipball/5b0cd723502bac3b006cbf3dbf7a1e3fcefe4fa8", + "reference": "5b0cd723502bac3b006cbf3dbf7a1e3fcefe4fa8", + "shasum": "" + }, + "require": { + "php": "^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^6.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "3.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Jeff Welch", + "email": "whatthejeff@gmail.com" + }, + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + }, + { + "name": "Adam Harvey", + "email": "aharvey@php.net" + } + ], + "description": "Provides functionality to recursively process PHP variables", + "homepage": "http://www.github.com/sebastianbergmann/recursion-context", + "time": "2017-03-03T06:23:57+00:00" + }, + { + "name": "sebastian/resource-operations", + "version": "1.0.0", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/resource-operations.git", + "reference": "ce990bb21759f94aeafd30209e8cfcdfa8bc3f52" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/resource-operations/zipball/ce990bb21759f94aeafd30209e8cfcdfa8bc3f52", + "reference": "ce990bb21759f94aeafd30209e8cfcdfa8bc3f52", + "shasum": "" + }, + "require": { + "php": ">=5.6.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Provides a list of PHP built-in functions that operate on resources", + "homepage": "https://www.github.com/sebastianbergmann/resource-operations", + "time": "2015-07-28T20:34:47+00:00" + }, + { + "name": "sebastian/version", + "version": "2.0.1", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/version.git", + "reference": "99732be0ddb3361e16ad77b68ba41efc8e979019" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/version/zipball/99732be0ddb3361e16ad77b68ba41efc8e979019", + "reference": "99732be0ddb3361e16ad77b68ba41efc8e979019", + "shasum": "" + }, + "require": { + "php": ">=5.6" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de", + "role": "lead" + } + ], + "description": "Library that helps with managing the version number of Git-hosted PHP projects", + "homepage": "https://github.com/sebastianbergmann/version", + "time": "2016-10-03T07:35:21+00:00" + }, + { + "name": "theseer/tokenizer", + "version": "1.1.0", + "source": { + "type": "git", + "url": "https://github.com/theseer/tokenizer.git", + "reference": "cb2f008f3f05af2893a87208fe6a6c4985483f8b" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/theseer/tokenizer/zipball/cb2f008f3f05af2893a87208fe6a6c4985483f8b", + "reference": "cb2f008f3f05af2893a87208fe6a6c4985483f8b", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-tokenizer": "*", + "ext-xmlwriter": "*", + "php": "^7.0" + }, + "type": "library", + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Arne Blankerts", + "email": "arne@blankerts.de", + "role": "Developer" + } + ], + "description": "A small library for converting tokenized PHP source code into XML and potentially other formats", + "time": "2017-04-07T12:08:54+00:00" + }, + { + "name": "webmozart/assert", + "version": "1.3.0", + "source": { + "type": "git", + "url": "https://github.com/webmozart/assert.git", + "reference": "0df1908962e7a3071564e857d86874dad1ef204a" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/webmozart/assert/zipball/0df1908962e7a3071564e857d86874dad1ef204a", + "reference": "0df1908962e7a3071564e857d86874dad1ef204a", + "shasum": "" + }, + "require": { + "php": "^5.3.3 || ^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^4.6", + "sebastian/version": "^1.0.1" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.3-dev" + } + }, + "autoload": { + "psr-4": { + "Webmozart\\Assert\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Bernhard Schussek", + "email": "bschussek@gmail.com" + } + ], + "description": "Assertions to validate method input/output with nice error messages.", + "keywords": [ + "assert", + "check", + "validate" + ], + "time": "2018-01-29T19:49:41+00:00" + } + ], + "aliases": [], + "minimum-stability": "stable", + "stability-flags": [], + "prefer-stable": false, + "prefer-lowest": false, + "platform": { + "php": ">=7.0" + }, + "platform-dev": [] +} diff --git a/example.php b/example.php index 744486e..37d0759 100644 --- a/example.php +++ b/example.php @@ -1,27 +1,28 @@ add_needle('art'); -$ac->add_needle('cart'); -$ac->add_needle('ted'); +$ac->addNeedle('art'); +$ac->addNeedle('cart'); +$ac->addNeedle('ted'); /* Now call finalize. This lets the engine @@ -34,11 +35,9 @@ /* And finally do the search. It should be fast. */ /* 012345678901234567890123456789 */ -$found = $ac->search('a carted mart lot one blue ted'); +$found = $ac->execute('a carted mart lot one blue ted'); /* It'll return an array with each keyword found and its * position in the search text. */ print_r($found); - - diff --git a/phpunit.xml.dist b/phpunit.xml.dist new file mode 100644 index 0000000..5066e14 --- /dev/null +++ b/phpunit.xml.dist @@ -0,0 +1,7 @@ + + + + test + + + \ No newline at end of file diff --git a/src/Benchmark.php b/src/Benchmark.php new file mode 100644 index 0000000..838520f --- /dev/null +++ b/src/Benchmark.php @@ -0,0 +1,147 @@ +foundStrpos = $found; + } + + /** + * @param array $needles + * @param string $haystack + * @param int $loops + */ + protected function benchmarkPregMatch(array $needles, string $haystack, int $loops) + { + print "\nSearching with preg_match...\n"; + + // Note, this actually sucks and misses cases where one needle is a prefix or + // suffix of another. + $regex = '/' . implode('|', $needles) . '/'; + + $st = microtime(1); + for ($loop = 0; $loop < $loops; ++$loop) { + $k = 0; + while (preg_match($regex, $haystack, $m, PREG_OFFSET_CAPTURE, $k)) { + $k = $m[0][1] + 1; + } + } + $et = microtime(1); + print 'time: ' . ($et - $st) . "\n"; + } + + /** + * @param array $needles + * @param string $haystack + * @param int $loops + */ + protected function benchmarkPregMatchAll(array $needles, string $haystack, int $loops) + { + print "\nSearching with preg_match_all...\n"; + + // Note, this actually sucks and misses cases where one needle is a prefix or + // suffix of another. + $regex = '/' . implode('|', $needles) . '/'; + + $st = microtime(1); + for ($loop = 0; $loop < $loops; ++$loop) { + preg_match_all($regex, $haystack, $found, PREG_OFFSET_CAPTURE); + } + $et = microtime(1); + print 'time: ' . ($et - $st) . "\n"; + } + + /** + * @param array $needles + * @param string $haystack + * @param int $loops + * @throws \Exception + */ + protected function benchmarkAhoCorasick(array $needles, string $haystack, int $loops) + { + print "\nSearching with aho corasick...\n"; + + $ac = new Search(); + foreach ($needles as $n) { + $ac->addNeedle($n); + } + $ac->finalize(); + + $st = microtime(1); + for ($loop = 0; $loop < $loops; ++$loop) { + $found = $ac->execute($haystack); + } + $et = microtime(1); + print 'time: ' . ($et - $st) . "\n"; + + // Check that the answers match. + // First sort the arrays. + $comp = function ($a, $b) { + return ($a[1] === $b[1]) ? ($a[0] > $b[0]) : ($a[1] > $b[1]); + }; + usort($found, $comp); + usort($this->foundStrpos, $comp); + + if ($this->foundStrpos !== $found) { + print "ERROR - Aho Corasick got the wrong result.\n"; + + print 'strpos size: ' . count($this->foundStrpos) . "\n"; + print 'aho corasick size: ' . count($found) . "\n"; + + $numberFound = count($found); + + for ($i = 0; $i < $numberFound; ++$i) { + if ($this->foundStrpos[$i] !== $found[$i]) { + print "Mismatch $i\n"; + print_r($this->foundStrpos[$i]); + print_r($found[$i]); + } + } + } + } + + /** + * Compares the performance of Aho Corasick against strpos, preg_match, and preg_match_all + * + * @param array $needles + * @param string $haystack + * @param int $loops + * @throws \Exception + */ + public function run(array $needles, string $haystack, int $loops = 10) + { + print 'Loaded ' . count($needles) . ' keywords to search on a text of ' . strlen($haystack) . " characters.\n"; + + $this->benchmarkStrpos($needles, $haystack, $loops); + $this->benchmarkPregMatch($needles, $haystack, $loops); + $this->benchmarkPregMatchAll($needles, $haystack, $loops); + $this->benchmarkAhoCorasick($needles, $haystack, $loops); + } +} diff --git a/src/Search.php b/src/Search.php new file mode 100644 index 0000000..6bc6a82 --- /dev/null +++ b/src/Search.php @@ -0,0 +1,154 @@ +final) { + throw new Exception('Cannot add word to finalized ahocorasick.'); + } + + $nodes = &$this->nodes; + $n = 0; + + $needleLength = strlen($needle); + + for ($i = 0; $i < $needleLength; ++$i) { + $c = $needle[$i]; + + if (!isset($nodes[$n][$c])) { + $nodes[$n][$c] = count($nodes); + $nodes[] = []; + } + $n = $nodes[$n][$c]; + } + + $nodes[$n][0][] = $needle; + } + + /** + * Create the structure needed to search text for the given keywords. + * Once you call this, you cannot add additional keywords via addNeedle(). + */ + public function finalize() + { + $nodes = &$this->nodes; + $queue = []; + + foreach ($nodes[0] as $j => $_) { + $nodes[$nodes[0][$j]][1] = 0; + $queue[] = $nodes[0][$j]; + } + + while (count($queue)) { + $r = $queue[0]; + $queue = array_slice($queue, 1); + + foreach ($nodes[$r] as $j => $_) { + if ($j === 0 || $j === 1) { + continue; + } + $v = $nodes[$r][1]; + $u = $nodes[$r][$j]; + while ($v > 0 && !isset($nodes[$v][$j])) { + $v = $nodes[$v][1]; + } + $nodes[$u][1] = $nodes[$v][$j] ?? $v; + if (isset($nodes[$nodes[$u][1]][0])) { + if (!isset($nodes[$u][0])) { + $nodes[$u][0] = []; + } + $nodes[$u][0] = array_merge($nodes[$u][0], $nodes[$nodes[$u][1]][0]); + } + $queue[] = $u; + } + } + + $this->final = 1; + } + + /** + * Search the text for the given keywords. + * + * @param string $haystack + * @return array + * @throws Exception + */ + public function execute(string $haystack):array + { + if (!$this->final) { + throw new Exception('Must call finalize() before search.'); + } + + $nodes = &$this->nodes; + $found = []; + $n = 0; + + $haystackLength = strlen($haystack); + + for ($i = 0; $i < $haystackLength; ++$i) { + $c = $haystack[$i]; + + while (!array_key_exists($c, $nodes[$n]) && $n) { + $n = $nodes[$n][1]; + if ($n === null) { + die(); + } + } + + if (isset($nodes[$n][$c])) { + $n = $nodes[$n][$c]; + } + + if (isset($nodes[$n][0])) { + $z = $nodes[$n][0]; + foreach ($z as $w) { + $found[] = [$w, $i - strlen($w) + 1]; + } + } + } + + return $found; + } +} diff --git a/test/AhoCorasickTest.php b/test/AhoCorasickTest.php new file mode 100644 index 0000000..838b6bf --- /dev/null +++ b/test/AhoCorasickTest.php @@ -0,0 +1,63 @@ +search = new Search(); + + parent::setUp(); + } + + public function testSearch() + { + $expectedResult = [ + ['cart', 2], + ['art', 3], + ['ted', 5], + ['art', 10], + ['ted', 27], + ]; + + $this->search->addNeedle('art'); + $this->search->addNeedle('cart'); + $this->search->addNeedle('ted'); + $this->search->finalize(); + + $found = $this->search->execute('a carted mart lot one blue ted'); + + $this->assertSame($expectedResult, $found); + } + + /** + * @expectedException \Exception + * @expectedExceptionMessage Must call finalize() before search. + */ + public function testCannotSearchUnlessFinalized() + { + $this->search->addNeedle('art'); + $this->search->addNeedle('cart'); + $this->search->addNeedle('ted'); + + $this->search->execute('a carted mart lot one blue ted'); + } + + /** + * @expectedException \Exception + * @expectedExceptionMessage Cannot add word to finalized ahocorasick. + */ + public function testCannotAddNeedleAfterFinalize() + { + $this->search->addNeedle('art'); + $this->search->addNeedle('cart'); + $this->search->addNeedle('ted'); + + $this->search->finalize(); + + $this->search->addNeedle('mart'); + } +}