PHPLIB-1237 Implement Parallel Benchmarks LDJSON multi-file import #1166
@@ -64,14 +64,15 @@ jobs:
       - name: "Install dependencies with Composer"
         uses: "ramsey/[email protected]"
         with:
-          composer-options: "--no-suggest"
+          composer-options: "--no-suggest --working-dir=./benchmark"

       - name: "Run phpbench"
+        working-directory: "./benchmark"
         run: "vendor/bin/phpbench run --report=aggregate --report=bar_chart_time --report=env --output html"

       - name: Upload HTML report
         uses: actions/upload-artifact@v3
         with:
           name: phpbench-${{ github.sha }}.html
-          path: .phpbench/html/index.html
+          path: ./benchmark/.phpbench/html/index.html
           retention-days: 3
@@ -0,0 +1,26 @@

Review comment (on this new composer.json): This is a proposal to define different dependencies for the benchmark: create a dedicated composer project. There is the PHP version and now the amphp dependency (which requires PHP 8). The library […]

{
    "name": "mongodb/mongodb-benchmark",
    "type": "project",
    "repositories": [
        {
            "type": "path",
            "url": "../",
            "symlink": true
        }
    ],
    "require": {
        "php": ">=8.1",
        "ext-pcntl": "*",
        "amphp/parallel-functions": "^1.1",
        "mongodb/mongodb": "@dev",
        "phpbench/phpbench": "^1.2"
    },
    "autoload": {
        "psr-4": {
            "MongoDB\\Benchmark\\": "src/"
        }
    },
    "scripts": {
        "benchmark": "phpbench run --report=aggregate"
    }
}
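The PSR-4 mapping above points MongoDB\Benchmark\ at a src/ directory that is not part of this diff. The benchmark class below calls a MongoDB\Benchmark\Utils helper (getClient(), getDatabase(), getCollection(), getDatabaseName(), getCollectionName(), reset()). As a rough sketch only — the method names are taken from the calls in the diff, while the MONGODB_URI environment variable and the database/collection names are assumptions — such a helper could look like this:

<?php

namespace MongoDB\Benchmark;

use MongoDB\Client;
use MongoDB\Collection;
use MongoDB\Database;

use function getenv;

final class Utils
{
    private static ?Client $client = null;

    public static function getClient(): Client
    {
        // Lazily create a single client. The URI and default are assumptions.
        return self::$client ??= new Client(getenv('MONGODB_URI') ?: 'mongodb://localhost:27017/');
    }

    public static function reset(): void
    {
        // Drop the cached client so a forked child process builds its own
        // Manager (and therefore its own libmongoc client).
        self::$client = null;
    }

    public static function getDatabaseName(): string
    {
        return 'phplib_benchmark'; // assumed name
    }

    public static function getCollectionName(): string
    {
        return 'perftest'; // assumed name
    }

    public static function getDatabase(): Database
    {
        return self::getClient()->selectDatabase(self::getDatabaseName());
    }

    public static function getCollection(): Collection
    {
        return self::getClient()->selectCollection(self::getDatabaseName(), self::getCollectionName());
    }
}

The detail that matters for the fork benchmark is reset(): it discards the cached Client so that each child process constructs a fresh Manager after the fork, as the comment in benchMultiFileImportFork() explains.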
@@ -0,0 +1,210 @@

<?php

namespace MongoDB\Benchmark\DriverBench;

use Amp\Parallel\Worker\DefaultPool;
use Generator;
use MongoDB\Benchmark\Fixtures\Data;
use MongoDB\Benchmark\Utils;
use MongoDB\BSON\Document;
use MongoDB\Driver\BulkWrite;
use PhpBench\Attributes\AfterClassMethods;
use PhpBench\Attributes\BeforeClassMethods;
use PhpBench\Attributes\BeforeMethods;
use PhpBench\Attributes\Iterations;
use PhpBench\Attributes\ParamProviders;
use PhpBench\Attributes\Revs;
use RuntimeException;

use function Amp\ParallelFunctions\parallelMap;
use function Amp\Promise\wait;
use function array_map;
use function count;
use function file;
use function file_get_contents;
use function file_put_contents;
use function is_dir;
use function mkdir;
use function pcntl_fork;
use function pcntl_waitpid;
use function range;
use function sprintf;
use function str_repeat;
use function sys_get_temp_dir;
use function unlink;

use const FILE_IGNORE_NEW_LINES;
use const FILE_NO_DEFAULT_CONTEXT;
use const FILE_SKIP_EMPTY_LINES;

/**
 * For accurate results, run benchmarks on a standalone server.
 *
 * @see https://github.com/mongodb/specifications/blob/ddfc8b583d49aaf8c4c19fa01255afb66b36b92e/source/benchmarking/benchmarking.rst#parallel
 */
#[BeforeClassMethods('beforeClass')]
#[AfterClassMethods('afterClass')]
final class ParallelBench
{
    public static function beforeClass(): void
    {
        // Generate files
        $fileContents = str_repeat(file_get_contents(Data::LDJSON_FILE_PATH), 5_000);
        foreach (self::getFileNames() as $file) {
            file_put_contents($file, $fileContents);
        }
    }

    public static function afterClass(): void
    {
        foreach (self::getFileNames() as $file) {
            unlink($file);
        }
    }

    /**
     * Parallel: LDJSON multi-file import
     * Using Driver's BulkWrite in a single thread
     */
    #[BeforeMethods('beforeMultiFileImport')]
    #[Revs(1)]
    #[Iterations(1)]
    public function benchMultiFileImportBulkWrite(): void
    {
        foreach (self::getFileNames() as $file) {
            self::importFile($file);
        }
    }

    /**
     * Parallel: LDJSON multi-file import
     * Using library's Collection::insertMany in a single thread
     */
    #[BeforeMethods('beforeMultiFileImport')]
    #[Revs(1)]
    #[Iterations(1)]
    public function benchMultiFileImportInsertMany(): void
    {
        $collection = Utils::getCollection();
        foreach (self::getFileNames() as $file) {
            // Read file contents into BSON documents
            $docs = array_map(
                static fn (string $line) => Document::fromJSON($line),
                file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES | FILE_NO_DEFAULT_CONTEXT),
            );
            // Insert documents in bulk
            $collection->insertMany($docs);
        }
    }

    /**
     * Parallel: LDJSON multi-file import
     * Using multiple forked threads
     *
     * @param array{processes:int, files:string[], batchSize:int} $params
     */
    #[BeforeMethods('beforeMultiFileImport')]
    #[ParamProviders(['provideProcessesParameter'])]
    #[Revs(1)]
    #[Iterations(1)]
    public function benchMultiFileImportFork(array $params): void
    {
        $pids = [];
        foreach (self::getFileNames() as $file) {
            // Wait for a child process to finish if we have reached the maximum number of processes
            if (count($pids) >= $params['processes']) {
                $pid = pcntl_waitpid(-1, $status);
                unset($pids[$pid]);
            }

            $pid = pcntl_fork();
            if ($pid === 0) {
                // Reset to ensure that the existing libmongoc client (via the Manager) is not re-used by the child
                // process. When the child process constructs a new Manager, the differing PID will result in creation
                // of a new libmongoc client.
                Utils::reset();

                self::importFile($file);

                // Exit the child process
                exit(0);
            }

            if ($pid === -1) {
                throw new RuntimeException('Failed to fork');
            }

            // Keep the forked process id to wait for it later
            $pids[$pid] = true;
        }

        // Wait for all child processes to finish
        while ($pids !== []) {
            $pid = pcntl_waitpid(-1, $status);
            unset($pids[$pid]);
        }
    }

    /**
     * Parallel: LDJSON multi-file import
     * Using amphp/parallel-functions with worker pool
     *
     * @param array{processes:int, files:string[], batchSize:int} $params
     */
    #[BeforeMethods('beforeMultiFileImport')]
    #[ParamProviders(['provideProcessesParameter'])]
    #[Revs(1)]
    #[Iterations(1)]
    public function benchMultiFileImportAmp(array $params): void
    {
        wait(parallelMap(
            self::getFileNames(),
            // Uses array callable instead of closure to skip complex serialization
            [self::class, 'importFile'],
            // The pool size is the number of processes
            new DefaultPool($params['processes']),
        ));
    }

    public static function provideProcessesParameter(): Generator
    {
        yield '1 proc' => ['processes' => 1]; // 100 sequences, to compare to the single thread baseline
        yield '2 proc' => ['processes' => 2]; // 50 sequences
        yield '4 proc' => ['processes' => 4]; // 25 sequences
        yield '8 proc' => ['processes' => 8]; // 13 sequences
        yield '13 proc' => ['processes' => 13]; // 8 sequences
        yield '20 proc' => ['processes' => 20]; // 5 sequences
        yield '34 proc' => ['processes' => 34]; // 3 sequences
    }

    public function beforeMultiFileImport(): void
    {
        $database = Utils::getDatabase();
        $database->drop();
        $database->createCollection(Utils::getCollectionName());
    }

    public static function importFile(string $file): void
    {
        $namespace = sprintf('%s.%s', Utils::getDatabaseName(), Utils::getCollectionName());

        $bulkWrite = new BulkWrite();
        foreach (file($file, FILE_IGNORE_NEW_LINES | FILE_NO_DEFAULT_CONTEXT) as $line) {
            $bulkWrite->insert(Document::fromJSON($line));
        }

        Utils::getClient()->getManager()->executeBulkWrite($namespace, $bulkWrite);
    }

    private static function getFileNames(): array
    {
        $tempDir = sys_get_temp_dir() . '/mongodb-php-benchmark';
        if (! is_dir($tempDir)) {
            mkdir($tempDir);
        }

        return array_map(
            static fn (int $i) => sprintf('%s/%03d.txt', $tempDir, $i),
            range(0, 99),
        );
    }
}

Review comment on beforeClass(): I like how you prepare the files for each run instead of relying on the archive that contains a few hundred MB of the same data 👍

Review comment on benchMultiFileImportAmp(): This implementation uses a worker process that communicates with the parent process to receive serialized functions to call and send back serialized results.
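The review comment above describes the amphp mechanism: parallelMap() dispatches each array element to a pooled worker process, and both the callable and its results are serialized across the process boundary. A minimal standalone sketch of that pattern, assuming only the amphp/parallel-functions ^1.1 dependency declared in composer.json (the squaring callback and pool size are arbitrary examples, not part of this PR):

<?php

require __DIR__ . '/vendor/autoload.php';

use Amp\Parallel\Worker\DefaultPool;

use function Amp\ParallelFunctions\parallelMap;
use function Amp\Promise\wait;

// Each input value is processed in a separate worker process drawn from the
// pool; arguments and results are serialized between parent and workers.
$squares = wait(parallelMap(
    range(1, 10),
    // A static closure with no captured state keeps serialization simple,
    // mirroring the array callable used in benchMultiFileImportAmp().
    static function (int $n): int {
        return $n * $n;
    },
    new DefaultPool(4), // at most 4 concurrent worker processes
));

var_dump($squares); // [1, 4, 9, 16, ..., 100]

Unlike the pcntl_fork() variant, the pool reuses long-lived worker processes instead of forking once per file, which is the trade-off the two parametrized benchmarks are meant to compare.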
@@ -0,0 +1 @@

{"text":"@wildfits you're not getting one.....","in_reply_to_status_id":22773233453,"retweet_count":null,"contributors":null,"created_at":"Thu Sep 02 19:38:18 +0000 2010","geo":null,"source":"web","coordinates":null,"in_reply_to_screen_name":"wildfits","truncated":false,"entities":{"user_mentions":[{"indices":[0,9],"screen_name":"wildfits","name":"Mairin Goetzinger","id":41832464}],"urls":[],"hashtags":[]},"retweeted":false,"place":null,"user":{"friends_count":179,"profile_sidebar_fill_color":"7a7a7a","location":"Minneapols, MN/Brookings SD","verified":false,"follow_request_sent":null,"favourites_count":0,"profile_sidebar_border_color":"a3a3a3","profile_image_url":"http://a1.twimg.com/profile_images/1110614677/Screen_shot_2010-08-25_at_10.12.40_AM_normal.png","geo_enabled":false,"created_at":"Sun Aug 17 00:23:13 +0000 2008","description":"graphic designer + foodie, with a love of music, movies, running, design, + the outdoors!","time_zone":"Mountain Time (US & Canada)","url":"http://jessiefarris.com/","screen_name":"jessiekf","notifications":null,"profile_background_color":"303030","listed_count":1,"lang":"en"}}
Review comment: Should we document instructions for referencing benchmarks somewhere? This may warrant a new heading in CONTRIBUTING.md.

Reply: I think the goal is to run benchmarks on Evergreen and publish reports there. PHPLIB-1187