diff --git a/generator/composer.json b/generator/composer.json
index 7974a887e..436f0ad75 100644
--- a/generator/composer.json
+++ b/generator/composer.json
@@ -18,6 +18,8 @@
"mongodb/mongodb": "@dev",
"nette/php-generator": "^4",
"symfony/console": "^6.3",
+ "symfony/css-selector": "^6.3",
+ "symfony/dom-crawler": "^6.3",
"symfony/yaml": "^6.3"
},
"license": "Apache-2.0",
diff --git a/generator/config/schema.json b/generator/config/schema.json
new file mode 100644
index 000000000..422b52c39
--- /dev/null
+++ b/generator/config/schema.json
@@ -0,0 +1,122 @@
+{
+ "$schema": "http://json-schema.org/draft-06/schema#",
+ "$ref": "#/definitions/Operator",
+ "definitions": {
+ "Operator": {
+ "type": "object",
+ "additionalProperties": false,
+ "properties": {
+ "name": {
+ "$comment": "The name of the operator. Must start with a $",
+ "type": "string",
+ "pattern": "^\\$[a-z][a-zA-Z]+$"
+ },
+ "category": {
+ "$comment": "The category as defined by MongoDB's documentation.",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "link": {
+ "$comment": "The link to the operator's documentation on MongoDB's website.",
+ "type": "string",
+ "format": "uri",
+ "qt-uri-protocols": [
+ "https"
+ ]
+ },
+ "returnType": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "encode": {
+ "$comment": "Specifies how operator parameters are encoded. array: parameters are encoded as an array of values in the order they are defined by the spec. object: parameters are encoded as an object with keys matching the parameter names. single: get the single parameter value.",
+ "type": "string",
+ "enum": [
+ "array",
+ "object",
+ "single"
+ ]
+ },
+ "description": {
+ "$comment": "The description of the operator from MongoDB's documentation.",
+ "type": "string"
+ },
+ "parameters": {
+ "$comment": "An optional list of parameters for the operator.",
+ "type": "array",
+ "items": {
+ "$ref": "#/definitions/Parameter"
+ }
+ }
+ },
+ "required": [
+ "category",
+ "description",
+ "encode",
+ "link",
+ "name",
+ "parameters",
+ "returnType"
+ ],
+ "title": "Operator"
+ },
+ "Parameter": {
+ "type": "object",
+ "additionalProperties": false,
+ "properties": {
+ "name": {
+ "type": "string",
+ "pattern": "^[a-z][a-zA-Z0-9]+$"
+ },
+ "type": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "description": {
+ "$comment": "The description of the argument from MongoDB's documentation.",
+ "type": "string"
+ },
+ "optional": {
+ "$comment": "Whether the parameter is optional or not.",
+ "type": "boolean"
+ },
+ "valueMin": {
+ "$comment": "The minimum value for a numeric parameter.",
+ "type": "number"
+ },
+ "valueMax": {
+ "$comment": "The maximum value for a numeric parameter.",
+ "type": "number"
+ },
+ "variadic": {
+ "$comment": "Whether the parameter is variadic or not.",
+ "type": "string",
+ "enum": [
+ "list",
+ "map"
+ ]
+ },
+ "variadicMin": {
+ "$comment": "The minimum number of arguments for a variadic parameter.",
+ "type": "integer",
+ "minimum": 0
+ }
+ },
+ "required": [
+ "description",
+ "name",
+ "type"
+ ],
+ "title": "Parameter"
+ }
+ }
+}
diff --git a/generator/generate b/generator/generate
index 017dbef6e..caa436278 100755
--- a/generator/generate
+++ b/generator/generate
@@ -2,6 +2,7 @@
$application->add(new GenerateCommand(__DIR__ . '/../', __DIR__ . '/config'));
+$application->add(new ScrapeCommand(__DIR__ . '/config'));
$application->setDefaultCommand('generate');
$application->run();
diff --git a/generator/src/Command/ScrapeCommand.php b/generator/src/Command/ScrapeCommand.php
new file mode 100644
index 000000000..5b455eaf0
--- /dev/null
+++ b/generator/src/Command/ScrapeCommand.php
@@ -0,0 +1,235 @@
+ $tabs Associative array of names to table ids */
+ private array $tabs;
+
+ /**
+  * @param string $configDir Base directory under which the per-category
+  *                          folders and YAML spec files are written
+  */
+ public function __construct(
+ private string $configDir,
+ ) {
+ parent::__construct();
+ }
+
+ /**
+  * Registers this command under the name "scrape".
+  */
+ public function configure(): void
+ {
+ $this->setName('scrape');
+ }
+
+ /**
+  * Downloads the published operator spreadsheet and writes one YAML spec
+  * file per operator found in the supported tabs.
+  *
+  * @return int Command::SUCCESS when every tab was processed, Command::FAILURE
+  *             when the spreadsheet could not be downloaded
+  */
+ public function execute(InputInterface $input, OutputInterface $output): int
+ {
+     $index = file_get_contents('https://docs.google.com/spreadsheets/d/e/2PACX-1vROpGTJGXAKf2SVuSZaw16NwMVtzMVGH9b-YiMtddgZRZOjOO6jK2YLbTUZ0N_qe74nxGY9hYhUe-l2/pubhtml');
+
+     // file_get_contents() returns false on failure; stop before handing it to the crawler
+     if ($index === false) {
+         $output->writeln('<error>Unable to download the operator spreadsheet.</error>');
+
+         return Command::FAILURE;
+     }
+
+     $this->crawler = new Crawler($index);
+     $this->extractTabs();
+
+     // Spreadsheet tab name => sub-directory of the config dir receiving the YAML files
+     $targets = [
+         'aggregation pipeline operators' => 'aggregation-operators',
+         'query operators' => 'query-operators',
+         'aggregation pipeline stages' => 'aggregation-stages',
+     ];
+
+     foreach ($targets as $tabName => $dirname) {
+         foreach ($this->getTableData($this->crawler, $tabName) as $doc) {
+             $this->writeYamlFile($dirname, $this->formatSpec($doc));
+         }
+     }
+
+     return Command::SUCCESS;
+ }
+
+ /**
+  * Builds the tab name => sheet id lookup from the sheet menu of the loaded page.
+  */
+ private function extractTabs(): void
+ {
+     $menuItems = $this->crawler->filter('#sheet-menu > li');
+
+     // One entry per menu item: its label and its id attribute without the "sheet-button-" part
+     $entries = $menuItems->each(function (Crawler $item) {
+         $label = $item->text();
+         $sheetId = str_replace('sheet-button-', '', $item->attr('id'));
+
+         return ['name' => $label, 'id' => $sheetId];
+     });
+
+     $names = array_column($entries, 'name');
+     $ids = array_column($entries, 'id');
+
+     $this->tabs = array_combine($names, $ids);
+ }
+
+ /**
+  * Extracts the rows of one spreadsheet tab and groups them into documents.
+  *
+  * The first table row provides the column headers. A row whose first cell
+  * is non-empty starts a new document; following rows with an empty first
+  * cell add further entries to that document's "Args" list. Values of
+  * columns whose header contains "Arg" are stored under "Args", keyed by the
+  * header with "Arg" removed; all other non-null values are stored under
+  * their header name.
+  *
+  * @param Crawler $crawler Crawler loaded with the published spreadsheet HTML
+  * @param string  $tabName Tab label, as collected by extractTabs()
+  *
+  * @return list<array<string, mixed>>
+  *
+  * @throws InvalidArgumentException If no tab with the given name exists
+  */
+ private function getTableData(Crawler $crawler, string $tabName): array
+ {
+     $id = $this->tabs[$tabName] ?? throw new InvalidArgumentException('Invalid tab name: ' . $tabName);
+
+     $table = $crawler->filter('#' . $id . ' table > tbody');
+
+     // Load the table into a 2D array
+     $rows = [];
+     $table->filter('tr')->each(function (Crawler $row, $rowIndex) use (&$rows): void {
+         $cellIndex = 0;
+
+         $row->filter('td')->each(function (Crawler $cell) use (&$rows, $rowIndex, &$cellIndex): void {
+             // Skip freezebar cells
+             if (str_contains($cell->attr('class') ?? '', 'freezebar-cell')) {
+                 return;
+             }
+
+             // A missing, empty or zero span attribute counts as 1
+             $rowspan = max(1, (int) $cell->attr('rowspan'));
+             $colspan = max(1, (int) $cell->attr('colspan'));
+
+             // Advance to the next available cell; cells reserved by an earlier
+             // rowspan/colspan hold null and still count as taken
+             while (array_key_exists($rowIndex, $rows) && array_key_exists($cellIndex, $rows[$rowIndex])) {
+                 $cellIndex++;
+             }
+
+             $value = $cell->html();
+             // Turn line-break tags into newlines before the tags are stripped:
+             // multi-valued cells are split on PHP_EOL when the spec is formatted
+             $value = str_replace(['<br>', '<br/>', '<br />'], PHP_EOL, $value);
+             $value = strip_tags($value);
+             $value = htmlspecialchars_decode($value);
+             $value = trim($value);
+
+             // The value goes into the top-left cell of the span; every other
+             // cell covered by rowspan/colspan is reserved with null
+             for ($rowOffset = 0; $rowOffset < $rowspan; $rowOffset++) {
+                 for ($colOffset = 0; $colOffset < $colspan; $colOffset++) {
+                     $isOrigin = $rowOffset === 0 && $colOffset === 0;
+                     $rows[$rowIndex + $rowOffset][$cellIndex + $colOffset] = $isOrigin ? $value : null;
+                 }
+             }
+         });
+
+         // Cells reserved by earlier rows were inserted first; restore column order
+         if (isset($rows[$rowIndex])) {
+             ksort($rows[$rowIndex]);
+         }
+     });
+
+     // Extract headers from first row (an empty table yields no headers and no rows)
+     $headers = array_shift($rows) ?? [];
+
+     // Map header to field names + aggregate args
+     $docs = [];
+     $docId = 0;
+     $argId = 0;
+     foreach ($rows as $row) {
+         // Create a new document for each row that starts with a non-empty cell
+         if (! empty($row[0]) && $docs !== []) {
+             $docId++;
+             $argId = 0;
+         }
+
+         foreach ($row as $index => $cell) {
+             // A column without a header cannot be mapped to a field name
+             $header = $headers[$index] ?? null;
+             if ($header === null) {
+                 continue;
+             }
+
+             if (str_contains($header, 'Arg')) {
+                 $docs[$docId]['Args'][$argId][str_replace('Arg', '', $header)] = $cell;
+             } elseif (null !== $cell) {
+                 $docs[$docId][$header] = $cell;
+             }
+         }
+
+         $argId++;
+     }
+
+     return $docs;
+ }
+
+ /**
+  * Converts one scraped spreadsheet document into the structure that is
+  * dumped to the operator's YAML spec file.
+  *
+  * Multi-line "Category", "ReturnType" and argument "Type" cells are split
+  * on PHP_EOL into lists; categories are sorted. An argument is marked
+  * optional when its "Options" cell contains "Optional". Empty descriptions
+  * are omitted.
+  *
+  * @param array{
+  *     Name: string,
+  *     Category: string,
+  *     Description: string,
+  *     Link: string,
+  *     ReturnType: string,
+  *     Encode: string,
+  *     Args: list<array{ Name: string, Type: string, Options: string, Description: string }>
+  * } $doc
+  *
+  * @return array<string, mixed>
+  */
+ private function formatSpec(array $doc): array
+ {
+     foreach (['Name', 'Category', 'Description', 'Link', 'ReturnType', 'Encode', 'Args'] as $key) {
+         assert(isset($doc[$key]), 'Missing ' . $key . ' for ' . var_export($doc, true));
+     }
+
+     $spec = [];
+     $spec['name'] = $doc['Name'];
+     $spec['category'] = explode(PHP_EOL, $doc['Category']);
+     sort($spec['category']);
+     $spec['link'] = $doc['Link'];
+     $spec['returnType'] = explode(PHP_EOL, $doc['ReturnType']);
+     $spec['encode'] = $doc['Encode'];
+
+     if ($doc['Description']) {
+         $spec['description'] = $doc['Description'] . PHP_EOL;
+     }
+
+     foreach ($doc['Args'] as $arg) {
+         foreach (['Name', 'Type', 'Options', 'Description'] as $key) {
+             assert(isset($arg[$key]), 'Missing Arg' . $key . ' for ' . var_export($doc, true));
+         }
+
+         $parameter = [];
+         $parameter['name'] = $arg['Name'];
+         // The accepted types come from the argument's own "Type" cell,
+         // not from the operator's return type
+         $parameter['type'] = explode(PHP_EOL, $arg['Type']);
+         if (str_contains($arg['Options'], 'Optional')) {
+             $parameter['optional'] = true;
+         }
+
+         if ($arg['Description']) {
+             $parameter['description'] = $arg['Description'] . PHP_EOL;
+         }
+
+         $spec['parameters'][] = $parameter;
+     }
+
+     return $spec;
+ }
+
+ /**
+  * Dumps an operator spec as YAML into "<configDir>/<dirname>/<name>.yaml".
+  *
+  * The file name is the operator name without "$" (or "positional" when
+  * nothing is left). The file starts with a schema reference comment and
+  * always ends with a newline.
+  *
+  * @param string               $dirname Sub-directory of the config directory
+  * @param array<string, mixed> $data    Spec to dump; must contain a "name" entry
+  *
+  * @throws \RuntimeException If the directory cannot be created or the file cannot be written
+  */
+ private function writeYamlFile(string $dirname, array $data): void
+ {
+     $yaml = Yaml::dump($data, 3, 4, Yaml::DUMP_MULTI_LINE_LITERAL_BLOCK);
+     $dirname = $this->configDir . '/' . $dirname;
+
+     // Create missing parent directories as well; the trailing is_dir() accepts
+     // a directory that appeared between the check and the mkdir() call
+     if (! is_dir($dirname) && ! mkdir($dirname, 0755, true) && ! is_dir($dirname)) {
+         throw new \RuntimeException(sprintf('Unable to create directory "%s"', $dirname));
+     }
+
+     $name = str_replace('$', '', $data['name']) ?: 'positional';
+     $filename = $dirname . '/' . $name . '.yaml';
+
+     // Add a schema reference to the top of the file
+     $schema = '# $schema: ../schema.json' . PHP_EOL;
+
+     // Add a trailing newline if one is not present
+     if (! str_ends_with($yaml, PHP_EOL)) {
+         $yaml .= PHP_EOL;
+     }
+
+     if (file_put_contents($filename, $schema . $yaml) === false) {
+         throw new \RuntimeException(sprintf('Unable to write file "%s"', $filename));
+     }
+ }
+}