diff --git a/generator/composer.json b/generator/composer.json
index 7974a887e..436f0ad75 100644
--- a/generator/composer.json
+++ b/generator/composer.json
@@ -18,6 +18,8 @@
         "mongodb/mongodb": "@dev",
         "nette/php-generator": "^4",
         "symfony/console": "^6.3",
+        "symfony/css-selector": "^6.3",
+        "symfony/dom-crawler": "^6.3",
         "symfony/yaml": "^6.3"
     },
     "license": "Apache-2.0",
diff --git a/generator/config/schema.json b/generator/config/schema.json
new file mode 100644
index 000000000..422b52c39
--- /dev/null
+++ b/generator/config/schema.json
@@ -0,0 +1,119 @@
+{
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "$ref": "#/definitions/Operator",
+    "definitions": {
+        "Operator": {
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+                "name": {
+                    "$comment": "The name of the operator. Must start with a $; the bare positional operator \"$\" is accepted.",
+                    "type": "string",
+                    "pattern": "^\\$([a-z][a-zA-Z0-9]*)?$"
+                },
+                "category": {
+                    "$comment": "The category as defined by MongoDB's documentation.",
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "link": {
+                    "$comment": "The link to the operator's documentation on MongoDB's website.",
+                    "type": "string",
+                    "format": "uri",
+                    "qt-uri-protocols": [
+                        "https"
+                    ]
+                },
+                "returnType": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "encode": {
+                    "$comment": "Specifies how operator parameters are encoded. array: parameters are encoded as an array of values in the order they are defined by the spec; object: parameters are encoded as an object with keys matching the parameter names; single: get the single parameter value.",
+                    "type": "string",
+                    "enum": [
+                        "array",
+                        "object",
+                        "single"
+                    ]
+                },
+                "description": {
+                    "$comment": "The description of the operator from MongoDB's documentation.",
+                    "type": "string"
+                },
+                "parameters": {
+                    "$comment": "An optional list of parameters for the operator.",
+                    "type": "array",
+                    "items": {
+                        "$ref": "#/definitions/Parameter"
+                    }
+                }
+            },
+            "required": [
+                "category",
+                "description",
+                "encode",
+                "link",
+                "name",
+                "parameters",
+                "returnType"
+            ],
+            "title": "Operator"
+        },
+        "Parameter": {
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+                "name": {
+                    "type": "string",
+                    "pattern": "^[a-z][a-zA-Z0-9]+$"
+                },
+                "type": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "description": {
+                    "$comment": "The description of the argument from MongoDB's documentation.",
+                    "type": "string"
+                },
+                "optional": {
+                    "$comment": "Whether the parameter is optional or not.",
+                    "type": "boolean"
+                },
+                "valueMin": {
+                    "$comment": "The minimum value for a numeric parameter.",
+                    "type": "number"
+                },
+                "valueMax": {
+                    "$comment": "The maximum value for a numeric parameter.",
+                    "type": "number"
+                },
+                "variadic": {
+                    "$comment": "Whether the parameter is variadic or not.",
+                    "type": "string",
+                    "enum": [
+                        "list",
+                        "map"
+                    ]
+                },
+                "variadicMin": {
+                    "$comment": "The minimum number of arguments for a variadic parameter.",
+                    "type": "integer",
+                    "minimum": 0
+                }
+            },
+            "required": [
+                "description",
+                "name",
+                "type"
+            ],
+            "title": "Parameter"
+        }
+    }
+}
diff --git a/generator/generate b/generator/generate
index 017dbef6e..caa436278 100755
--- a/generator/generate
+++ b/generator/generate
@@ -2,6 +2,7 @@
 add(new GenerateCommand(__DIR__ . '/../', __DIR__ . '/config'));
+$application->add(new ScrapeCommand(__DIR__ . '/config'));
 $application->setDefaultCommand('generate');
 $application->run();
diff --git a/generator/src/Command/ScrapeCommand.php b/generator/src/Command/ScrapeCommand.php
new file mode 100644
index 000000000..5b455eaf0
--- /dev/null
+++ b/generator/src/Command/ScrapeCommand.php
@@ -0,0 +1,235 @@
+ $tabs Associative array of names to table ids */
+    private array $tabs;
+
+    /** @param string $configDir Directory below which the per-category YAML directories are written */
+    public function __construct(
+        private string $configDir,
+    ) {
+        parent::__construct();
+    }
+
+    /** Registers the command under the name "scrape". */
+    public function configure(): void
+    {
+        $this->setName('scrape');
+    }
+
+    /**
+     * Downloads the published spreadsheet and writes one YAML spec file for
+     * every document found in each of the three operator tabs.
+     *
+     * @return int Command::SUCCESS, or Command::FAILURE when the spreadsheet cannot be downloaded
+     */
+    public function execute(InputInterface $input, OutputInterface $output): int
+    {
+        $index = file_get_contents('https://docs.google.com/spreadsheets/d/e/2PACX-1vROpGTJGXAKf2SVuSZaw16NwMVtzMVGH9b-YiMtddgZRZOjOO6jK2YLbTUZ0N_qe74nxGY9hYhUe-l2/pubhtml');
+
+        // file_get_contents() returns false on failure; stop here instead of
+        // handing a non-string to the crawler.
+        if ($index === false) {
+            $output->writeln('<error>Unable to download the operator spreadsheet</error>');
+
+            return Command::FAILURE;
+        }
+
+        $this->crawler = new Crawler($index);
+        $this->extractTabs();
+
+        // Spreadsheet tab name => target directory below the config directory
+        $targets = [
+            'aggregation pipeline operators' => 'aggregation-operators',
+            'query operators' => 'query-operators',
+            'aggregation pipeline stages' => 'aggregation-stages',
+        ];
+
+        foreach ($targets as $tabName => $dirname) {
+            foreach ($this->getTableData($this->crawler, $tabName) as $doc) {
+                $this->writeYamlFile($dirname, $this->formatSpec($doc));
+            }
+        }
+
+        return Command::SUCCESS;
+    }
+
+    /** Builds the tab name => sheet id map from the "#sheet-menu" list of the downloaded page. */
+    private function extractTabs(): void
+    {
+        // Extract tab names and ids
+        $tabs = $this->crawler->filter('#sheet-menu > li')->each(static fn (Crawler $li): array => [
+            'name' => $li->text(),
+            // attr() yields null when the attribute is missing
+            'id' => str_replace('sheet-button-', '', $li->attr('id') ?? ''),
+        ]);
+
+        $this->tabs = array_column($tabs, 'id', 'name');
+    }
+
+    /**
+     * Reads the HTML table of a spreadsheet tab and groups its rows into documents.
+     *
+     * The first table row provides the column headers. A row whose first cell is
+     * non-empty starts a new document; every row contributes one entry to the
+     * document's "Args" list, built from the columns whose header contains "Arg".
+     *
+     * @return list<array<string, mixed>>
+     *
+     * @throws InvalidArgumentException When no tab with the given name was extracted.
+     */
+    private function getTableData(Crawler $crawler, string $tabName): array
+    {
+        $id = $this->tabs[$tabName] ?? throw new InvalidArgumentException('Invalid tab name: ' . $tabName);
+
+        $table = $crawler->filter('#' . $id . ' table > tbody');
+
+        // Load the table into a 2D array
+        $rows = [];
+        $table->filter('tr')->each(function (Crawler $row, int $rowIndex) use (&$rows): void {
+            $cellIndex = 0;
+
+            $row->filter('td')->each(function (Crawler $cell) use (&$rows, $rowIndex, &$cellIndex): void {
+                // Skip freezebar cells
+                if (str_contains($cell->attr('class') ?? '', 'freezebar-cell')) {
+                    return;
+                }
+
+                // A missing or empty rowspan attribute means the cell covers a single row
+                $rowspan = max(1, (int) $cell->attr('rowspan'));
+
+                // Advance to the next available cell
+                while (isset($rows[$rowIndex]) && array_key_exists($cellIndex, $rows[$rowIndex])) {
+                    $cellIndex++;
+                }
+
+                $value = $cell->html();
+                // NOTE(review): the three search strings were lost in extraction and are
+                // restored here as the usual <br> variants - confirm against the upstream file.
+                $value = str_replace(['<br>', '<br/>', '<br />'], PHP_EOL, $value);
+                $value = strip_tags($value);
+                $value = htmlspecialchars_decode($value);
+                $value = trim($value);
+
+                // Store the value in the cell's own row and reserve the same column
+                // with null in the following rows covered by its rowspan.
+                // Only rowspan is handled; colspan is not.
+                $rows[$rowIndex][$cellIndex] = $value;
+                for ($spannedRow = $rowIndex + 1; $spannedRow < $rowIndex + $rowspan; $spannedRow++) {
+                    $rows[$spannedRow][$cellIndex] = null;
+                }
+            });
+
+            // Placeholders may have been inserted out of order; restore column order
+            if (isset($rows[$rowIndex])) {
+                ksort($rows[$rowIndex]);
+            }
+        });
+
+        // Extract headers from first row
+        $headers = array_shift($rows);
+
+        // Map header to field names + aggregate args
+        $docs = [];
+        $docId = 0;
+        $argId = 0;
+        foreach ($rows as $row) {
+            // Create a new document for each row that starts with a non-empty cell
+            if (! empty($row[0]) && $docs !== []) {
+                $docId++;
+                $argId = 0;
+            }
+
+            foreach ($row as $index => $cell) {
+                if (str_contains($headers[$index], 'Arg')) {
+                    $docs[$docId]['Args'][$argId][str_replace('Arg', '', $headers[$index])] = $cell;
+                } elseif (null !== $cell) {
+                    $docs[$docId][$headers[$index]] = $cell;
+                }
+            }
+
+            $argId++;
+        }
+
+        return $docs;
+    }
+
+    /**
+     * Converts a scraped document into the structure that is dumped to YAML.
+     *
+     * @param array{
+     *     Name: string,
+     *     Category: string,
+     *     Description: string,
+     *     Link: string,
+     *     ReturnType: string,
+     *     Encode: string,
+     *     Args: list<array{ Name: string, Type: string, Options: string, Description: string }>
+     * } $doc
+     *
+     * @return array<string, mixed>
+     */
+    private function formatSpec(array $doc): array
+    {
+        foreach (['Name', 'Category', 'Description', 'Link', 'ReturnType', 'Encode', 'Args'] as $key) {
+            assert(isset($doc[$key]), 'Missing ' . $key . ' for ' . var_export($doc, true));
+        }
+
+        $spec = [];
+        $spec['name'] = $doc['Name'];
+        // Multi-valued cells hold one value per line
+        $spec['category'] = explode(PHP_EOL, $doc['Category']);
+        sort($spec['category']);
+        $spec['link'] = $doc['Link'];
+        $spec['returnType'] = explode(PHP_EOL, $doc['ReturnType']);
+        $spec['encode'] = $doc['Encode'];
+
+        if ($doc['Description']) {
+            $spec['description'] = $doc['Description'] . PHP_EOL;
+        }
+
+        foreach ($doc['Args'] as $arg) {
+            foreach (['Name', 'Type', 'Options', 'Description'] as $key) {
+                assert(isset($arg[$key]), 'Missing Arg' . $key . ' for ' . var_export($doc, true));
+            }
+
+            $parameter = [];
+            $parameter['name'] = $arg['Name'];
+            // The accepted types come from the argument itself, not from the operator's return type
+            $parameter['type'] = explode(PHP_EOL, $arg['Type']);
+            if (str_contains($arg['Options'], 'Optional')) {
+                $parameter['optional'] = true;
+            }
+
+            if ($arg['Description']) {
+                $parameter['description'] = $arg['Description'] . PHP_EOL;
+            }
+
+            $spec['parameters'][] = $parameter;
+        }
+
+        return $spec;
+    }
+
+    /**
+     * Dumps a spec to "<configDir>/<dirname>/<name>.yaml", where <name> is the
+     * operator name without "$" ("positional" when nothing is left).
+     *
+     * @param array<string, mixed> $data Spec as returned by formatSpec()
+     *
+     * @throws \RuntimeException When the directory cannot be created or the file cannot be written.
+     */
+    private function writeYamlFile(string $dirname, array $data): void
+    {
+        $yaml = Yaml::dump($data, 3, 4, Yaml::DUMP_MULTI_LINE_LITERAL_BLOCK);
+        $dirname = $this->configDir . '/' . $dirname;
+
+        // Create missing parent directories as well; the trailing is_dir() accepts
+        // a directory that appeared while mkdir() was failing.
+        if (! is_dir($dirname) && ! mkdir($dirname, 0755, true) && ! is_dir($dirname)) {
+            throw new \RuntimeException('Unable to create directory: ' . $dirname);
+        }
+
+        $name = str_replace('$', '', $data['name']) ?: 'positional';
+        $filename = $dirname . '/' . $name . '.yaml';
+
+        // Add a schema reference to the top of the file
+        $schema = '# $schema: ../schema.json' . PHP_EOL;
+
+        // Add a trailing newline if one is not present
+        if (! str_ends_with($yaml, PHP_EOL)) {
+            $yaml .= PHP_EOL;
+        }
+
+        if (file_put_contents($filename, $schema . $yaml) === false) {
+            throw new \RuntimeException('Unable to write file: ' . $filename);
+        }
+    }
+}