From afaf7bbdf558ed6992c81a4d6533ebb1ec34fdb8 Mon Sep 17 00:00:00 2001 From: Shish Date: Thu, 14 Dec 2023 02:08:50 +0000 Subject: [PATCH] fix more search edge-cases --- core/imageboard/image.php | 84 +++++++++++++++++++++++++-------------- ext/index/test.php | 26 +++++++++--- 2 files changed, 75 insertions(+), 35 deletions(-) diff --git a/core/imageboard/image.php b/core/imageboard/image.php index ec30eb64b..fff41e985 100644 --- a/core/imageboard/image.php +++ b/core/imageboard/image.php @@ -159,7 +159,8 @@ private static function find_images_internal(int $start = 0, ?int $limit = null, } } - $querylet = Image::build_search_querylet($tags, $limit, $start); + [$tag_conditions, $img_conditions, $order] = self::terms_to_conditions($tags); + $querylet = self::build_search_querylet($tag_conditions, $img_conditions, $order, $limit, $start); return $database->get_all_iterable($querylet->sql, $querylet->variables); } @@ -246,7 +247,8 @@ public static function count_images(array $tags = []): int if (Extension::is_enabled(RatingsInfo::KEY)) { $tags[] = "rating:*"; } - $querylet = Image::build_search_querylet($tags); + [$tag_conditions, $img_conditions, $order] = self::terms_to_conditions($tags); + $querylet = self::build_search_querylet($tag_conditions, $img_conditions, $order); $total = (int)$database->get_one("SELECT COUNT(*) AS cnt FROM ($querylet->sql) AS tbl", $querylet->variables); if (SPEED_HAX && $total > 5000) { // when we have a ton of images, the count @@ -307,7 +309,8 @@ public function get_next(array $tags = [], bool $next = true): ?Image } else { $tags[] = 'id'. $gtlt . $this->id; $tags[] = 'order:id_'. strtolower($dir); - $querylet = Image::build_search_querylet($tags); + [$tag_conditions, $img_conditions, $order] = self::terms_to_conditions($tags); + $querylet = self::build_search_querylet($tag_conditions, $img_conditions, $order); $querylet->append_sql(' LIMIT 1'); $row = $database->get_row($querylet->sql, $querylet->variables); } @@ -749,13 +752,13 @@ private static function tag_or_wildcard_to_ids(string $tag): array } /** + * Turn a human input string into a an abstract search query + * * @param string[] $terms + * @return array{0: TagCondition[], 1: ImgCondition[], 2: string} */ - private static function build_search_querylet( - array $terms, - ?int $limit = null, - ?int $offset = null - ): Querylet { + private static function terms_to_conditions(array $terms): array + { global $config; $tag_conditions = []; @@ -776,18 +779,30 @@ private static function build_search_querylet( $order = ($order ?: "images.".$config->get_string(IndexConfig::ORDER)); - /* - * Turn a bunch of Querylet objects into a base query - * - * Must follow the format - * - * SELECT images.* - * FROM (...) AS images - * WHERE (...) - * - * ie, return a set of images.* columns, and end with a WHERE - */ + return [$tag_conditions, $img_conditions, $order]; + } + /** + * Turn an abstract search query into an SQL Querylet + * + * Must follow the format + * + * SELECT images.* + * FROM (...) AS images + * WHERE (...) + * + * ie, return a set of images.* columns, and end with a WHERE + * + * @param TagCondition[] $tag_conditions + * @param ImgCondition[] $img_conditions + */ + private static function build_search_querylet( + array $tag_conditions, + array $img_conditions, + string $order, + ?int $limit = null, + ?int $offset = null + ): Querylet { // no tags, do a simple search if (count($tag_conditions) === 0) { $query = new Querylet("SELECT images.* FROM images WHERE 1=1"); @@ -796,15 +811,28 @@ private static function build_search_querylet( // one tag sorted by ID - we can fetch this from the image_tags table, // and do the offset / limit there, which is 10x faster than fetching // all the image_tags and doing the offset / limit on the result. + // + // NOTE: this is currently impossible to test, because the test suite + // loads all extensions, some of whom add generic img_conditions onto + // the search, which prevents this optimisation from being used. elseif ( count($tag_conditions) === 1 + && $tag_conditions[0]->positive + // We can only do this if img_conditions is empty, because + // we're going to apply the offset / limit to the image_tags + // subquery, and applying extra conditions to the top-level + // query might reduce the total results below the target limit && empty($img_conditions) + // We can only do this if we're sorting by ID, because + // we're going to be using the image_tags table, which + // only has image_id and tag_id, not any other columns && ($order == "id DESC" || $order == "images.id DESC") - && !is_null($offset) + // This is only an optimisation if we are applying limit + // and offset && !is_null($limit) + && !is_null($offset) ) { $tc = $tag_conditions[0]; - $in = $tc->positive ? "IN" : "NOT IN"; // IN (SELECT id FROM tags) is 100x slower than doing a separate // query and then a second query for IN(first_query_results)?? $tag_array = self::tag_or_wildcard_to_ids($tc->tag); @@ -820,17 +848,16 @@ private static function build_search_querylet( $query = new Querylet(" SELECT images.* FROM images INNER JOIN ( - SELECT it.image_id + SELECT DISTINCT it.image_id FROM image_tags it - WHERE it.tag_id $in ($set) + WHERE it.tag_id IN ($set) ORDER BY it.image_id DESC LIMIT :limit OFFSET :offset ) a on a.image_id = images.id - ORDER BY images.id DESC + WHERE 1=1 ", ["limit" => $limit, "offset" => $offset]); // don't offset at the image level because // we already offset at the image_tags level - $order = null; $limit = null; $offset = null; } @@ -890,7 +917,7 @@ private static function build_search_querylet( } $first = array_shift($inner_joins); - $sub_query = "SELECT it.image_id FROM image_tags it "; + $sub_query = "SELECT DISTINCT it.image_id FROM image_tags it "; $i = 0; foreach ($inner_joins as $inner_join) { $i++; @@ -946,9 +973,8 @@ private static function build_search_querylet( $query->append(new Querylet($img_sql, $img_vars)); } - if (!is_null($order)) { - $query->append(new Querylet(" ORDER BY ".$order)); - } + $query->append(new Querylet(" ORDER BY ".$order)); + if (!is_null($limit)) { $query->append(new Querylet(" LIMIT :limit ", ["limit" => $limit])); $query->append(new Querylet(" OFFSET :offset ", ["offset" => $offset])); diff --git a/ext/index/test.php b/ext/index/test.php index 47146e7a9..127b306ac 100644 --- a/ext/index/test.php +++ b/ext/index/test.php @@ -178,14 +178,21 @@ public function testWildSearchOneResult($image_ids) } #[Depends('testUpload')] - public function testWildSearchManyResults($image_ids) + public function testWildSearchManyResultsSimple($image_ids) { $image_ids = $this->testUpload(); - // two images match comp* - one matches it once, - // one matches it twice + // two images match comp* - one matches it once, one matches it twice $this->assert_search_results(["comp*"], [$image_ids[1], $image_ids[0]]); } + #[Depends('testUpload')] + public function testWildSearchManyResultsComplex($image_ids) + { + $image_ids = $this->testUpload(); + // same thing, but with the complex branch + $this->assert_search_results(["comp*", "-asdf"], [$image_ids[1], $image_ids[0]]); + } + /* * * * * * * * * * * * Mixed * * * * * * * * * * * */ @@ -196,20 +203,27 @@ public function testMixedSearchTagMeta($image_ids) // multiple tags, many results $this->assert_search_results(["computer", "size=640x480"], [$image_ids[1], $image_ids[0]]); } - // tag + negative - // wildcards + ??? /* * * * * * * * * * * * Negative * * * * * * * * * * * */ #[Depends('testUpload')] - public function testNegative($image_ids) + public function testSubtractFromSearch($image_ids) { $image_ids = $this->testUpload(); // negative tag, should have one result $this->assert_search_results(["computer", "-pbx"], [$image_ids[1]]); + // removing something that doesn't exist should have no effect + $this->assert_search_results(["computer", "-not_a_tag"], [$image_ids[1], $image_ids[0]]); + } + + #[Depends('testUpload')] + public function testSubtractFromDefault($image_ids) + { + $image_ids = $this->testUpload(); + // negative tag alone, should work $this->assert_search_results(["-pbx"], [$image_ids[1]]);