From d235fd2b9eb0126fc1a37304a6865e480e66a7e0 Mon Sep 17 00:00:00 2001
From: ffont
Date: Tue, 23 Jan 2024 12:30:04 +0100
Subject: [PATCH 01/28] Add support for search engine document updates

This means that now documents can be partially updated instead of always
being completely replaced. This feature is not yet used anywhere, but it
will be useful when adding similarity data to the search engine.

https://github.com/MTG/freesound/issues/1714
---
 utils/search/__init__.py                      |   7 +-
 utils/search/backends/solr555pysolr.py        | 519 ++++++++----------
 utils/search/backends/solr9pysolr.py          |   4 +-
 .../backends/tests/test_solr555pysolr.py      |   4 +-
 .../search/backends/tests/test_solr_common.py |   5 +-
 utils/search/search_sounds.py                 |  10 +-
 6 files changed, 264 insertions(+), 285 deletions(-)

diff --git a/utils/search/__init__.py b/utils/search/__init__.py
index 44873af81..50fbe5518 100644
--- a/utils/search/__init__.py
+++ b/utils/search/__init__.py
@@ -184,11 +184,16 @@ class SearchEngineBase:
 
     # Sound search related methods
 
-    def add_sounds_to_index(self, sound_objects):
+    def add_sounds_to_index(self, sound_objects, fields_to_include=[], update_mode=False):
         """Indexes the provided sound objects in the search index
 
         Args:
             sound_objects (list[sounds.models.Sound]): Sound objects of the sounds to index
+            fields_to_include (list[str]): Specific sound fields that will be included in the document to
+                be indexed. If empty, all available sound fields will be included.
+            update_mode (bool): Whether to perform an update of the existing documents in the index or to
+                completely replace them. An update is useful so that fields not included in the document are
+                not removed from the index.
         """
         raise NotImplementedError
 
diff --git a/utils/search/backends/solr555pysolr.py b/utils/search/backends/solr555pysolr.py
index e23493b9f..1dd258103 100644
--- a/utils/search/backends/solr555pysolr.py
+++ b/utils/search/backends/solr555pysolr.py
@@ -93,227 +93,6 @@
 }
 
 
-def convert_sound_to_search_engine_document(sound):
-    """
-    TODO: Document that this includes remove_control_chars due to originally sending XML. not strictly necessary when submitting
-    to json (and also, freesound model code fixes this), but keep it in to ensure that docs are clean.
-    TODO: Assert that sound object is correct?
- """ - document = {} - - # Basic sound fields - keep_fields = ['username', 'created', 'is_explicit', 'is_remix', 'num_ratings', 'channels', 'md5', - 'was_remixed', 'original_filename', 'duration', 'id', 'num_downloads', 'filesize'] - for key in keep_fields: - document[key] = getattr(sound, key) - if sound.type == '': - document["type"] = "wav" - else: - document["type"] = sound.type - document["original_filename"] = remove_control_chars(getattr(sound, "original_filename")) - document["description"] = remove_control_chars(getattr(sound, "description")) - document["tag"] = list(set([t.lower() for t in getattr(sound, "tag_array")])) - document["license"] = getattr(sound, "license_name") - if document["num_ratings"] >= settings.MIN_NUMBER_RATINGS: - document["avg_rating"] = getattr(sound, "avg_rating") - else: - document["avg_rating"] = 0 - - if getattr(sound, "pack_id"): - document["pack"] = remove_control_chars(getattr(sound, "pack_name")) - document["grouping_pack"] = str(getattr(sound, "pack_id")) + "_" + remove_control_chars( - getattr(sound, "pack_name")) - else: - document["grouping_pack"] = str(getattr(sound, "id")) - - document["is_geotagged"] = False - if getattr(sound, "geotag_id"): - document["is_geotagged"] = True - if not math.isnan(getattr(sound, "geotag_lon")) and not math.isnan(getattr(sound, "geotag_lat")): - document["geotag"] = str(getattr(sound, "geotag_lon")) + " " + str(getattr(sound, "geotag_lat")) - - document["in_remix_group"] = getattr(sound, "was_remixed") or getattr(sound, "is_remix") - - document["bitdepth"] = getattr(sound, "bitdepth") if getattr(sound, "bitdepth") else 0 - document["bitrate"] = getattr(sound, "bitrate") if getattr(sound, "bitrate") else 0 - document["samplerate"] = int(getattr(sound, "samplerate")) if getattr(sound, "samplerate") else 0 - - document["comment"] = [remove_control_chars(comment_text) for comment_text in getattr(sound, "comments_array")] - document["comments"] = getattr(sound, "num_comments") - locations = sound.locations() - document["waveform_path_m"] = locations["display"]["wave"]["M"]["path"] - document["waveform_path_l"] = locations["display"]["wave"]["L"]["path"] - document["spectral_path_m"] = locations["display"]["spectral"]["M"]["path"] - document["spectral_path_l"] = locations["display"]["spectral"]["L"]["path"] - document["preview_path"] = locations["preview"]["LQ"]["mp3"]["path"] - - # Analyzer's output - for analyzer_name, analyzer_info in settings.ANALYZERS_CONFIGURATION.items(): - if 'descriptors_map' in analyzer_info: - query_select_name = analyzer_name.replace('-', '_') - analysis_data = getattr(sound, query_select_name, None) - if analysis_data is not None: - # If analysis is present, index all existing analysis fields using SOLR dynamic fields depending on - # the value type (see SOLR_DYNAMIC_FIELDS_SUFFIX_MAP) so solr knows how to treat when filtering, etc. 
- for key, value in json.loads(analysis_data).items(): - if isinstance(value, list): - # Make sure that the list is formed by strings - value = [f'{item}' for item in value] - suffix = SOLR_DYNAMIC_FIELDS_SUFFIX_MAP.get(type(value), None) - if suffix: - document[f'{key}{suffix}'] = value - return document - - -def convert_post_to_search_engine_document(post): - body = remove_control_chars(post.body) - if not body: - return None - - document = { - "id": post.id, - "thread_id": post.thread.id, - "thread_title": remove_control_chars(post.thread.title), - "thread_author": post.thread.author.username, - "thread_created": post.thread.created, - - "forum_name": post.thread.forum.name, - "forum_name_slug": post.thread.forum.name_slug, - - "post_author": post.author.username, - "post_created": post.created, - "post_body": body, - - "num_posts": post.thread.num_posts, - "has_posts": False if post.thread.num_posts == 0 else True - } - return document - - -def add_solr_suffix_to_dynamic_fieldname(fieldname): - """Add the corresponding SOLR dynamic field suffix to the given fieldname. If the fieldname does not correspond - to a dynamic field, leave it unchanged. See docstring in 'add_solr_suffix_to_dynamic_fieldnames_in_filter' for - more information""" - dynamic_fields_map = {} - for analyzer, analyzer_data in settings.ANALYZERS_CONFIGURATION.items(): - if 'descriptors_map' in analyzer_data: - descriptors_map = settings.ANALYZERS_CONFIGURATION[analyzer]['descriptors_map'] - for _, db_descriptor_key, descriptor_type in descriptors_map: - if descriptor_type is not None: - dynamic_fields_map[db_descriptor_key] = '{}{}'.format( - db_descriptor_key, SOLR_DYNAMIC_FIELDS_SUFFIX_MAP[descriptor_type]) - return dynamic_fields_map.get(fieldname, fieldname) - - - -def add_solr_suffix_to_dynamic_fieldnames_in_filter(query_filter): - """Processes a filter string containing field names and replaces the occurrences of fieldnames that match with - descriptor names from the descriptors_map of different configured analyzers with updated fieldnames with - the required SOLR dynamic field suffix. This is needed because fields from analyzers are indexed as dynamic - fields which need to end with a specific suffi that SOLR uses to learn about the type of the field and how it - should treat it. - """ - for analyzer, analyzer_data in settings.ANALYZERS_CONFIGURATION.items(): - if 'descriptors_map' in analyzer_data: - descriptors_map = settings.ANALYZERS_CONFIGURATION[analyzer]['descriptors_map'] - for _, db_descriptor_key, descriptor_type in descriptors_map: - if descriptor_type is not None: - query_filter = query_filter.replace( - f'{db_descriptor_key}:','{}{}:'.format( - db_descriptor_key, SOLR_DYNAMIC_FIELDS_SUFFIX_MAP[descriptor_type])) - return query_filter - - -def search_process_sort(sort, forum=False): - """Translates sorting criteria to solr sort criteria and add extra criteria if sorting by ratings. - - If order by rating, when rating is the same sort also by number of ratings. - - Args: - sort (str): sorting criteria as defined in settings.SEARCH_SOUNDS_SORT_OPTIONS_WEB. - forum (bool, optional): use the forum sort options map instead of the standard sort map - - Returns: - List[str]: list containing the sorting field names list for the search engine. 
- """ - search_map = SORT_OPTIONS_MAP_FORUM if forum else SORT_OPTIONS_MAP - if sort in [sort_web_name for sort_web_name, _ in search_map.items()]: - if search_map[sort] == "avg_rating desc" or search_map[sort] == "avg_rating asc": - sort = [search_map[sort], "num_ratings desc"] - else: - sort = [search_map[sort]] - else: - sort = [search_map[settings.SEARCH_FORUM_SORT_DEFAULT if forum else settings.SEARCH_SOUNDS_SORT_DEFAULT]] - return sort - - -def search_filter_make_intersection(query_filter): - # In solr 4, fq="a:1 b:2" will take the AND of these two filters, but in solr 5+, this will use OR - # fq=a:1&fq=b:2 can be used to take an AND, however we don't support this syntax - # The AND behaviour can be approximated by using fq="+a:1 +b:2", therefore we add a + to the beginning of each - # filter item to force AND. Because we use Dismax query parser, if we have a filter like fq="a:1 OR b:2" which will - # be converted to fq="+a:1 OR +b:2" by this function, this will still correctly use the OR operator (this would not - # be the case with standard lucene query parser). - # NOTE: for the filter names we match "a-zA-Z_" instead of using \w as using \w would cause problems for filters - # which have date ranges inside. - # NOTE: in the future filter handling should be refactored and we should use a proper filter parser - # that allows us to define our own filter syntax and then represent filters as some intermediate structure that can later - # be converted to valid lucene/dismax syntax. - query_filter = re.sub(r'\b([a-zA-Z_]+:)', r'+\1', query_filter) - query_filter = re.sub(r"(\+)\1+", r"\1", query_filter) # This is to avoid having multiple + in a row if user already has added them - if len(query_filter) > 0 and query_filter[-1] == '+': - query_filter = query_filter[:-1] - return query_filter - - -def search_process_filter(query_filter, only_sounds_within_ids=False, only_sounds_with_pack=False): - """Process the filter to make a number of adjustments - - 1) Add type suffix to human-readable audio analyzer descriptor names (needed for dynamic solr field names). - 2) If only sounds with pack should be returned, add such a filter. - 3) Add filter for sound IDs if only_sounds_within_ids is passed. - 4) Rewrite geotag bounding box queries to use solr 5+ syntax - - Step 1) is used for the dynamic field names used in Solr (e.g. ac_tonality -> ac_tonality_s, ac_tempo -> - ac_tempo_i). The dynamic field names we define in Solr schema are '*_b' (for bool), '*_d' (for float), - '*_i' (for integer) and '*_s' (for string). At indexing time, we append these suffixes to the analyzer - descriptor names that need to be indexed so Solr can treat the types properly. Now we automatically append the - suffices to the filter names so users do not need to deal with that and Solr understands recognizes the field name. - - Args: - query_filter (str): query filter string. - only_sounds_with_pack (bool, optional): whether to only include sounds that belong to a pack - only_sounds_within_ids (List[int], optional): restrict search results to sounds with these IDs - - Returns: - str: processed filter query string. 
- """ - # Add type suffix to human-readable audio analyzer descriptor names which is needed for solr dynamic fields - query_filter = add_solr_suffix_to_dynamic_fieldnames_in_filter(query_filter) - - # If we only want sounds with packs and there is no pack filter, add one - if only_sounds_with_pack and not 'pack:' in query_filter: - query_filter += ' pack:*' - - if 'geotag:"Intersects(' in query_filter: - # Replace geotag:"Intersects( )" - # with geotag:[", " TO " "] - query_filter = re.sub(r'geotag:"Intersects\((.+?) (.+?) (.+?) (.+?)\)"', r'geotag:["\2,\1" TO "\4,\3"]', query_filter) - - query_filter = search_filter_make_intersection(query_filter) - - # When calculating results form clustering, the "only_sounds_within_ids" argument is passed and we filter - # our query to the sounds in that list of IDs. - if only_sounds_within_ids: - sounds_within_ids_filter = ' OR '.join([f'id:{sound_id}' for sound_id in only_sounds_within_ids]) - if query_filter: - query_filter += f' AND ({sounds_within_ids_filter})' - else: - query_filter = f'({sounds_within_ids_filter})' - - return query_filter - - class FreesoundSoundJsonEncoder(json.JSONEncoder): def default(self, value): if isinstance(value, datetime): @@ -358,10 +137,247 @@ def get_forum_index(self): always_commit=True ) return self.forum_index + + # Util functions + def transform_document_into_update_document(self, document): + """ + In order to update a document in SOLR, we need to send a document with the same ID of the document we want to update and the + list of fields with the values we want to set wrapped in a {'set': value} dictionary. This function transforms a normal solr + document with {key:value} pairs into a document that will update all the fields. This is useful when we only want to update some + fields but not remove those not updated. Using this method we can update similarity-related sound fields and the rest of the + fields independently. + """ + new_document = {'id': document['id']} + new_document.update({key: {'set': value} for key, value in document.items() if key != 'id'}) + return new_document + + def convert_sound_to_search_engine_document(self, sound, fields_to_include=[]): + """ + TODO: Document that this includes remove_control_chars due to originally sending XML. not strictly necessary when submitting + to json (and also, freesound model code fixes this), but keep it in to ensure that docs are clean. + TODO: Assert that sound object is correct? 
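+
+        For orientation, a purely illustrative fragment of the kind of document this returns (hypothetical
+        values, not a real sound): {'id': 123456, 'type': 'wav', 'duration': 2.5, 'tag': ['drum', 'kick']}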
+ """ + # Document ID (same as sound ID) + document = {'id': sound.id} + + # Basic sound fields + keep_fields = ['username', 'created', 'is_explicit', 'is_remix', 'num_ratings', 'channels', 'md5', + 'was_remixed', 'original_filename', 'duration', 'num_downloads', 'filesize'] + for key in keep_fields: + document[key] = getattr(sound, key) + if sound.type == '': + document["type"] = "wav" + else: + document["type"] = sound.type + document["original_filename"] = remove_control_chars(getattr(sound, "original_filename")) + document["description"] = remove_control_chars(getattr(sound, "description")) + document["tag"] = list(set([t.lower() for t in getattr(sound, "tag_array")])) + document["license"] = getattr(sound, "license_name") + + if document["num_ratings"] >= settings.MIN_NUMBER_RATINGS: + document["avg_rating"] = getattr(sound, "avg_rating") + else: + document["avg_rating"] = 0 + + if getattr(sound, "pack_id"): + document["pack"] = remove_control_chars(getattr(sound, "pack_name")) + document["grouping_pack"] = str(getattr(sound, "pack_id")) + "_" + remove_control_chars( + getattr(sound, "pack_name")) + else: + document["grouping_pack"] = str(getattr(sound, "id")) + + document["is_geotagged"] = False + if getattr(sound, "geotag_id"): + document["is_geotagged"] = True + if not math.isnan(getattr(sound, "geotag_lon")) and not math.isnan(getattr(sound, "geotag_lat")): + document["geotag"] = str(getattr(sound, "geotag_lon")) + " " + str(getattr(sound, "geotag_lat")) + + document["in_remix_group"] = getattr(sound, "was_remixed") or getattr(sound, "is_remix") + + document["bitdepth"] = getattr(sound, "bitdepth") if getattr(sound, "bitdepth") else 0 + document["bitrate"] = getattr(sound, "bitrate") if getattr(sound, "bitrate") else 0 + document["samplerate"] = int(getattr(sound, "samplerate")) if getattr(sound, "samplerate") else 0 + + document["comment"] = [remove_control_chars(comment_text) for comment_text in getattr(sound, "comments_array")] + document["comments"] = getattr(sound, "num_comments") + + locations = sound.locations() + document["waveform_path_m"] = locations["display"]["wave"]["M"]["path"] + document["waveform_path_l"] = locations["display"]["wave"]["L"]["path"] + document["spectral_path_m"] = locations["display"]["spectral"]["M"]["path"] + document["spectral_path_l"] = locations["display"]["spectral"]["L"]["path"] + document["preview_path"] = locations["preview"]["LQ"]["mp3"]["path"] + + # Analyzer's output + for analyzer_name, analyzer_info in settings.ANALYZERS_CONFIGURATION.items(): + if 'descriptors_map' in analyzer_info: + query_select_name = analyzer_name.replace('-', '_') + analysis_data = getattr(sound, query_select_name, None) + if analysis_data is not None: + # If analysis is present, index all existing analysis fields using SOLR dynamic fields depending on + # the value type (see SOLR_DYNAMIC_FIELDS_SUFFIX_MAP) so solr knows how to treat when filtering, etc. + for key, value in json.loads(analysis_data).items(): + if isinstance(value, list): + # Make sure that the list is formed by strings + value = [f'{item}' for item in value] + suffix = SOLR_DYNAMIC_FIELDS_SUFFIX_MAP.get(type(value), None) + if suffix: + document[f'{key}{suffix}'] = value + + # Remove fields that should not be included + # Note that we could optimize this by never getting the data for these fields in the first place, but because + # the data is already retrieved in the queryset, that optimization would be negligible so we keep it simple. 
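+        # For example (illustrative): with fields_to_include=['id', 'duration'] only those keys survive,
+        # while with fields_to_include=[] the "or not fields_to_include" clause below keeps every field.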
+        document = {k: v for k, v in document.items() if k in fields_to_include or not fields_to_include}
+
+        return document
+
+    def convert_post_to_search_engine_document(self, post):
+        body = remove_control_chars(post.body)
+        if not body:
+            return None
+
+        document = {
+            "id": post.id,
+            "thread_id": post.thread.id,
+            "thread_title": remove_control_chars(post.thread.title),
+            "thread_author": post.thread.author.username,
+            "thread_created": post.thread.created,
+
+            "forum_name": post.thread.forum.name,
+            "forum_name_slug": post.thread.forum.name_slug,
+
+            "post_author": post.author.username,
+            "post_created": post.created,
+            "post_body": body,
+
+            "num_posts": post.thread.num_posts,
+            "has_posts": False if post.thread.num_posts == 0 else True
+        }
+        return document
+
+    def add_solr_suffix_to_dynamic_fieldname(self, fieldname):
+        """Add the corresponding SOLR dynamic field suffix to the given fieldname. If the fieldname does not correspond
+        to a dynamic field, leave it unchanged. See docstring in 'add_solr_suffix_to_dynamic_fieldnames_in_filter' for
+        more information"""
+        dynamic_fields_map = {}
+        for analyzer, analyzer_data in settings.ANALYZERS_CONFIGURATION.items():
+            if 'descriptors_map' in analyzer_data:
+                descriptors_map = settings.ANALYZERS_CONFIGURATION[analyzer]['descriptors_map']
+                for _, db_descriptor_key, descriptor_type in descriptors_map:
+                    if descriptor_type is not None:
+                        dynamic_fields_map[db_descriptor_key] = '{}{}'.format(
+                            db_descriptor_key, SOLR_DYNAMIC_FIELDS_SUFFIX_MAP[descriptor_type])
+        return dynamic_fields_map.get(fieldname, fieldname)
+
+    def add_solr_suffix_to_dynamic_fieldnames_in_filter(self, query_filter):
+        """Processes a filter string containing field names and replaces the occurrences of fieldnames that match
+        descriptor names from the descriptors_map of the different configured analyzers with updated fieldnames with
+        the required SOLR dynamic field suffix. This is needed because fields from analyzers are indexed as dynamic
+        fields which need to end with a specific suffix that SOLR uses to learn about the type of the field and how it
+        should treat it.
+        """
+        for analyzer, analyzer_data in settings.ANALYZERS_CONFIGURATION.items():
+            if 'descriptors_map' in analyzer_data:
+                descriptors_map = settings.ANALYZERS_CONFIGURATION[analyzer]['descriptors_map']
+                for _, db_descriptor_key, descriptor_type in descriptors_map:
+                    if descriptor_type is not None:
+                        query_filter = query_filter.replace(
+                            f'{db_descriptor_key}:', '{}{}:'.format(
+                                db_descriptor_key, SOLR_DYNAMIC_FIELDS_SUFFIX_MAP[descriptor_type]))
+        return query_filter
+
+    def search_process_sort(self, sort, forum=False):
+        """Translates sorting criteria to solr sort criteria and adds extra criteria if sorting by ratings.
+
+        When sorting by rating, ties are broken by the number of ratings.
+
+        Args:
+            sort (str): sorting criteria as defined in settings.SEARCH_SOUNDS_SORT_OPTIONS_WEB.
+            forum (bool, optional): use the forum sort options map instead of the standard sort map
+
+        Returns:
+            List[str]: list containing the sorting field names for the search engine.
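+
+        Example (hypothetical mapping): if the web sort option "rating desc" were mapped to
+        "avg_rating desc", this would return ["avg_rating desc", "num_ratings desc"], so that
+        ties in rating are broken by the number of ratings.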
+        """
+        search_map = SORT_OPTIONS_MAP_FORUM if forum else SORT_OPTIONS_MAP
+        if sort in [sort_web_name for sort_web_name, _ in search_map.items()]:
+            if search_map[sort] == "avg_rating desc" or search_map[sort] == "avg_rating asc":
+                sort = [search_map[sort], "num_ratings desc"]
+            else:
+                sort = [search_map[sort]]
+        else:
+            sort = [search_map[settings.SEARCH_FORUM_SORT_DEFAULT if forum else settings.SEARCH_SOUNDS_SORT_DEFAULT]]
+        return sort
+
+    def search_filter_make_intersection(self, query_filter):
+        # In solr 4, fq="a:1 b:2" will take the AND of these two filters, but in solr 5+, this will use OR
+        # fq=a:1&fq=b:2 can be used to take an AND, however we don't support this syntax
+        # The AND behaviour can be approximated by using fq="+a:1 +b:2", therefore we add a + to the beginning of each
+        # filter item to force AND. Because we use the Dismax query parser, if we have a filter like fq="a:1 OR b:2" which will
+        # be converted to fq="+a:1 OR +b:2" by this function, this will still correctly use the OR operator (this would not
+        # be the case with the standard lucene query parser).
+        # NOTE: for the filter names we match "a-zA-Z_" instead of using \w as using \w would cause problems for filters
+        # which have date ranges inside.
+        # NOTE: in the future filter handling should be refactored and we should use a proper filter parser
+        # that allows us to define our own filter syntax and then represent filters as some intermediate structure that can later
+        # be converted to valid lucene/dismax syntax.
+        query_filter = re.sub(r'\b([a-zA-Z_]+:)', r'+\1', query_filter)
+        query_filter = re.sub(r"(\+)\1+", r"\1", query_filter)  # Avoid multiple + in a row if the user already added them
+        if len(query_filter) > 0 and query_filter[-1] == '+':
+            query_filter = query_filter[:-1]
+        return query_filter
+
+    def search_process_filter(self, query_filter, only_sounds_within_ids=False, only_sounds_with_pack=False):
+        """Process the filter to make a number of adjustments
+
+        1) Add type suffix to human-readable audio analyzer descriptor names (needed for dynamic solr field names).
+        2) If only sounds with pack should be returned, add such a filter.
+        3) Add filter for sound IDs if only_sounds_within_ids is passed.
+        4) Rewrite geotag bounding box queries to use solr 5+ syntax.
+
+        Step 1) is used for the dynamic field names used in Solr (e.g. ac_tonality -> ac_tonality_s, ac_tempo ->
+        ac_tempo_i). The dynamic field names we define in the Solr schema are '*_b' (for bool), '*_d' (for float),
+        '*_i' (for integer) and '*_s' (for string). At indexing time, we append these suffixes to the analyzer
+        descriptor names that need to be indexed so Solr can treat the types properly. Here we automatically append the
+        suffixes to the filter names so users do not need to deal with that and Solr recognizes the field names.
+
+        Args:
+            query_filter (str): query filter string.
+            only_sounds_with_pack (bool, optional): whether to only include sounds that belong to a pack
+            only_sounds_within_ids (List[int], optional): restrict search results to sounds with these IDs
+
+        Returns:
+            str: processed filter query string.
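+
+        Example (illustrative): the filter 'ac_tonality:"C minor" ac_tempo:[90 TO 120]' becomes
+        '+ac_tonality_s:"C minor" +ac_tempo_i:[90 TO 120]' after suffixing and intersection handling.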
+        """
+        # Add type suffix to human-readable audio analyzer descriptor names which is needed for solr dynamic fields
+        query_filter = self.add_solr_suffix_to_dynamic_fieldnames_in_filter(query_filter)
+
+        # If we only want sounds with packs and there is no pack filter, add one
+        if only_sounds_with_pack and not 'pack:' in query_filter:
+            query_filter += ' pack:*'
+
+        if 'geotag:"Intersects(' in query_filter:
+            # Replace geotag:"Intersects(<min_lon> <min_lat> <max_lon> <max_lat>)"
+            # with geotag:["<min_lat>,<min_lon>" TO "<max_lat>,<max_lon>"]
+            query_filter = re.sub(r'geotag:"Intersects\((.+?) (.+?) (.+?) (.+?)\)"', r'geotag:["\2,\1" TO "\4,\3"]', query_filter)
+
+        query_filter = self.search_filter_make_intersection(query_filter)
+
+        # When calculating results from clustering, the "only_sounds_within_ids" argument is passed and we filter
+        # our query to the sounds in that list of IDs.
+        if only_sounds_within_ids:
+            sounds_within_ids_filter = ' OR '.join(['id:{}'.format(sound_id) for sound_id in only_sounds_within_ids])
+            if query_filter:
+                query_filter += ' AND ({})'.format(sounds_within_ids_filter)
+            else:
+                query_filter = '({})'.format(sounds_within_ids_filter)
+
+        return query_filter
 
     # Sound methods
-    def add_sounds_to_index(self, sound_objects):
-        documents = [convert_sound_to_search_engine_document(s) for s in sound_objects]
+    def add_sounds_to_index(self, sound_objects, update_mode=False, fields_to_include=[]):
+        documents = [self.convert_sound_to_search_engine_document(s, fields_to_include=fields_to_include) for s in sound_objects]
+        if update_mode:
+            documents = [self.transform_document_into_update_document(d) for d in documents]
         try:
             self.get_sounds_index().add(documents)
         except pysolr.SolrError as e:
@@ -394,53 +410,6 @@ def sound_exists_in_index(self, sound_object_or_id):
         response = self.search_sounds(query_filter=f'id:{sound_id}', offset=0, num_sounds=1)
         return response.num_found > 0
 
-    def search_process_filter(self, query_filter, only_sounds_within_ids=False, only_sounds_with_pack=False):
-        """Process the filter to make a number of adjustments
-
-        1) Add type suffix to human-readable audio analyzer descriptor names (needed for dynamic solr field names).
-        2) If only sounds with pack should be returned, add such a filter.
-        3) Add filter for sound IDs if only_sounds_within_ids is passed.
-        4) Rewrite geotag bounding box queries to use solr 5+ syntax
-
-        Step 1) is used for the dynamic field names used in Solr (e.g. ac_tonality -> ac_tonality_s, ac_tempo ->
-        ac_tempo_i). The dynamic field names we define in Solr schema are '*_b' (for bool), '*_d' (for float),
-        '*_i' (for integer) and '*_s' (for string). At indexing time, we append these suffixes to the analyzer
-        descriptor names that need to be indexed so Solr can treat the types properly. Now we automatically append the
-        suffices to the filter names so users do not need to deal with that and Solr understands recognizes the field name.
-
-        Args:
-            query_filter (str): query filter string.
-            only_sounds_with_pack (bool, optional): whether to only include sounds that belong to a pack
-            only_sounds_within_ids (List[int], optional): restrict search results to sounds with these IDs
-
-        Returns:
-            str: processed filter query string.
- """ - # Add type suffix to human-readable audio analyzer descriptor names which is needed for solr dynamic fields - query_filter = add_solr_suffix_to_dynamic_fieldnames_in_filter(query_filter) - - # If we only want sounds with packs and there is no pack filter, add one - if only_sounds_with_pack and not 'pack:' in query_filter: - query_filter += ' pack:*' - - if 'geotag:"Intersects(' in query_filter: - # Replace geotag:"Intersects( )" - # with geotag:[", " TO " "] - query_filter = re.sub('geotag:"Intersects\((.+?) (.+?) (.+?) (.+?)\)"', r'geotag:["\2,\1" TO "\4,\3"]', query_filter) - - query_filter = search_filter_make_intersection(query_filter) - - # When calculating results form clustering, the "only_sounds_within_ids" argument is passed and we filter - # our query to the sounds in that list of IDs. - if only_sounds_within_ids: - sounds_within_ids_filter = ' OR '.join(['id:{}'.format(sound_id) for sound_id in only_sounds_within_ids]) - if query_filter: - query_filter += ' AND ({})'.format(sounds_within_ids_filter) - else: - query_filter = '({})'.format(sounds_within_ids_filter) - - return query_filter - def search_sounds(self, textual_query='', query_fields=None, query_filter='', offset=0, current_page=None, num_sounds=settings.SOUNDS_PER_PAGE, sort=settings.SEARCH_SOUNDS_SORT_OPTION_AUTOMATIC, group_by_pack=False, num_sounds_per_pack_group=1, facets=None, only_sounds_with_pack=False, @@ -454,10 +423,10 @@ def search_sounds(self, textual_query='', query_fields=None, query_filter='', of # If no fields provided, use the default query_fields = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS if isinstance(query_fields, list): - query_fields = [add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)) for field in query_fields] + query_fields = [self.add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)) for field in query_fields] elif isinstance(query_fields, dict): # Also remove fields with weight <= 0 - query_fields = [(add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)), weight) + query_fields = [(self.add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)), weight) for field, weight in query_fields.items() if weight > 0] # Set main query options @@ -475,7 +444,7 @@ def search_sounds(self, textual_query='', query_fields=None, query_filter='', of rows=num_sounds, field_list=["id", "score"], # We only want the sound IDs of the results as we load data from DB filter_query=query_filter, - sort=search_process_sort(sort)) + sort=self.search_process_sort(sort)) # Configure facets if facets is not None: @@ -544,7 +513,7 @@ def get_random_sound_id(self): # Forum posts methods def add_forum_posts_to_index(self, forum_post_objects): - documents = [convert_post_to_search_engine_document(p) for p in forum_post_objects] + documents = [self.convert_post_to_search_engine_document(p) for p in forum_post_objects] documents = [d for d in documents if d is not None] try: self.get_forum_index().add(documents) @@ -609,7 +578,7 @@ def search_forum_posts(self, textual_query='', query_filter='', sort=settings.SE "post_created", "num_posts"], filter_query=query_filter, - sort=search_process_sort(sort, forum=True)) + sort=self.search_process_sort(sort, forum=True)) if group_by_thread: query.set_group_field("thread_title_grouped") diff --git a/utils/search/backends/solr9pysolr.py b/utils/search/backends/solr9pysolr.py index 396b6aa79..8827d8545 100644 --- a/utils/search/backends/solr9pysolr.py +++ b/utils/search/backends/solr9pysolr.py @@ -84,7 +84,7 
@@ def search_process_filter(self, query_filter, only_sounds_within_ids=False, only str: processed filter query string. """ # Add type suffix to human-readable audio analyzer descriptor names which is needed for solr dynamic fields - query_filter = solr555pysolr.add_solr_suffix_to_dynamic_fieldnames_in_filter(query_filter) + query_filter = self.add_solr_suffix_to_dynamic_fieldnames_in_filter(query_filter) # When filtering by the created field, use the `created_range` DateRangeType field instead # which include the ability to filter on exact values and ranges of values. @@ -100,7 +100,7 @@ def search_process_filter(self, query_filter, only_sounds_within_ids=False, only # with geotag:[", " TO " "] query_filter = re.sub('geotag:"Intersects\((.+?) (.+?) (.+?) (.+?)\)"', r'geotag:["\2,\1" TO "\4,\3"]', query_filter) - query_filter = solr555pysolr.search_filter_make_intersection(query_filter) + query_filter = self.search_filter_make_intersection(query_filter) # When calculating results form clustering, the "only_sounds_within_ids" argument is passed and we filter # our query to the sounds in that list of IDs. diff --git a/utils/search/backends/tests/test_solr555pysolr.py b/utils/search/backends/tests/test_solr555pysolr.py index 63a7a5f11..eab0e31ca 100644 --- a/utils/search/backends/tests/test_solr555pysolr.py +++ b/utils/search/backends/tests/test_solr555pysolr.py @@ -6,9 +6,9 @@ class Solr555PySolrTest(TestCase): def test_search_filter_make_intersection(self): filter_query = "username:alastairp" - updated = solr555pysolr.search_filter_make_intersection(filter_query) + updated = solr555pysolr.Solr555PySolrSearchEngine().search_filter_make_intersection(filter_query) self.assertEqual(updated, "+username:alastairp") filter_query = "username:alastairp license:(a OR b)" - updated = solr555pysolr.search_filter_make_intersection(filter_query) + updated = solr555pysolr.Solr555PySolrSearchEngine().search_filter_make_intersection(filter_query) self.assertEqual(updated, "+username:alastairp +license:(a OR b)") diff --git a/utils/search/backends/tests/test_solr_common.py b/utils/search/backends/tests/test_solr_common.py index 337b5aadb..048ce5840 100644 --- a/utils/search/backends/tests/test_solr_common.py +++ b/utils/search/backends/tests/test_solr_common.py @@ -1,15 +1,14 @@ from django.test import TestCase -from utils.search.backends import solr_common from utils.search.backends import solr555pysolr class SolrCommonTest(TestCase): def test_search_filter_make_intersection(self): filter_query = "username:alastairp" - updated = solr555pysolr.search_filter_make_intersection(filter_query) + updated = solr555pysolr.Solr555PySolrSearchEngine().search_filter_make_intersection(filter_query) self.assertEqual(updated, "+username:alastairp") filter_query = "username:alastairp license:(a OR b)" - updated = solr555pysolr.search_filter_make_intersection(filter_query) + updated = solr555pysolr.Solr555PySolrSearchEngine().search_filter_make_intersection(filter_query) self.assertEqual(updated, "+username:alastairp +license:(a OR b)") diff --git a/utils/search/search_sounds.py b/utils/search/search_sounds.py index bea52458f..1ec991203 100644 --- a/utils/search/search_sounds.py +++ b/utils/search/search_sounds.py @@ -379,11 +379,17 @@ def perform_search_engine_query(query_params): return results, paginator -def add_sounds_to_search_engine(sound_objects): +def add_sounds_to_search_engine(sound_objects, fields_to_include=[], update_mode=False): """Add the Sounds from the queryset to the search engine Args: sound_objects 
(list[sounds.models.Sound]): list (or queryset) of Sound objects to index
+        fields_to_include (list[str]): use this list to indicate the specific field names of the sounds
+            that need to be included in the documents that will be indexed. If no fields are specified
+            (fields_to_include=[]), then all available fields will be included.
+        update_mode (bool): if True, the sounds' data will be updated in the index, otherwise it will be
+            replaced by the new generated documents. This is especially useful in combination with
+            fields_to_include so that different fields of the indexed documents can be updated separately.
 
     Returns:
         int: number of sounds added to the index
@@ -395,7 +401,7 @@ def add_sounds_to_search_engine(sound_objects):
     try:
         console_logger.info("Adding %d sounds to the search engine" % num_sounds)
         search_logger.info("Adding %d sounds to the search engine" % num_sounds)
-        get_search_engine().add_sounds_to_index(sound_objects)
+        get_search_engine().add_sounds_to_index(sound_objects, fields_to_include=fields_to_include, update_mode=update_mode)
         return num_sounds
     except SearchEngineException as e:
         console_logger.info(f"Failed to add sounds to search engine index: {str(e)}")

From 9e3b67840a739273d78676b7b9de021f4cfaff5e Mon Sep 17 00:00:00 2001
From: ffont
Date: Tue, 23 Jan 2024 13:11:06 +0100
Subject: [PATCH 02/28] Small fixes in search test command

---
 utils/search/backends/test_search_engine_backend.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/utils/search/backends/test_search_engine_backend.py b/utils/search/backends/test_search_engine_backend.py
index eba5f80cb..99f58b24b 100644
--- a/utils/search/backends/test_search_engine_backend.py
+++ b/utils/search/backends/test_search_engine_backend.py
@@ -235,7 +235,7 @@ def sound_check_group_by_pack(self):
             assert_and_continue('group_name' in result, 'No group_name field in doc from results')
             assert_and_continue('group_docs' in result, 'No group_docs field in doc from results')
             assert_and_continue('n_more_in_group' in result, 'No n_more_in_group field in doc from results')
-            group_sounds = Sound.objects.bulk_query_id(sound_ids=[r['id'] for r in result['group_docs']])
+            group_sounds = Sound.objects.bulk_query_id(sound_ids=[int(r['id']) for r in result['group_docs']])
             first_sound_pack = group_sounds[0].pack
             for sound in group_sounds:
                 assert_and_continue(sound.pack == first_sound_pack, 'Different packs in pack group')
@@ -376,9 +376,7 @@ def test_search_enginge_backend_sounds(self):
         self.sound_check_get_user_tags(sounds[0])
         self.sound_check_get_pack_tags(sounds)
 
-        console_logger.info('Testing of sound search methods finished. You might want to run the '
-                            'reindex_search_engine_sounds -c command to make sure the index is left in a correct '
-                            'state after having run these tests')
+        console_logger.info('Testing of sound search methods finished!')
 
     def forum_check_mandatory_doc_fields(self):
         # Check that returned forum posts (docs) from search engine include the mandatory fields
@@ -519,6 +517,4 @@ def test_search_enginge_backend_forum(self):
         self.forum_check_highlighting()
         self.forum_check_extra_queries()
 
-        console_logger.info('Testing of forum search methods finished. 
You might want to run the ' - 'reindex_search_engine_forum -c command to make sure the index is left in a correct ' - 'state after having run these tests') + console_logger.info('Testing of forum search methods finished!') From 300201eebfebc82327bfc5ab15978ca7a5917922 Mon Sep 17 00:00:00 2001 From: ffont Date: Tue, 23 Jan 2024 13:12:49 +0100 Subject: [PATCH 03/28] Rename comments field to num_comments --- _docs/api/source/resources.rst | 2 +- utils/search/backends/solr555pysolr.py | 2 +- utils/search/solr9/cores/freesound/conf/schema.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/_docs/api/source/resources.rst b/_docs/api/source/resources.rst index bb96e3841..6c5c1259c 100644 --- a/_docs/api/source/resources.rst +++ b/_docs/api/source/resources.rst @@ -80,7 +80,7 @@ Filter name Type Description ``avg_rating`` numerical Average rating for the sound in the range [0, 5]. ``num_ratings`` integer Number of times the sound has been rated. ``comment`` string Textual content of the comments of a sound (tokenized). The filter is satisfied if sound contains the filter value in at least one of its comments. -``comments`` integer Number of times the sound has been commented. +``num_comments`` integer Number of times the sound has been commented. ====================== ============= ==================================================== diff --git a/utils/search/backends/solr555pysolr.py b/utils/search/backends/solr555pysolr.py index 1dd258103..174a0d56f 100644 --- a/utils/search/backends/solr555pysolr.py +++ b/utils/search/backends/solr555pysolr.py @@ -199,7 +199,7 @@ def convert_sound_to_search_engine_document(self, sound, fields_to_include=[]): document["samplerate"] = int(getattr(sound, "samplerate")) if getattr(sound, "samplerate") else 0 document["comment"] = [remove_control_chars(comment_text) for comment_text in getattr(sound, "comments_array")] - document["comments"] = getattr(sound, "num_comments") + document["num_comments"] = getattr(sound, "num_comments") locations = sound.locations() document["waveform_path_m"] = locations["display"]["wave"]["M"]["path"] diff --git a/utils/search/solr9/cores/freesound/conf/schema.xml b/utils/search/solr9/cores/freesound/conf/schema.xml index 4cc4fbccd..cfe8a5f10 100644 --- a/utils/search/solr9/cores/freesound/conf/schema.xml +++ b/utils/search/solr9/cores/freesound/conf/schema.xml @@ -231,7 +231,7 @@ - + From 21349a9a55f3020cc8bd9e2f29e192f2da1d1a67 Mon Sep 17 00:00:00 2001 From: ffont Date: Tue, 23 Jan 2024 14:00:07 +0100 Subject: [PATCH 04/28] Make fields non-required, some cleanups --- .../solr9/cores/freesound/conf/schema.xml | 76 ++++++++----------- 1 file changed, 32 insertions(+), 44 deletions(-) diff --git a/utils/search/solr9/cores/freesound/conf/schema.xml b/utils/search/solr9/cores/freesound/conf/schema.xml index cfe8a5f10..323429efb 100644 --- a/utils/search/solr9/cores/freesound/conf/schema.xml +++ b/utils/search/solr9/cores/freesound/conf/schema.xml @@ -1,15 +1,7 @@ id - - - - - - - - - + @@ -145,7 +137,6 @@ - - - - + - - - - + + + + + @@ -205,52 +195,50 @@ - - - + + + + - + - - - - - - - + + + + + + + - + - - + + - + - - - - - + + + + + - - - - + + + + - - - \ No newline at end of file From 26991bed775176f81b2c38e88be89e881af62633 Mon Sep 17 00:00:00 2001 From: ffont Date: Tue, 23 Jan 2024 14:00:35 +0100 Subject: [PATCH 05/28] Add tests for update/fields_to_include parameters of add_sounds_to_index --- utils/search/__init__.py | 4 +-- utils/search/backends/solr555pysolr.py | 14 +++++----- 
.../backends/test_search_engine_backend.py | 26 ++++++++++++++++++- utils/search/search_sounds.py | 6 ++--- 4 files changed, 38 insertions(+), 12 deletions(-) diff --git a/utils/search/__init__.py b/utils/search/__init__.py index 50fbe5518..697963c8b 100644 --- a/utils/search/__init__.py +++ b/utils/search/__init__.py @@ -184,14 +184,14 @@ class SearchEngineBase: # Sound search related methods - def add_sounds_to_index(self, sound_objects, fields_to_include=[], update_mode=False): + def add_sounds_to_index(self, sound_objects, fields_to_include=[], update=False): """Indexes the provided sound objects in the search index Args: sound_objects (list[sounds.models.Sound]): Sound objects of the sounds to index fields_to_include (list[str]): Specific sound fields that will be included in the document to be indexed. If empty, all available sound fields will be included. - update_mode (bool): Whether to perform an update of the existing documents in the index or to + update (bool): Whether to perform an update of the existing documents in the index or to completely replace them. An update is useful so that fields not included in the document are not removed from the index. """ diff --git a/utils/search/backends/solr555pysolr.py b/utils/search/backends/solr555pysolr.py index 174a0d56f..54f4c8328 100644 --- a/utils/search/backends/solr555pysolr.py +++ b/utils/search/backends/solr555pysolr.py @@ -224,10 +224,12 @@ def convert_sound_to_search_engine_document(self, sound, fields_to_include=[]): if suffix: document[f'{key}{suffix}'] = value - # Remove fields that should not be included - # Note that we could optimize this by never getting the data for these fields in the first place, but because - # the data is already retrieved in the queryset, that optimization would be negligible so we keep it simple. - document = {k: v for k, v in document.items() if k in fields_to_include or not fields_to_include} + if fields_to_include: + # Remove fields that should not be included + # Note that we could optimize this by never getting the data for these fields in the first place, but because + # the data is already retrieved in the queryset, that optimization would be negligible so we keep it simple. + document = {k: v for k, v in document.items() if k in fields_to_include} + document['id'] = sound.id # Make sure we always include the ID return document @@ -374,9 +376,9 @@ def search_process_filter(self, query_filter, only_sounds_within_ids=False, only return query_filter # Sound methods - def add_sounds_to_index(self, sound_objects, update_mode=False, fields_to_include=[]): + def add_sounds_to_index(self, sound_objects, update=False, fields_to_include=[]): documents = [self.convert_sound_to_search_engine_document(s, fields_to_include=fields_to_include) for s in sound_objects] - if update_mode: + if update: documents = [self.transform_document_into_update_document(d) for d in documents] try: self.get_sounds_index().add(documents) diff --git a/utils/search/backends/test_search_engine_backend.py b/utils/search/backends/test_search_engine_backend.py index 99f58b24b..dd6dff4d0 100644 --- a/utils/search/backends/test_search_engine_backend.py +++ b/utils/search/backends/test_search_engine_backend.py @@ -361,7 +361,31 @@ def test_search_enginge_backend_sounds(self): assert_and_continue(self.search_engine.sound_exists_in_index(sound), f'Sound ID {sound.id} should be in search index') - # Re-index all sounds to leave index in "correct" state + # Test the 'update' and 'include_fields' parameters of add_sounds_to_index. 
+        # Start by emptying the index and testing that when adding sounds with update=True, these get created if they don't already exist
+        self.search_engine.remove_all_sounds()
+        self.search_engine.add_sounds_to_index(sounds, update=True)
+        for sound in sounds:
+            assert_and_continue(self.search_engine.sound_exists_in_index(sound),
+                                f'Sound ID {sound.id} should be in the search index')
+
+        # Make a query filtering by a field we know is in the index and check that all results are returned
+        results = self.search_engine.search_sounds(query_filter='duration:[* TO *]')
+        assert_and_continue(len(sounds) == results.num_found, "All sounds should have been returned for this query")
+
+        # Now we index again but only with 2 fields and with update=False. This should replace existing documents and
+        # only index the selected fields. We then repeat the previous query, but because the "duration" field was not included
+        # in the new index, now the query should return no results.
+        self.search_engine.add_sounds_to_index(sounds, update=False, fields_to_include=['id', 'original_filename'])
+        results = self.search_engine.search_sounds(query_filter='duration:[* TO *]')
+        assert_and_continue(0 == results.num_found, "No sounds should have been returned in this query")
+
+        # Now we update the index with the duration field for all sounds and repeat the query, we should get all results again
+        self.search_engine.add_sounds_to_index(sounds, update=True, fields_to_include=['duration'])
+        results = self.search_engine.search_sounds(query_filter='duration:[* TO *]')
+        assert_and_continue(len(sounds) == results.num_found, "All sounds should have been returned for this query")
+
+        # Re-index all sounds to leave index in "correct" state for next tests
         self.search_engine.add_sounds_to_index(sounds)
 
         self.sound_check_mandatory_doc_fields()
diff --git a/utils/search/search_sounds.py b/utils/search/search_sounds.py
index 1ec991203..79d999497 100644
--- a/utils/search/search_sounds.py
+++ b/utils/search/search_sounds.py
@@ -379,7 +379,7 @@ def perform_search_engine_query(query_params):
     return results, paginator
 
 
-def add_sounds_to_search_engine(sound_objects, fields_to_include=[], update_mode=False):
+def add_sounds_to_search_engine(sound_objects, fields_to_include=[], update=False):
     """Add the Sounds from the queryset to the search engine
 
     Args:
@@ -387,7 +387,7 @@ def add_sounds_to_search_engine(sound_objects, fields_to_include=[], update_mode
         fields_to_include (list[str]): use this list to indicate the specific field names of the sounds
             that need to be included in the documents that will be indexed. If no fields are specified
            (fields_to_include=[]), then all available fields will be included.
-        update_mode (bool): if True, the sounds' data will be updated in the index, otherwise it will be
+        update (bool): if True, the sounds' data will be updated in the index, otherwise it will be
            replaced by the new generated documents. This is especially useful in combination with
            fields_to_include so that different fields of the indexed documents can be updated separately.
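+
+            For example (illustrative call), to refresh only the "duration" field of already indexed sounds:
+            add_sounds_to_search_engine(sound_objects, fields_to_include=['duration'], update=True)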
@@ -401,7 +401,7 @@ def add_sounds_to_search_engine(sound_objects, fields_to_include=[], update_mode try: console_logger.info("Adding %d sounds to the search engine" % num_sounds) search_logger.info("Adding %d sounds to the search engine" % num_sounds) - get_search_engine().add_sounds_to_index(sound_objects, fields_to_include=fields_to_include, update_mode=update_mode) + get_search_engine().add_sounds_to_index(sound_objects, fields_to_include=fields_to_include, update=update) return num_sounds except SearchEngineException as e: console_logger.info(f"Failed to add sounds to search engine index: {str(e)}") From 92ea90eaca9c46065ce951c921f23aa0ab53e3c7 Mon Sep 17 00:00:00 2001 From: ffont Date: Tue, 23 Jan 2024 23:47:19 +0100 Subject: [PATCH 06/28] Add solr-based basic similarity search support --- freesound/settings.py | 13 + general/tasks.py | 8 +- sounds/views.py | 12 +- utils/search/__init__.py | 8 + utils/search/backends/solr555pysolr.py | 226 +++++++++++++----- utils/search/backends/solr_common.py | 9 +- .../backends/test_search_engine_backend.py | 5 + utils/search/search_sounds.py | 13 +- .../solr9/cores/freesound/conf/schema.xml | 9 + 9 files changed, 220 insertions(+), 83 deletions(-) diff --git a/freesound/settings.py b/freesound/settings.py index 2223ddd38..3d06c31e8 100644 --- a/freesound/settings.py +++ b/freesound/settings.py @@ -638,6 +638,19 @@ SOLR5_BASE_URL = "http://search:8983/solr" SOLR9_BASE_URL = "http://search:8983/solr" +SEARCH_ENGINE_SIMILARITY_ANALYZERS = { + FSDSINET_ANALYZER_NAME: { + 'vector_property_name': 'embeddings', + 'vector_size': 100, + }, + AUDIOSET_YAMNET_ANALYZER_NAME: { + 'vector_property_name': 'embeddings', + 'vector_size': 100, # Note yamnet has higher dimensionality and here we're cropping dimensions + }, +} +SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER = FSDSINET_ANALYZER_NAME +USE_SEARCH_ENGINE_SIMILARITY = False # Does not currently apply to API + # ------------------------------------------------------------------------------- # Similarity client settings SIMILARITY_ADDRESS = 'similarity' diff --git a/general/tasks.py b/general/tasks.py index 2dea3c913..d7e0209d6 100644 --- a/general/tasks.py +++ b/general/tasks.py @@ -260,10 +260,12 @@ def process_analysis_results(sound_id, analyzer, status, analysis_time, exceptio {'task_name': PROCESS_ANALYSIS_RESULTS_TASK_NAME, 'sound_id': sound_id, 'analyzer': analyzer, 'status': status, 'exception': str(exception), 'work_time': round(time.time() - start_time)})) else: - # Load analysis output to database field (following configuration in settings.ANALYZERS_CONFIGURATION) + # Load analysis output to database field (following configuration in settings.ANALYZERS_CONFIGURATION) a.load_analysis_data_from_file_to_db() - # Set sound to index dirty so that the sound gets reindexed with updated analysis fields - a.sound.mark_index_dirty(commit=True) + + if analyzer in settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS or analyzer in settings.ANALYZERS_CONFIGURATION: + # If the analyzer produces data that should be indexed in the search engine, set sound index to dirty so that the sound gets reindexed soon + a.sound.mark_index_dirty(commit=True) workers_logger.info("Finished processing analysis results (%s)" % json.dumps( {'task_name': PROCESS_ANALYSIS_RESULTS_TASK_NAME, 'sound_id': sound_id, 'analyzer': analyzer, 'status': status, 'work_time': round(time.time() - start_time)})) diff --git a/sounds/views.py b/sounds/views.py index 34ff14fdf..4e86e6cf0 100644 --- a/sounds/views.py +++ b/sounds/views.py @@ -825,8 +825,16 @@ 
def similar(request, username, sound_id): if sound.user.username.lower() != username.lower(): raise Http404 - similarity_results, _ = get_similar_sounds( - sound, request.GET.get('preset', None), settings.NUM_SIMILAR_SOUNDS_PER_PAGE * settings.NUM_SIMILAR_SOUNDS_PAGES) + if not settings.USE_SEARCH_ENGINE_SIMILARITY: + # Get similar sounds from similarity service (gaia) + similarity_results, _ = get_similar_sounds( + sound, request.GET.get('preset', None), settings.NUM_SIMILAR_SOUNDS_PER_PAGE * settings.NUM_SIMILAR_SOUNDS_PAGES) + else: + # Get similar sounds from solr + from utils.search import get_search_engine + results = get_search_engine().search_sounds(similar_to=sound.id) + similarity_results = [(result['id'], result['score']) for result in results.docs] + paginator = paginate(request, [sound_id for sound_id, _ in similarity_results], settings.NUM_SIMILAR_SOUNDS_PER_PAGE) similar_sounds = Sound.objects.ordered_ids(paginator['page'].object_list) tvars = {'similar_sounds': similar_sounds, 'sound': sound} diff --git a/utils/search/__init__.py b/utils/search/__init__.py index 697963c8b..7ee6ad0db 100644 --- a/utils/search/__init__.py +++ b/utils/search/__init__.py @@ -219,6 +219,14 @@ def sound_exists_in_index(self, sound_object_or_id): bool: whether the sound is indexed in the search engine """ raise NotImplementedError + + def get_all_sound_ids_from_index(self): + """Return a list of all sound IDs indexed in the search engine + + Returns: + List[int]: list of all sound IDs indexed in the search engine + """ + raise NotImplementedError def search_sounds(self, textual_query='', query_fields=None, query_filter='', offset=0, current_page=None, num_sounds=settings.SOUNDS_PER_PAGE, sort=settings.SEARCH_SOUNDS_SORT_OPTION_AUTOMATIC, diff --git a/utils/search/backends/solr555pysolr.py b/utils/search/backends/solr555pysolr.py index 54f4c8328..2e9be07db 100644 --- a/utils/search/backends/solr555pysolr.py +++ b/utils/search/backends/solr555pysolr.py @@ -23,12 +23,13 @@ import re import math from datetime import date, datetime +from collections import defaultdict import pysolr from django.conf import settings from forum.models import Post -from sounds.models import Sound +from sounds.models import Sound, SoundAnalysis from utils.text import remove_control_chars from utils.search import SearchEngineBase, SearchResults, SearchEngineException from utils.search.backends.solr_common import SolrQuery, SolrResponseInterpreter @@ -158,7 +159,7 @@ def convert_sound_to_search_engine_document(self, sound, fields_to_include=[]): TODO: Assert that sound object is correct? """ # Document ID (same as sound ID) - document = {'id': sound.id} + document = {'id': sound.id, 'is_sound': True} # Basic sound fields keep_fields = ['username', 'created', 'is_explicit', 'is_remix', 'num_ratings', 'channels', 'md5', @@ -230,6 +231,7 @@ def convert_sound_to_search_engine_document(self, sound, fields_to_include=[]): # the data is already retrieved in the queryset, that optimization would be negligible so we keep it simple. 
             document = {k: v for k, v in document.items() if k in fields_to_include}
             document['id'] = sound.id # Make sure we always include the ID
+            document['is_sound'] = True # Make sure we always mark the document as a sound
 
         return document
 
@@ -378,6 +380,37 @@ def search_process_filter(self, query_filter, only_sounds_within_ids=False, only
         return query_filter
 
     # Sound methods
     def add_sounds_to_index(self, sound_objects, update=False, fields_to_include=[]):
         documents = [self.convert_sound_to_search_engine_document(s, fields_to_include=fields_to_include) for s in sound_objects]
+
+        # If required, collect similarity vectors from all configured analyzers
+        similarity_data = defaultdict(list)
+        if 'similarity_vectors' in fields_to_include or not fields_to_include:
+            sound_ids = [s.id for s in sound_objects]
+            for analyzer_name, config_options in settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS.items():
+                # Map the configured vector size to the corresponding solr dynamic field type
+                vector_solr_field_type = {100: 'sim_vector100'}.get(config_options['vector_size'], None)
+                if vector_solr_field_type is None:
+                    # If the vector size is not supported, then we can't index the vectors generated by the requested analyzer
+                    continue
+                for sa in SoundAnalysis.objects.filter(sound_id__in=sound_ids, analyzer=analyzer_name, analysis_status="OK"):
+                    similarity_vectors_per_analyzer_per_sound = []
+                    data = sa.get_analysis_data_from_file()
+                    if data is not None:
+                        if data[config_options['vector_property_name']] is not None:
+                            similarity_vectors_per_analyzer_per_sound.append({
+                                'is_sound': False,
+                                'analyzer': sa.analyzer,
+                                'timestamp_start': 0,  # This will be used in the future if analyzers generate multiple sound vectors
+                                'timestamp_end': -1,  # This will be used in the future if analyzers generate multiple sound vectors
+                                vector_solr_field_type: data[config_options['vector_property_name']][0:config_options['vector_size']]
+                            })
+                    if similarity_vectors_per_analyzer_per_sound:
+                        similarity_data[sa.sound_id] += similarity_vectors_per_analyzer_per_sound
+
+        # Add collected vectors to the documents created
+        for document in documents:
+            if document['id'] in similarity_data:
+                document['similarity_vectors'] = similarity_data[document['id']]
+
         if update:
             documents = [self.transform_document_into_update_document(d) for d in documents]
         try:
             self.get_sounds_index().add(documents)
         except pysolr.SolrError as e:
@@ -398,7 +431,6 @@ def remove_sounds_from_index(self, sound_objects_or_ids):
             raise SearchEngineException(e)
 
     def remove_all_sounds(self):
-        """Removes all sounds from the search index"""
         try:
             self.get_sounds_index().delete(q="*:*")
         except pysolr.SolrError as e:
@@ -411,6 +443,21 @@ def sound_exists_in_index(self, sound_object_or_id):
         response = self.search_sounds(query_filter=f'id:{sound_id}', offset=0, num_sounds=1)
         return response.num_found > 0
+
+    def get_all_sound_ids_from_index(self):
+        page_size = 2000
+        solr_ids = []
+        solr_count = None
+        current_page = 1
+        while solr_count is None or len(solr_ids) < solr_count:
+            response = self.search_sounds(sort=settings.SEARCH_SOUNDS_SORT_OPTION_DATE_NEW_FIRST,
+                                          offset=(current_page - 1) * page_size,
+                                          num_sounds=page_size)
+            solr_ids += [int(element['id']) for element in response.docs]
+            solr_count = response.num_found
+            current_page += 1
+        return sorted(solr_ids)
 
     def search_sounds(self, textual_query='', query_fields=None, query_filter='', offset=0, current_page=None,
                       num_sounds=settings.SOUNDS_PER_PAGE, sort=settings.SEARCH_SOUNDS_SORT_OPTION_AUTOMATIC,
                       group_by_pack=False, num_sounds_per_pack_group=1, facets=None, only_sounds_with_pack=False,
-                      only_sounds_within_ids=False, group_counts_as_one_in_facets=False):
+                      only_sounds_within_ids=False, group_counts_as_one_in_facets=False, similar_to=None):
 
         query = SolrQuery()
+        if similar_to is None:
+            # Usual search query, no similarity search
+
+            # Process search fields: replace "db" field names by solr field names and set default weights if needed
+            if query_fields is None:
+                # If no fields provided, use the default
+                query_fields = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS
+            if isinstance(query_fields, list):
+                query_fields = [self.add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)) for field in query_fields]
+            elif isinstance(query_fields, dict):
+                # Also remove fields with weight <= 0
+                query_fields = [(self.add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)), weight)
+                                for field, weight in query_fields.items() if weight > 0]
+
+            # Set main query options
+            query.set_dismax_query(textual_query, query_fields=query_fields)
+
+            # Process filter
+            query_filter = self.search_process_filter(query_filter,
+                                                      only_sounds_within_ids=only_sounds_within_ids,
+                                                      only_sounds_with_pack=only_sounds_with_pack)
+
+            # Set other query options
+            if current_page is not None:
+                offset = (current_page - 1) * num_sounds
+            query.set_query_options(start=offset,
+                                    rows=num_sounds,
+                                    field_list=["id", "score"],  # We only want the sound IDs of the results as we load data from DB
+                                    filter_query=query_filter,
+                                    sort=self.search_process_sort(sort))
+
+            # Configure facets
+            if facets is not None:
+                facet_fields = [FIELD_NAMES_MAP[field_name] for field_name, _ in facets.items()]
+                query.add_facet_fields(*facet_fields)
+                query.set_facet_options_default(**SOLR_SOUND_FACET_DEFAULT_OPTIONS)
+                for field_name, extra_options in facets.items():
+                    query.set_facet_options(FIELD_NAMES_MAP[field_name], **extra_options)
+
+            # Configure grouping
+            if group_by_pack:
+                query.set_group_field(group_field="grouping_pack")
+                query.set_group_options(
+                    group_func=None,
+                    group_query=None,
+                    group_rows=10,  # TODO: if limit is lower than rows and start=0, this should probably be equal to limit
+                    group_start=0,
+                    group_limit=num_sounds_per_pack_group,  # This is the number of documents that will be returned for each group.
+                    group_offset=0,
+                    group_sort=None,
+                    group_sort_ingroup=None,
+                    group_format='grouped',
+                    group_main=False,
+                    group_num_groups=True,
+                    group_cache_percent=0,
+                    group_truncate=group_counts_as_one_in_facets)
+        else:
+
+            vector = None
+            if isinstance(similar_to, list):
+                vector = similar_to  # we allow vectors to be passed directly
+            else:
+                # similar_to should be a sound_id
+                sa = SoundAnalysis.objects.filter(sound_id=similar_to, analyzer=settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER, analysis_status="OK")
+                config_options = settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS[settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER]
+                if sa.exists():
+                    data = sa.first().get_analysis_data_from_file()
+                    if data is not None:
+                        vector = data[config_options['vector_property_name']][0:config_options['vector_size']]
+
+            # Set query
+            if vector is not None:
+                max_similar_sounds = 1000000  # TODO: evaluate the performance impact of this so we can set it as high as possible. We want this to be high so that we get a large pool of similar sounds to which the filters can then be applied. Ideally we'd like this to be the whole collection.
+                serialized_vector = ','.join([str(n) for n in vector])
+                query.set_query(f'{{!knn f=sim_vector100 topK={max_similar_sounds}}}[{serialized_vector}]')
+
+                # Process filter
+                query_filter = self.search_process_filter(query_filter,
+                                                          only_sounds_within_ids=only_sounds_within_ids,
+                                                          only_sounds_with_pack=only_sounds_with_pack)
+
+                # Set other query options
+                if current_page is not None:
+                    offset = (current_page - 1) * num_sounds
+
+                filter_query = [f'analyzer:{settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER}']  # Add basic filter to only get similarity vectors from the selected analyzer
+                for part in query_filter.split('+'):
+                    if part:
+                        # Add extra query filters to the search query, but using the appropriate prefix to make sure they are applied to the root documents
+                        filter_query.append('{!child of=\"*:* -_nest_path_:*\"}' + part)
+
+                query.set_query_options(start=offset,
+                                        rows=num_sounds,
+                                        field_list=["id", "score"],  # We only want the sound IDs of the results as we load data from DB. In the future we might add timestamp_start/timestamp_end
+                                        filter_query=filter_query,
+                                        sort=['score desc'])
+
+                # NOTE: ATM we cannot add more query options (faceting, etc.) to similarity search because it does not return
+                # root documents but child documents (so that we can get multiple matches per sound if there are multiple vectors indexed).
+                # If we manage to improve the query so that it returns root documents, we can add more query options here. Or we could also
+                # first do the similarity search query and then do a normal search with the results of the similarity search as a filter...
+            else:
+                query.set_query('')
+                # If there is no vector found we can't do similarity search. Configure the query to return no results
 
-        # Process search fields: replace "db" field names by solr field names and set default weights if needed
-        if query_fields is None:
-            # If no fields provided, use the default
-            query_fields = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS
-        if isinstance(query_fields, list):
-            query_fields = [self.add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)) for field in query_fields]
-        elif isinstance(query_fields, dict):
-            # Also remove fields with weight <= 0
-            query_fields = [(self.add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)), weight)
-                            for field, weight in query_fields.items() if weight > 0]
-
-        # Set main query options
-        query.set_dismax_query(textual_query, query_fields=query_fields)
-
-        # Process filter
-        query_filter = self.search_process_filter(query_filter,
-                                                  only_sounds_within_ids=only_sounds_within_ids,
-                                                  only_sounds_with_pack=only_sounds_with_pack)
-
-        # Set other query options
-        if current_page is not None:
-            offset = (current_page - 1) * num_sounds
-        query.set_query_options(start=offset,
-                                rows=num_sounds,
-                                field_list=["id", "score"],  # We only want the sound IDs of the results as we load data from DB
-                                filter_query=query_filter,
-                                sort=self.search_process_sort(sort))
-
-        # Configure facets
-        if facets is not None:
-            facet_fields = [FIELD_NAMES_MAP[field_name] for field_name, _ in facets.items()]
-            query.add_facet_fields(*facet_fields)
-            query.set_facet_options_default(**SOLR_SOUND_FACET_DEFAULT_OPTIONS)
-            for field_name, extra_options in facets.items():
-                query.set_facet_options(FIELD_NAMES_MAP[field_name], **extra_options)
-
-        # Configure grouping
-        if group_by_pack:
-            query.set_group_field(group_field="grouping_pack")
-            query.set_group_options(
-                group_func=None,
-                group_query=None,
-                group_rows=10, # TODO: if limit is lower than
rows and start=0, this should probably be equal to limit - group_start=0, - group_limit=num_sounds_per_pack_group, # This is the number of documents that will be returned for each group. - group_offset=0, - group_sort=None, - group_sort_ingroup=None, - group_format='grouped', - group_main=False, - group_num_groups=True, - group_cache_percent=0, - group_truncate=group_counts_as_one_in_facets) # Do the query! # Note: we create a SearchResults with the same members as SolrResponseInterpreter (the response from .search()). # We do it in this way to conform to SearchEngine.search_sounds definition which must return SearchResults try: - results = self.get_sounds_index().search(**query.as_kwargs()) + results = self.get_sounds_index().search(**query.as_kwargs(force_sounds=similar_to is None)) # Solr uses a string for the id field, but django uses an int. Convert the id in all results to int # before use to avoid issues - docs = results.docs - for d in docs: - d["id"] = int(d["id"]) + for d in results.docs: + # Get the sound ids from the results + d["id"] = int(d["id"] if similar_to is None else d["id"].split('/')[0]) return SearchResults( docs=results.docs, num_found=results.num_found, @@ -505,7 +602,7 @@ def get_random_sound_id(self): query.set_query("*:*") query.set_query_options(start=0, rows=1, field_list=["id"], filter_query=filter_query, sort=sort) try: - response = self.get_sounds_index().search(search_handler="select", **query.as_kwargs()) + response = self.get_sounds_index().search(search_handler="select", **query.as_kwargs(force_sounds=True)) docs = response.docs if docs: return int(docs[0]['id']) @@ -535,7 +632,6 @@ def remove_forum_posts_from_index(self, forum_post_objects_or_ids): raise SearchEngineException(e) def remove_all_forum_posts(self): - """Removes all forum posts from the search index""" try: self.get_forum_index().delete(q="*:*") except pysolr.SolrError as e: @@ -613,7 +709,7 @@ def get_user_tags(self, username): query.add_facet_fields("tag") query.set_facet_options("tag", limit=10, mincount=1) try: - results = self.get_sounds_index().search(**query.as_kwargs()) + results = self.get_sounds_index().search(**query.as_kwargs(force_sounds=True)) return results.facets['tag'] except pysolr.SolrError as e: raise SearchEngineException(e) @@ -626,7 +722,7 @@ def get_pack_tags(self, username, pack_name): query.add_facet_fields("tag") query.set_facet_options("tag", limit=20, mincount=1) try: - results = self.get_sounds_index().search(**query.as_kwargs()) + results = self.get_sounds_index().search(**query.as_kwargs(force_sounds=True)) return results.facets['tag'] except pysolr.SolrError as e: raise SearchEngineException(e) diff --git a/utils/search/backends/solr_common.py b/utils/search/backends/solr_common.py index 7128d6aa9..1014e9b75 100644 --- a/utils/search/backends/solr_common.py +++ b/utils/search/backends/solr_common.py @@ -268,12 +268,19 @@ def set_group_options(self, group_func=None, group_query=None, group_rows=10, gr self.params['group.truncate'] = group_truncate self.params['group.cache.percent'] = group_cache_percent - def as_kwargs(self): + def as_kwargs(self, force_sounds=False): """Return params in a way that can be passed to pysolr commands as kwargs""" params = {k: v for k, v in self.params.items() if v is not None} for k, v in params.items(): if isinstance(v, bool): params[k] = json.dumps(v) + # If 'force_sounds', we want to make sure we only include sound documents in the query and not any child documents. Add an extra fq to force that. 
+        if force_sounds:
+            current_fq = params['fq']
+            if isinstance(current_fq, list):
+                params.update({'fq': current_fq + ['is_sound:1']})
+            else:
+                params.update({'fq': [current_fq, 'is_sound:1']})
         return params
 
 
diff --git a/utils/search/backends/test_search_engine_backend.py b/utils/search/backends/test_search_engine_backend.py
index dd6dff4d0..707edbc27 100644
--- a/utils/search/backends/test_search_engine_backend.py
+++ b/utils/search/backends/test_search_engine_backend.py
@@ -388,6 +388,11 @@ def test_search_enginge_backend_sounds(self):
         # Re-index all sounds to leave index in "correct" state for next tests
         self.search_engine.add_sounds_to_index(sounds)
 
+        # Test that the method to get all sound IDs works as expected
+        sound_ids = self.search_engine.get_all_sound_ids_from_index()
+        sound_ids_db = sorted([s.id for s in sounds])
+        assert_and_continue(sound_ids_db == sound_ids, 'get_all_sound_ids_from_index returned wrong sound IDs')
+
         self.sound_check_mandatory_doc_fields()
         self.sound_check_random_sound()
         self.sound_check_offsets()
diff --git a/utils/search/search_sounds.py b/utils/search/search_sounds.py
index 79d999497..f56610fa5 100644
--- a/utils/search/search_sounds.py
+++ b/utils/search/search_sounds.py
@@ -446,21 +446,10 @@ def get_all_sound_ids_from_search_engine(page_size=2000):
     """
     console_logger.info("Getting all sound ids from search engine")
     search_engine = get_search_engine()
-    solr_ids = []
-    solr_count = None
-    current_page = 1
     try:
-        while solr_count is None or len(solr_ids) < solr_count:
-            response = search_engine.search_sounds(query_filter="*:*",
-                                                   sort=settings.SEARCH_SOUNDS_SORT_OPTION_DATE_NEW_FIRST,
-                                                   offset=(current_page - 1) * page_size,
-                                                   num_sounds=page_size)
-            solr_ids += [int(element['id']) for element in response.docs]
-            solr_count = response.num_found
-            current_page += 1
+        return search_engine.get_all_sound_ids_from_index()
     except SearchEngineException as e:
         search_logger.info(f"Could not retrieve all sound IDs from search engine: {str(e)}")
-    return sorted(solr_ids)
 
 
 def get_random_sound_id_from_search_engine():
diff --git a/utils/search/solr9/cores/freesound/conf/schema.xml b/utils/search/solr9/cores/freesound/conf/schema.xml
index 323429efb..6625443e3 100644
--- a/utils/search/solr9/cores/freesound/conf/schema.xml
+++ b/utils/search/solr9/cores/freesound/conf/schema.xml
@@ -182,6 +182,7 @@
+    <!-- one field definition lost in extraction; presumably the new is_sound field -->
@@ -238,6 +239,14 @@
+    <!-- eight lines lost in extraction; presumably the nested-document fields and the
+         100-dimension sim_vector100 vector field type and field -->

From 0d793d1b28978c9c98080a624971ec7e48187128 Mon Sep 17 00:00:00 2001
From: ffont
Date: Wed, 24 Jan 2024 09:39:28 +0100
Subject: [PATCH 07/28] Add more tasks to vscode workspace

---
 freesound.code-workspace | 43 ++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/freesound.code-workspace b/freesound.code-workspace
index b159aadd8..4dc94ba62 100644
--- a/freesound.code-workspace
+++ b/freesound.code-workspace
@@ -34,28 +34,23 @@
     "tasks": {
         "version": "2.0.0",
         "tasks": [
-            {
-                "label": "Run web and search",
-                "type": "shell",
-                "command": "docker-compose up web search",
-                "problemMatcher": []
-            },
+
             {
                 "label": "Docker compose build",
                 "type": "shell",
-                "command": "docker-compose build",
+                "command": "docker compose build",
                 "problemMatcher": []
             },
             {
                 "label": "Build static",
                 "type": "shell",
-                "command": "docker-compose run --rm web npm run build && docker-compose run --rm web python manage.py collectstatic --clear --noinput",
+                "command": "docker compose run --rm web npm run build && docker compose run --rm web python manage.py collectstatic --clear --noinput",
                 "problemMatcher": []
            },
            {
"label": "Install static", "type": "shell", - "command": "docker-compose run --rm web npm install --force", + "command": "docker compose run --rm web npm install --force", "problemMatcher": [] }, { @@ -67,37 +62,55 @@ { "label": "Create caches", "type": "shell", - "command": "docker-compose run --rm web python manage.py create_front_page_caches && docker-compose run --rm web python manage.py create_random_sounds && docker-compose run --rm web python manage.py generate_geotags_bytearray", + "command": "docker compose run --rm web python manage.py create_front_page_caches && docker compose run --rm web python manage.py create_random_sounds && docker compose run --rm web python manage.py generate_geotags_bytearray", "problemMatcher": [] }, { "label": "Run tests", "type": "shell", - "command": "docker-compose run --rm web python manage.py test --settings=freesound.test_settings", + "command": "docker compose run --rm web python manage.py test --settings=freesound.test_settings", "problemMatcher": [] }, { "label": "Run tests verbose with warnings", "type": "shell", - "command": "docker-compose run --rm web python -Wa manage.py test -v3 --settings=freesound.test_settings", + "command": "docker compose run --rm web python -Wa manage.py test -v3 --settings=freesound.test_settings", "problemMatcher": [] }, { "label": "Migrate", "type": "shell", - "command": "docker-compose run --rm web python manage.py migrate", + "command": "docker compose run --rm web python manage.py migrate", "problemMatcher": [] }, { "label": "Make migrations", "type": "shell", - "command": "docker-compose run --rm web python manage.py makemigrations", + "command": "docker compose run --rm web python manage.py makemigrations", "problemMatcher": [] }, { "label": "Shell plus", "type": "shell", - "command": "docker-compose run --rm web python manage.py shell_plus", + "command": "docker compose run --rm web python manage.py shell_plus", + "problemMatcher": [] + }, + { + "label": "Reindex search engine", + "type": "shell", + "command": "docker compose run --rm web python manage.py reindex_search_engine_sounds && docker compose run --rm web python manage.py reindex_search_engine_forum", + "problemMatcher": [] + }, + { + "label": "Post dirty sounds to search engine", + "type": "shell", + "command": "docker compose run --rm web python manage.py post_dirty_sounds_to_search_engine", + "problemMatcher": [] + }, + { + "label": "Orchestrate analysis", + "type": "shell", + "command": "docker compose run --rm web python manage.py orchestrate_analysis", "problemMatcher": [] } ] From edddb731a90aa7c15cf3305c2ff63bdb1212d712 Mon Sep 17 00:00:00 2001 From: ffont Date: Wed, 24 Jan 2024 09:40:01 +0100 Subject: [PATCH 08/28] Replace docker-compose by docker compose in docs --- DEVELOPERS.md | 6 +++--- README.md | 34 +++++++++++++++++----------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 3a3a02c8a..7e4a6aa4d 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -144,7 +144,7 @@ If a new search engine backend class is to be implemented, it must closely follo utils.search.SearchEngineBase docstrings. There is a Django management command that can be used in order to test the implementation of a search backend. 
You can run it like:
 
-    docker-compose run --rm web python manage.py test_search_engine_backend -fsw --backend utils.search.backends.solr9pysolr.Solr9PySolrSearchEngine
+    docker compose run --rm web python manage.py test_search_engine_backend -fsw --backend utils.search.backends.solr9pysolr.Solr9PySolrSearchEngine
 
 Please read carefully the documentation of the management command to better understand how it works and how it does the testing.
 
@@ -217,7 +217,7 @@ https://github.com/mtg/freesound-audio-analyzers. The docker compose of the main
 services for the external analyzers which depend on docker images having been previously built from the
 `freesound-audio-analyzers` repository. To build these images you simply need to checkout the code repository and run
 `make`. Once the images are built, Freesound can be run including the external analyzer services by means of the docker compose
-file by running `docker-compose --profile analyzers up`
+file by running `docker compose --profile analyzers up`
 
 The new analysis pipeline uses a job queue based on Celery/RabbitMQ. RabbitMQ console can be accessed at port `5673`
 (e.g. `http://localhost:5673/rabbitmq-admin`) and using `guest` as both username and password. Also, accessing
@@ -231,7 +231,7 @@
 for Freesound async tasks other than analysis).
 
 - Make sure that there are no outstanding deprecation warnings for the version of django that we are upgrading to.
 
-    docker-compose run --rm web python -Wd manage.py test
+    docker compose run --rm web python -Wd manage.py test
 
 Check for warnings of the form `RemovedInDjango110Warning` (TODO: Make tests fail if a warning occurs)
diff --git a/README.md b/README.md
index d637b50f3..9e2ff7a93 100644
--- a/README.md
+++ b/README.md
@@ -65,35 +65,35 @@ Below are instructions for setting up a local Freesound installation for develop
 
 8. Build all Docker containers. The first time you run this command it can take a while as a number of Docker images need to be downloaded and things need to be installed and compiled.
 
-        docker-compose build
+        docker compose build
 
 9. Download the [Freesound development database dump](https://drive.google.com/file/d/11z9s8GyYkVlmWdEsLSwUuz0AjZ8cEvGy/view?usp=share_link) (~6MB), uncompress it and place the resulting `freesound-small-dev-dump-2023-09.sql` in the `freesound-data/db_dev_dump/` directory. Then run the database container and load the data into it using the commands below. You should get permission to download this file from Freesound admins.
 
-        docker-compose up -d db
-        docker-compose run --rm db psql -h db -U freesound -d freesound -f freesound-data/db_dev_dump/freesound-small-dev-dump-2023-09.sql
+        docker compose up -d db
+        docker compose run --rm db psql -h db -U freesound -d freesound -f freesound-data/db_dev_dump/freesound-small-dev-dump-2023-09.sql
         # or if the above command does not work, try this one
-        docker-compose run --rm --no-TTY db psql -h db -U freesound -d freesound < freesound-data/db_dev_dump/freesound-small-dev-dump-2023-09.sql
+        docker compose run --rm --no-TTY db psql -h db -U freesound -d freesound < freesound-data/db_dev_dump/freesound-small-dev-dump-2023-09.sql
 
 10. Update database by running Django migrations
 
-        docker-compose run --rm web python manage.py migrate
+        docker compose run --rm web python manage.py migrate
 
 11. Create a superuser account to be able to log in to the local Freesound website and to the admin site
 
-        docker-compose run --rm web python manage.py createsuperuser
+        docker compose run --rm web python manage.py createsuperuser
 
 12.
Install static build dependencies
 
-        docker-compose run --rm web npm install --force
+        docker compose run --rm web npm install --force
 
 13. Build static files. Note that this step will need to be re-run every time there are changes in Freesound's static code (JS, CSS and static media files).
 
-        docker-compose run --rm web npm run build
-        docker-compose run --rm web python manage.py collectstatic --noinput
+        docker compose run --rm web npm run build
+        docker compose run --rm web python manage.py collectstatic --noinput
 
 14. Run services 🎉
 
-        docker-compose up
+        docker compose up
 
     When running this command, the most important services that make Freesound work will be run locally. This includes the web application and database, but also the search engine, cache manager, queue manager and asynchronous workers, including audio processing.
 
@@ -102,24 +102,24 @@ Below are instructions for setting up a local Freesound installation for develop
 15. Build the search index, so you can search for sounds and forum posts
 
         # Open a new terminal window so the services started in the previous step keep running
-        docker-compose run --rm web python manage.py reindex_search_engine_sounds
-        docker-compose run --rm web python manage.py reindex_search_engine_forum
+        docker compose run --rm web python manage.py reindex_search_engine_sounds
+        docker compose run --rm web python manage.py reindex_search_engine_forum
 
     After following the steps, you'll have a functional Freesound installation up and running, with the most relevant services properly configured. You can run Django's shell plus command like this:
 
-        docker-compose run --rm web python manage.py shell_plus
+        docker compose run --rm web python manage.py shell_plus
 
     Because the `web` container mounts a named volume for the home folder of the user running the shell plus process, command history should be kept between container runs :)
 
-16. (extra step) The steps above will get Freesound running, but to save resources in your local machine some non-essential services will not be started by default. If you look at the `docker-compose.yml` file, you'll see that some services are marked with the profile `analyzers` or `all`. These services include sound similarity, search results clustering and the audio analyzers. To run these services you need to explicitly tell `docker-compose` using the `--profile` (note that some services need additional configuration steps (see *Freesound analysis pipeline* section in `DEVELOPERS.md`):
+16. (extra step) The steps above will get Freesound running, but to save resources in your local machine some non-essential services will not be started by default. If you look at the `docker-compose.yml` file, you'll see that some services are marked with the profile `analyzers` or `all`. These services include sound similarity, search results clustering and the audio analyzers.
To run these services you need to explicitly tell `docker compose` using the `--profile` flag (note that some services need additional configuration steps; see the *Freesound analysis pipeline* section in `DEVELOPERS.md`):
 
-        docker-compose --profile analyzers up  # To run all basic services + sound analyzers
-        docker-compose --profile all up        # To run all services
+        docker compose --profile analyzers up  # To run all basic services + sound analyzers
+        docker compose --profile all up        # To run all services
 
 ### Running tests
 
 You can run tests using the Django test runner in the `web` container like this:
 
-    docker-compose run --rm web python manage.py test --settings=freesound.test_settings
+    docker compose run --rm web python manage.py test --settings=freesound.test_settings

From 475a2221219c15337beb420d926486cd4abb9ce0 Mon Sep 17 00:00:00 2001
From: ffont
Date: Wed, 24 Jan 2024 09:40:29 +0100
Subject: [PATCH 09/28] Add parameter to choose analyzer for similarity

---
 utils/search/backends/solr555pysolr.py | 104 ++++++++++++++-----------
 1 file changed, 58 insertions(+), 46 deletions(-)

diff --git a/utils/search/backends/solr555pysolr.py b/utils/search/backends/solr555pysolr.py
index 2e9be07db..2822431d2 100644
--- a/utils/search/backends/solr555pysolr.py
+++ b/utils/search/backends/solr555pysolr.py
@@ -86,6 +86,11 @@
 }
 
 
+SOLR_VECTOR_FIELDS_DIMENSIONS_MAP = {
+    100: 'sim_vector100',
+}
+
+
 SOLR_SOUND_FACET_DEFAULT_OPTIONS = {
     'limit': 5,
     'sort': True,
@@ -387,7 +392,7 @@ def add_sounds_to_index(self, sound_objects, update=False, fields_to_include=[])
             sound_ids = [s.id for s in sound_objects]
             for analyzer_name, config_options in settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS.items():
                 # If we should index similarity data, add it to the documents
-                vector_solr_field_type = {100: 'sim_vector100'}.get(config_options['vector_size'], None)
+                vector_solr_field_type = SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(config_options['vector_size'], None)
                 if vector_solr_field_type is None:
                     # If the vector size is not supported, then we can't index the vectors generated by the requested analyzer
                     continue
@@ -461,7 +466,8 @@ def get_all_sound_ids_from_index(self):
     def search_sounds(self, textual_query='', query_fields=None, query_filter='', offset=0, current_page=None,
                       num_sounds=settings.SOUNDS_PER_PAGE, sort=settings.SEARCH_SOUNDS_SORT_OPTION_AUTOMATIC,
                       group_by_pack=False, num_sounds_per_pack_group=1, facets=None, only_sounds_with_pack=False,
-                      only_sounds_within_ids=False, group_counts_as_one_in_facets=False, similar_to=None):
+                      only_sounds_within_ids=False, group_counts_as_one_in_facets=False,
+                      similar_to=None, similar_to_analyzer=settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER):
 
         query = SolrQuery()
 
@@ -522,53 +528,59 @@ def search_sounds(self, textual_query='', query_fields=None, query_filter='', of
                     group_cache_percent=0,
                     group_truncate=group_counts_as_one_in_facets)
         else:
+            # Similarity search!
-            vector = None
-            if isinstance(similar_to, list):
-                vector = similar_to  # we allow vectors to be passed directly
-            else:
-                # similar_to should be a sound_id
-                sa = SoundAnalysis.objects.filter(sound_id=similar_to, analyzer=settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER, analysis_status="OK")
-                config_options = settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS[settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER]
-                if sa.exists():
-                    data = sa.first().get_analysis_data_from_file()
-                    if data is not None:
-                        vector = data[config_options['vector_property_name']][0:config_options['vector_size']]
-
-            # Set query
-            if vector is not None:
-                max_similar_sounds = 1000000  # TODO: evaluate the performance impact of this so we can set it as high as possible. We want this to be high so that we get a large pool of similar sounds to which the filters can then be applied. Ideally we'd like this to be the whole collection.
-                serialized_vector = ','.join([str(n) for n in vector])
-                query.set_query(f'{{!knn f=sim_vector100 topK={max_similar_sounds}}}[{serialized_vector}]')
-
-                # Process filter
-                query_filter = self.search_process_filter(query_filter,
-                                                          only_sounds_within_ids=only_sounds_within_ids,
-                                                          only_sounds_with_pack=only_sounds_with_pack)
-
-                # Set other query options
-                if current_page is not None:
-                    offset = (current_page - 1) * num_sounds
+            # We first set an empty query that will return no results and will be used by default if similarity can't be performed
+            query.set_query('')
+            if similar_to_analyzer in settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS:
+                # Get target vector from sound or from parameter
+                vector = None
+                extra_offset = 0
+                if isinstance(similar_to, list):
+                    vector = similar_to  # we allow vectors to be passed directly
+                    vector_field_name = SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(len(vector), None)
+                else:
+                    # similar_to should be a sound_id
+                    extra_offset = 1  # We add 1 to the offset so that we don't get the sound itself as a result
+                    sa = SoundAnalysis.objects.filter(sound_id=similar_to, analyzer=similar_to_analyzer, analysis_status="OK")
+                    config_options = settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS[similar_to_analyzer]
+                    vector_field_name = SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(config_options['vector_size'], None)
+                    if sa.exists():
+                        data = sa.first().get_analysis_data_from_file()
+                        if data is not None:
+                            vector = data[config_options['vector_property_name']][0:config_options['vector_size']]
 
-                filter_query = [f'analyzer:{settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER}']  # Add basic filter to only get similarity vectors from the selected analyzer
-                for part in query_filter.split('+'):
-                    if part:
-                        # Add extra query filters to the search query, but using the appropriate prefix to make sure they are applied to the root documents
-                        filter_query.append('{!child of=\"*:* -_nest_path_:*\"}' + part)
-
-                query.set_query_options(start=offset,
-                                        rows=num_sounds,
-                                        field_list=["id", "score"],  # We only want the sound IDs of the results as we load data from DB. In the future we might add timestamp_start/timestamp_end
-                                        filter_query=filter_query,
-                                        sort=['score desc'])
+                # Set query
+                if vector is not None and vector_field_name is not None:
+                    max_similar_sounds = 1000000  # TODO: evaluate the performance impact of this so we can set it as high as possible. We want this to be high so that we get a large pool of similar sounds to which the filters can then be applied. Ideally we'd like this to be the whole collection.
+                    serialized_vector = ','.join([str(n) for n in vector])
+                    query.set_query(f'{{!knn f={vector_field_name} topK={max_similar_sounds}}}[{serialized_vector}]')
 
-                # NOTE: ATM we cannot add more query options (faceting, etc.) to similarity search because it does not return
-                # root documents but child documents (so that we can get multiple matches per sound if there are multiple vectors indexed).
-                # If we manage to improve the query so that it returns root documents, we can add more query options here. Or we could also
-                # first do the similarity search query and then do a normal search with the results of the similarity search as a filter...
-            else:
-                query.set_query('')
-                # If there is no vector found we can't do similarity search. Configure the query to return no results
+                    # Process filter
+                    query_filter = self.search_process_filter(query_filter,
+                                                              only_sounds_within_ids=only_sounds_within_ids,
+                                                              only_sounds_with_pack=only_sounds_with_pack)
+
+                    # Set other query options
+                    if current_page is not None:
+                        offset = (current_page - 1) * num_sounds
+
+                    filter_query = [f'analyzer:{similar_to_analyzer}']  # Add basic filter to only get similarity vectors from the selected analyzer
+                    for part in query_filter.split('+'):
+                        if part:
+                            # Add extra query filters to the search query, but using the appropriate prefix to make sure they are applied to the root documents
+                            filter_query.append('{!child of=\"is_sound:1\"}' + part)
+
+                    query.set_query_options(start=offset + extra_offset,
+                                            rows=num_sounds,
+                                            field_list=["id", "score"],  # We only want the sound IDs of the results as we load data from DB. In the future we might add timestamp_start/timestamp_end
+                                            filter_query=filter_query,
+                                            sort=['score desc'])
+
+                    # NOTE: ATM we cannot add more query options (faceting, etc.) to similarity search because it does not return
+                    # root documents but child documents (so that we can get multiple matches per sound if there are multiple vectors indexed).
+                    # If we manage to improve the query so that it returns root documents, we can add more query options here. Or we could also
+                    # first do the similarity search query and then do a normal search with the results of the similarity search as a filter...
 
         # Do the query!
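In practice, the hunks above reduce the similarity request to a Solr knn query over child documents plus sound-level filters re-applied through the parent/child join. A minimal sketch of that assembly, assuming a 100-dimensional vector and the `sim_vector100` / `is_sound` / `analyzer` fields introduced earlier in this series; `build_knn_params` is an illustrative helper, not part of the patch:

    # Illustrative sketch only: build Solr request params for a knn similarity
    # query shaped like the one produced by search_sounds() above.
    def build_knn_params(vector, analyzer, extra_filters=None, rows=15, top_k=1000000):
        serialized_vector = ','.join(str(n) for n in vector)
        params = {
            # knn query parser over the 100-dimension child-document vector field
            'q': f'{{!knn f=sim_vector100 topK={top_k}}}[{serialized_vector}]',
            # only match vectors produced by the requested analyzer
            'fq': [f'analyzer:{analyzer}'],
            'fl': 'id,score',
            'sort': 'score desc',
            'rows': rows,
        }
        for part in (extra_filters or []):
            # sound-level filters refer to root document fields, so they are
            # applied through the parent/child join, as in the patch
            params['fq'].append('{!child of="is_sound:1"}' + part)
        return params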
From 442b3a3e2472a7fa5e373d8d238cdb65a370fd2c Mon Sep 17 00:00:00 2001
From: ffont
Date: Wed, 24 Jan 2024 10:47:11 +0100
Subject: [PATCH 10/28] Handle case with no valid embeddings

---
 utils/search/backends/solr555pysolr.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/utils/search/backends/solr555pysolr.py b/utils/search/backends/solr555pysolr.py
index 2822431d2..8858df5be 100644
--- a/utils/search/backends/solr555pysolr.py
+++ b/utils/search/backends/solr555pysolr.py
@@ -548,7 +548,10 @@ def search_sounds(self, textual_query='', query_fields=None, query_filter='', of
                     if sa.exists():
                         data = sa.first().get_analysis_data_from_file()
                         if data is not None:
-                            vector = data[config_options['vector_property_name']][0:config_options['vector_size']]
+                            vector_raw = data[config_options['vector_property_name']]
+                            if vector_raw is not None:
+                                vector = vector_raw[0:config_options['vector_size']]
+
 
                 # Set query
                 if vector is not None and vector_field_name is not None:

From 85fd8d2d2aaba11838d1ca9c3803e459cf247712 Mon Sep 17 00:00:00 2001
From: ffont
Date: Wed, 24 Jan 2024 10:48:41 +0100
Subject: [PATCH 11/28] Get appropriate similarity state when search engine
 similarity is enabled

We used to store the similarity_state field in the sound model, but it is no
longer needed when using the search engine based similarity.

---
 search/tests.py                      | 39 +++++++++++++++++--------
 sounds/models.py                     | 40 +++++++++++++++++++++++++---
 sounds/templatetags/display_sound.py |  6 ++---
 sounds/tests/test_sound.py           |  2 ++
 sounds/views.py                      | 15 ++++++-----
 templates/sounds/player.html         |  2 +-
 templates/sounds/sound.html          |  2 +-
 7 files changed, 80 insertions(+), 26 deletions(-)

diff --git a/search/tests.py b/search/tests.py
index 8f76da1e0..388a55337 100644
--- a/search/tests.py
+++ b/search/tests.py
@@ -20,7 +20,7 @@
 
 from django.core.cache import cache
 from django.test import TestCase
-from django.test.utils import skipIf
+from django.test.utils import skipIf, override_settings
 from django.urls import reverse
 from sounds.models import Sound
 from utils.search import SearchResults, SearchResultsPaginator
@@ -142,6 +142,7 @@ def test_search_page_response_ok(self, perform_search_engine_query):
         self.assertEqual(resp.context['error_text'], None)
         self.assertEqual(len(resp.context['docs']), self.NUM_RESULTS)
 
+
     @mock.patch('search.views.perform_search_engine_query')
     def test_search_page_num_queries(self, perform_search_engine_query):
         perform_search_engine_query.return_value = self.perform_search_engine_query_response
@@ -155,16 +156,32 @@ def test_search_page_num_queries(self, perform_search_engine_query):
         cache.clear()
         with self.assertNumQueries(1):
             self.client.get(reverse('sounds-search') + '?cm=1')
-
-        # Now check number of queries when displaying results as packs (i.e., searching for packs)
-        cache.clear()
-        with self.assertNumQueries(5):
-            self.client.get(reverse('sounds-search') + '?only_p=1')
-
-        # Also check packs when displaying in grid mode
-        cache.clear()
-        with self.assertNumQueries(5):
-            self.client.get(reverse('sounds-search') + '?only_p=1&cm=1')
+
+        with override_settings(USE_SEARCH_ENGINE_SIMILARITY=True):
+            # When using search engine similarity, there'll be one extra query performed to get the similarity status of the sounds
+
+            # Now check number of queries when displaying results as packs (i.e., searching for packs)
+            cache.clear()
+            with self.assertNumQueries(6):
+                self.client.get(reverse('sounds-search') + '?only_p=1')
+
+            # Also check packs when displaying in grid mode
+            cache.clear()
+            with self.assertNumQueries(6):
+                self.client.get(reverse('sounds-search') + '?only_p=1&cm=1')
+
+        with override_settings(USE_SEARCH_ENGINE_SIMILARITY=False):
+            # When not using search engine similarity, there'll be one less query performed as the similarity state is retrieved directly from the sound object
+
+            # Now check number of queries when displaying results as packs (i.e., searching for packs)
+            cache.clear()
+            with self.assertNumQueries(5):
+                self.client.get(reverse('sounds-search') + '?only_p=1')
+
+            # Also check packs when displaying in grid mode
+            cache.clear()
+            with self.assertNumQueries(5):
+                self.client.get(reverse('sounds-search') + '?only_p=1&cm=1')
 
     @mock.patch('search.views.perform_search_engine_query')
     def test_search_page_with_filters(self, perform_search_engine_query):
diff --git a/sounds/models.py b/sounds/models.py
index ed47de7f4..817c4c6dc 100644
--- a/sounds/models.py
+++ b/sounds/models.py
@@ -412,9 +412,15 @@ def get_analyzers_data_left_join_sql(self):
 
     def get_analysis_state_essentia_exists_sql(self):
         """Returns the SQL bits to add analysis_state_essentia_exists to the returned data indicating if there is a
-        SoundAnalysis objects existing for th given sound_id for the essentia analyzer and with status OK"""
+        SoundAnalysis object existing for the given sound_id for the essentia analyzer and with status OK"""
         return f" exists(select 1 from sounds_soundanalysis where sounds_soundanalysis.sound_id = sound.id AND sounds_soundanalysis.analyzer = '{settings.FREESOUND_ESSENTIA_EXTRACTOR_NAME}' AND sounds_soundanalysis.analysis_status = 'OK') as analysis_state_essentia_exists,"
 
+    def get_search_engine_similarity_state_sql(self):
+        """Returns the SQL bits to add search_engine_similarity_state to the returned data indicating if there is a
+        SoundAnalysis object for the default similarity analyzer (settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER),
+        the given sound_id, and with status OK"""
+        return f" exists(select 1 from sounds_soundanalysis where sounds_soundanalysis.sound_id = sound.id AND sounds_soundanalysis.analyzer = '{settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER}' AND sounds_soundanalysis.analysis_status = 'OK') as search_engine_similarity_state,"
+
     def bulk_query_solr(self, sound_ids):
         """For each sound, get all fields needed to index the sound in Solr. Using this custom query to avoid the need
         of having to do some extra queries when displaying some fields related to the sound (e.g. for tags).
Using this
@@ -514,6 +520,7 @@ def bulk_query(self, where, order_by, limit, args, include_analyzers_output=Fals
                 accounts_profile.has_avatar as user_has_avatar,
                 %s
                 %s
+                %s
                 ARRAY(
                     SELECT tags_tag.name
                     FROM tags_tag
@@ -530,7 +537,8 @@ def bulk_query(self, where, order_by, limit, args, include_analyzers_output=Fals
             LEFT JOIN tickets_ticket ON tickets_ticket.sound_id = sound.id
             %s
             LEFT OUTER JOIN sounds_remixgroup_sounds ON sounds_remixgroup_sounds.sound_id = sound.id
-        WHERE %s """ % (self.get_analysis_state_essentia_exists_sql(),
+        WHERE %s """ % (self.get_search_engine_similarity_state_sql(),
+                        self.get_analysis_state_essentia_exists_sql(),
                         self.get_analyzers_data_select_sql() if include_analyzers_output else '',
                         ContentType.objects.get_for_model(Sound).id,
                         self.get_analyzers_data_left_join_sql() if include_analyzers_output else '',
@@ -1350,6 +1358,20 @@ def get_geotag_name(self):
             return f'{self.geotag_lat:.2f}, {self.geotag_lon:.3f}'
         else:
             return f'{self.geotag.lat:.2f}, {self.geotag.lon:.3f}'
+
+    @property
+    def ready_for_similarity(self):
+        # Returns True if the sound has been analyzed for similarity and should be available for similarity queries
+        if settings.USE_SEARCH_ENGINE_SIMILARITY:
+            if hasattr(self, 'search_engine_similarity_state'):
+                # If the attribute was precomputed in the query (because the Sound was retrieved using bulk_query), no need to perform extra queries
+                return self.search_engine_similarity_state
+            else:
+                # Otherwise, check if there is a SoundAnalysis object for this sound with the correct analyzer and status
+                return SoundAnalysis.objects.filter(sound_id=self.id, analyzer=settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER, analysis_status='OK').exists()
+        else:
+            # If not using search engine based similarity, then use the old similarity_state DB field
+            return self.similarity_state == "OK"
 
     class Meta:
         ordering = ("-created", )
@@ -1577,7 +1599,7 @@ def bulk_query_id(self, pack_ids, sound_ids_for_pack_id=dict(), exclude_deleted=
                 selected_sounds_data.append({
                     'id': s.id,
                     'username': p.user.username,  # Packs have same username as sounds inside pack
-                    'similarity_state': s.similarity_state,
+                    'ready_for_similarity': s.similarity_state == "OK" if not settings.USE_SEARCH_ENGINE_SIMILARITY else None,  # If using search engine similarity, this needs to be retrieved later (see below)
                     'duration': s.duration,
                     'preview_mp3': s.locations('preview.LQ.mp3.url'),
                     'preview_ogg': s.locations('preview.LQ.ogg.url'),
                     'spectral': s.locations('display.spectral_bw.L.url'),
                     'num_ratings': s.num_ratings,
                     'avg_rating': s.avg_rating
-                })
+                    })
             p.num_sounds_unpublished_precomputed = p.sounds.count() - p.num_sounds
             p.licenses_data_precomputed = ([lid for _, lid in licenses], [lname for lname, _ in licenses])
             p.pack_tags = [{'name': tag, 'count': count, 'browse_url': p.browse_pack_tag_url(tag)}
@@ -1596,6 +1618,16 @@ def bulk_query_id(self, pack_ids, sound_ids_for_pack_id=dict(), exclude_deleted=
             p.num_ratings_precomputed = len(ratings)
             p.avg_rating_precomputed = sum(ratings) / len(ratings) if len(ratings) else 0.0
 
+        if settings.USE_SEARCH_ENGINE_SIMILARITY:
+            # To save an individual query for each selected sound, we get the similarity state of all selected sounds per pack in one single extra query
+            selected_sounds_ids = []
+            for p in packs:
+                selected_sounds_ids += [s['id'] for s in p.selected_sounds_data]
+            sound_ids_ready_for_similarity = SoundAnalysis.objects.filter(sound_id__in=selected_sounds_ids,
analyzer=settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER, analysis_status="OK").values_list('sound_id', flat=True)
+            for p in packs:
+                for s in p.selected_sounds_data:
+                    s['ready_for_similarity'] = s['id'] in sound_ids_ready_for_similarity
+
         return packs
 
     def dict_ids(self, pack_ids, exclude_deleted=True):
diff --git a/sounds/templatetags/display_sound.py b/sounds/templatetags/display_sound.py
index 71cf40499..4d9cdbd70 100644
--- a/sounds/templatetags/display_sound.py
+++ b/sounds/templatetags/display_sound.py
@@ -200,7 +200,7 @@ def display_sound_no_sound_object(context, file_data, player_size, show_bookmark
             'spectral': sound.locations('display.spectral_bw.L.url'),
             'id': sound.id,  # Only used for sounds that do actually have a sound object so we can display bookmark/similarity buttons
             'username': sound.user.username,  # Only used for sounds that do actually have a sound object so we can display bookmark/similarity/remix buttons
-            'similarity_state': sound.similarity_state,  # Only used for sounds that do actually have a sound object so we can display bookmark/similarity/remix buttons
+            'ready_for_similarity': sound.ready_for_similarity,  # Only used for sounds that do actually have a sound object so we can display bookmark/similarity/remix buttons
             'remixgroup_id': sound.remixgroup_id,  # Only used for sounds that do actually have a sound object so we can display bookmark/similarity/remix buttons
             'num_ratings': sound.num_ratings,  # Used to display rating widget in players
             'avg_rating': sound.avg_rating,  # Used to display rating widget in players
@@ -210,7 +210,7 @@ def display_sound_no_sound_object(context, file_data, player_size, show_bookmark
         'sound': {
             'id': file_data.get('id', file_data['preview_mp3'].split('/')[-2]),  # If no id, use a unique fake ID to avoid caching problems
             'username': file_data.get('username', 'nousername'),
-            'similarity_state': file_data.get('similarity_state', 'FA'),
+            'ready_for_similarity': file_data.get('ready_for_similarity', False),
             'duration': file_data['duration'],
             'samplerate': file_data.get('samplerate', 44100),
             'num_ratings': file_data.get('num_ratings', 0),
@@ -236,7 +236,7 @@ def display_sound_no_sound_object(context, file_data, player_size, show_bookmark
         },
         'show_milliseconds': 'true' if ('big' in player_size) else 'false',
         'show_bookmark_button': show_bookmark and 'id' in file_data,
-        'show_similar_sounds_button': show_similar_sounds and 'similarity_state' in file_data,
+        'show_similar_sounds_button': show_similar_sounds and file_data.get('ready_for_similarity', False),
         'show_remix_group_button': show_remix and 'remixgroup_id' in file_data,
        'show_rate_widget': 'avg_rating' in file_data,
         'player_size': player_size,
diff --git a/sounds/tests/test_sound.py b/sounds/tests/test_sound.py
index 889b82ca5..5b6b411fa 100644
--- a/sounds/tests/test_sound.py
+++ b/sounds/tests/test_sound.py
@@ -793,6 +793,7 @@ def _test_similarity_update(self, cache_keys, expected, request_func, similarity
         self.assertEqual(self.sound.similarity_state, 'OK')
         self.assertContains(request_func(user) if user is not None else request_func(), expected)
 
+    @override_settings(USE_SEARCH_ENGINE_SIMILARITY=False)
     def test_similarity_update_display(self):
         self._test_similarity_update(
             self._get_sound_display_cache_keys(),
@@ -801,6 +802,7 @@
             user=self.user,
         )
 
+    @override_settings(USE_SEARCH_ENGINE_SIMILARITY=False)
     def test_similarity_update_view(self):
         self._test_similarity_update(
             self._get_sound_view_footer_top_cache_keys(),
diff --git
a/sounds/views.py b/sounds/views.py index 4e86e6cf0..8d3b871e8 100644 --- a/sounds/views.py +++ b/sounds/views.py @@ -65,6 +65,7 @@ from utils.nginxsendfile import sendfile, prepare_sendfile_arguments_for_sound_download from utils.pagination import paginate from utils.ratelimit import key_for_ratelimiting, rate_per_ip +from utils.search import get_search_engine, SearchEngineException from utils.search.search_sounds import get_random_sound_id_from_search_engine, perform_search_engine_query from utils.similarity_utilities import get_similar_sounds from utils.sound_upload import create_sound, NoAudioException, AlreadyExistsException, CantMoveException, \ @@ -820,8 +821,7 @@ def similar(request, username, sound_id): sound = get_object_or_404(Sound, id=sound_id, moderation_state="OK", - processing_state="OK", - similarity_state="OK") + processing_state="OK") if sound.user.username.lower() != username.lower(): raise Http404 @@ -831,10 +831,13 @@ def similar(request, username, sound_id): sound, request.GET.get('preset', None), settings.NUM_SIMILAR_SOUNDS_PER_PAGE * settings.NUM_SIMILAR_SOUNDS_PAGES) else: # Get similar sounds from solr - from utils.search import get_search_engine - results = get_search_engine().search_sounds(similar_to=sound.id) - similarity_results = [(result['id'], result['score']) for result in results.docs] - + try: + results = get_search_engine().search_sounds(similar_to=sound.id) + similarity_results = [(result['id'], result['score']) for result in results.docs] + except SearchEngineException: + # Search engine not available, return empty list + similarity_results = [] + paginator = paginate(request, [sound_id for sound_id, _ in similarity_results], settings.NUM_SIMILAR_SOUNDS_PER_PAGE) similar_sounds = Sound.objects.ordered_ids(paginator['page'].object_list) tvars = {'similar_sounds': similar_sounds, 'sound': sound} diff --git a/templates/sounds/player.html b/templates/sounds/player.html index f658834ef..3d6b463eb 100644 --- a/templates/sounds/player.html +++ b/templates/sounds/player.html @@ -5,7 +5,7 @@ data-bookmark="{% if show_bookmark_button %}true{% else %}false{% endif %}" data-bookmark-modal-url="{% if show_bookmark_button %}{% url 'bookmarks-add-form-for-sound' sound.id %}{% endif %}" data-add-bookmark-url="{% if show_bookmark_button %}{% url 'add-bookmark' sound.id %}{% endif %}" - data-similar-sounds="{% if show_similar_sounds_button and sound.similarity_state == 'OK' %}true{% else %}false{% endif %}" + data-similar-sounds="{% if show_similar_sounds_button and sound.ready_for_similarity %}true{% else %}false{% endif %}" data-similar-sounds-modal-url="{% if show_similar_sounds_button %}{% url 'sound-similar' sound.username sound.id %}?ajax=1{% endif %}" data-remix-group="{% if show_remix_group_button and sound.remixgroup_id %}true{% else %}false{% endif %}" data-remix-group-modal-url="{% if show_remix_group_button %}{% url 'sound-remixes' sound.username sound.id %}?ajax=1{% endif %}" diff --git a/templates/sounds/sound.html b/templates/sounds/sound.html index 5e222496a..7193e9529 100644 --- a/templates/sounds/sound.html +++ b/templates/sounds/sound.html @@ -65,7 +65,7 @@

 (hunk body lost in extraction; recoverable fragments: one removed line ending in "{% endif %}" and one added line inside a "{% if not sound %} ... {% endif %}" block, presumably switching the template from sound.similarity_state to sound.ready_for_similarity)
diff --git a/templates/search/search.html b/templates/search/search.html index 5a061ce3a..6ddd8adac 100644 --- a/templates/search/search.html +++ b/templates/search/search.html @@ -372,16 +372,22 @@

     ... No results... 😟
     ... Loading map...
-        {% if paginator.count > max_search_results_map_mode %}
-            {% bw_icon 'notification' %} Note that only the first {{ max_search_results_map_mode|bw_intcomma }} search results are shown on the map
+        {% if paginator.count < max_search_results_map_mode %}
+            {% bw_icon 'notification' %} Note that only the first {{ max_search_results_map_mode|bw_intcomma }} search results are shown on the map
+        {% endif %}
     {% endif %}
-    {% endif %}
 {% endif %}
 (surrounding <div> markup in this hunk was lost in extraction)
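Taken together, the series indexes similarity vectors as child documents, answers `similar_to` queries with Solr knn, and gates the UI on `ready_for_similarity`. A minimal sketch of the resulting end-to-end call, reusing only names introduced in these patches (the wrapper function itself is illustrative, not part of the patch):

    # Illustrative sketch of the search-engine similarity flow added by this series.
    from utils.search import get_search_engine, SearchEngineException

    def similar_sound_ids(sound, max_results=15):
        """Return [(sound_id, score), ...] for sounds similar to `sound`, or [] on failure."""
        if not sound.ready_for_similarity:
            # No similarity vector indexed for this sound (yet)
            return []
        try:
            results = get_search_engine().search_sounds(similar_to=sound.id, num_sounds=max_results)
        except SearchEngineException:
            # Mirror the sounds.views.similar fallback when the engine is unavailable
            return []
        return [(result['id'], result['score']) for result in results.docs]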