From d235fd2b9eb0126fc1a37304a6865e480e66a7e0 Mon Sep 17 00:00:00 2001
From: ffont
Date: Tue, 23 Jan 2024 12:30:04 +0100
Subject: [PATCH 01/28] Add support for search engine document updates

This means that now documents can be partially updated instead of always
being completely replaced. This feature is not yet used anywhere, but it
will be useful when adding similarity data to the search engine.

https://github.com/MTG/freesound/issues/1714
---
 utils/search/__init__.py                      |   7 +-
 utils/search/backends/solr555pysolr.py        | 519 ++++++++----------
 utils/search/backends/solr9pysolr.py          |   4 +-
 .../backends/tests/test_solr555pysolr.py      |   4 +-
 .../search/backends/tests/test_solr_common.py |   5 +-
 utils/search/search_sounds.py                 |  10 +-
 6 files changed, 264 insertions(+), 285 deletions(-)

diff --git a/utils/search/__init__.py b/utils/search/__init__.py
index 44873af81..50fbe5518 100644
--- a/utils/search/__init__.py
+++ b/utils/search/__init__.py
@@ -184,11 +184,16 @@ class SearchEngineBase:
 
     # Sound search related methods
 
-    def add_sounds_to_index(self, sound_objects):
+    def add_sounds_to_index(self, sound_objects, fields_to_include=[], update_mode=False):
         """Indexes the provided sound objects in the search index
 
         Args:
             sound_objects (list[sounds.models.Sound]): Sound objects of the sounds to index
+            fields_to_include (list[str]): Specific sound fields that will be included in the document to
+                be indexed. If empty, all available sound fields will be included.
+            update_mode (bool): Whether to perform an update of the existing documents in the index or to
+                completely replace them. An update is useful so that fields not included in the document are
+                not removed from the index.
         """
         raise NotImplementedError
 
diff --git a/utils/search/backends/solr555pysolr.py b/utils/search/backends/solr555pysolr.py
index e23493b9f..1dd258103 100644
--- a/utils/search/backends/solr555pysolr.py
+++ b/utils/search/backends/solr555pysolr.py
@@ -93,227 +93,6 @@
 }
 
 
-def convert_sound_to_search_engine_document(sound):
-    """
-    TODO: Document that this includes remove_control_chars due to originally sending XML. not strictly necessary when submitting
-    to json (and also, freesound model code fixes this), but keep it in to ensure that docs are clean.
-    TODO: Assert that sound object is correct?
- """ - document = {} - - # Basic sound fields - keep_fields = ['username', 'created', 'is_explicit', 'is_remix', 'num_ratings', 'channels', 'md5', - 'was_remixed', 'original_filename', 'duration', 'id', 'num_downloads', 'filesize'] - for key in keep_fields: - document[key] = getattr(sound, key) - if sound.type == '': - document["type"] = "wav" - else: - document["type"] = sound.type - document["original_filename"] = remove_control_chars(getattr(sound, "original_filename")) - document["description"] = remove_control_chars(getattr(sound, "description")) - document["tag"] = list(set([t.lower() for t in getattr(sound, "tag_array")])) - document["license"] = getattr(sound, "license_name") - if document["num_ratings"] >= settings.MIN_NUMBER_RATINGS: - document["avg_rating"] = getattr(sound, "avg_rating") - else: - document["avg_rating"] = 0 - - if getattr(sound, "pack_id"): - document["pack"] = remove_control_chars(getattr(sound, "pack_name")) - document["grouping_pack"] = str(getattr(sound, "pack_id")) + "_" + remove_control_chars( - getattr(sound, "pack_name")) - else: - document["grouping_pack"] = str(getattr(sound, "id")) - - document["is_geotagged"] = False - if getattr(sound, "geotag_id"): - document["is_geotagged"] = True - if not math.isnan(getattr(sound, "geotag_lon")) and not math.isnan(getattr(sound, "geotag_lat")): - document["geotag"] = str(getattr(sound, "geotag_lon")) + " " + str(getattr(sound, "geotag_lat")) - - document["in_remix_group"] = getattr(sound, "was_remixed") or getattr(sound, "is_remix") - - document["bitdepth"] = getattr(sound, "bitdepth") if getattr(sound, "bitdepth") else 0 - document["bitrate"] = getattr(sound, "bitrate") if getattr(sound, "bitrate") else 0 - document["samplerate"] = int(getattr(sound, "samplerate")) if getattr(sound, "samplerate") else 0 - - document["comment"] = [remove_control_chars(comment_text) for comment_text in getattr(sound, "comments_array")] - document["comments"] = getattr(sound, "num_comments") - locations = sound.locations() - document["waveform_path_m"] = locations["display"]["wave"]["M"]["path"] - document["waveform_path_l"] = locations["display"]["wave"]["L"]["path"] - document["spectral_path_m"] = locations["display"]["spectral"]["M"]["path"] - document["spectral_path_l"] = locations["display"]["spectral"]["L"]["path"] - document["preview_path"] = locations["preview"]["LQ"]["mp3"]["path"] - - # Analyzer's output - for analyzer_name, analyzer_info in settings.ANALYZERS_CONFIGURATION.items(): - if 'descriptors_map' in analyzer_info: - query_select_name = analyzer_name.replace('-', '_') - analysis_data = getattr(sound, query_select_name, None) - if analysis_data is not None: - # If analysis is present, index all existing analysis fields using SOLR dynamic fields depending on - # the value type (see SOLR_DYNAMIC_FIELDS_SUFFIX_MAP) so solr knows how to treat when filtering, etc. 
- for key, value in json.loads(analysis_data).items(): - if isinstance(value, list): - # Make sure that the list is formed by strings - value = [f'{item}' for item in value] - suffix = SOLR_DYNAMIC_FIELDS_SUFFIX_MAP.get(type(value), None) - if suffix: - document[f'{key}{suffix}'] = value - return document - - -def convert_post_to_search_engine_document(post): - body = remove_control_chars(post.body) - if not body: - return None - - document = { - "id": post.id, - "thread_id": post.thread.id, - "thread_title": remove_control_chars(post.thread.title), - "thread_author": post.thread.author.username, - "thread_created": post.thread.created, - - "forum_name": post.thread.forum.name, - "forum_name_slug": post.thread.forum.name_slug, - - "post_author": post.author.username, - "post_created": post.created, - "post_body": body, - - "num_posts": post.thread.num_posts, - "has_posts": False if post.thread.num_posts == 0 else True - } - return document - - -def add_solr_suffix_to_dynamic_fieldname(fieldname): - """Add the corresponding SOLR dynamic field suffix to the given fieldname. If the fieldname does not correspond - to a dynamic field, leave it unchanged. See docstring in 'add_solr_suffix_to_dynamic_fieldnames_in_filter' for - more information""" - dynamic_fields_map = {} - for analyzer, analyzer_data in settings.ANALYZERS_CONFIGURATION.items(): - if 'descriptors_map' in analyzer_data: - descriptors_map = settings.ANALYZERS_CONFIGURATION[analyzer]['descriptors_map'] - for _, db_descriptor_key, descriptor_type in descriptors_map: - if descriptor_type is not None: - dynamic_fields_map[db_descriptor_key] = '{}{}'.format( - db_descriptor_key, SOLR_DYNAMIC_FIELDS_SUFFIX_MAP[descriptor_type]) - return dynamic_fields_map.get(fieldname, fieldname) - - - -def add_solr_suffix_to_dynamic_fieldnames_in_filter(query_filter): - """Processes a filter string containing field names and replaces the occurrences of fieldnames that match with - descriptor names from the descriptors_map of different configured analyzers with updated fieldnames with - the required SOLR dynamic field suffix. This is needed because fields from analyzers are indexed as dynamic - fields which need to end with a specific suffi that SOLR uses to learn about the type of the field and how it - should treat it. - """ - for analyzer, analyzer_data in settings.ANALYZERS_CONFIGURATION.items(): - if 'descriptors_map' in analyzer_data: - descriptors_map = settings.ANALYZERS_CONFIGURATION[analyzer]['descriptors_map'] - for _, db_descriptor_key, descriptor_type in descriptors_map: - if descriptor_type is not None: - query_filter = query_filter.replace( - f'{db_descriptor_key}:','{}{}:'.format( - db_descriptor_key, SOLR_DYNAMIC_FIELDS_SUFFIX_MAP[descriptor_type])) - return query_filter - - -def search_process_sort(sort, forum=False): - """Translates sorting criteria to solr sort criteria and add extra criteria if sorting by ratings. - - If order by rating, when rating is the same sort also by number of ratings. - - Args: - sort (str): sorting criteria as defined in settings.SEARCH_SOUNDS_SORT_OPTIONS_WEB. - forum (bool, optional): use the forum sort options map instead of the standard sort map - - Returns: - List[str]: list containing the sorting field names list for the search engine. 
- """ - search_map = SORT_OPTIONS_MAP_FORUM if forum else SORT_OPTIONS_MAP - if sort in [sort_web_name for sort_web_name, _ in search_map.items()]: - if search_map[sort] == "avg_rating desc" or search_map[sort] == "avg_rating asc": - sort = [search_map[sort], "num_ratings desc"] - else: - sort = [search_map[sort]] - else: - sort = [search_map[settings.SEARCH_FORUM_SORT_DEFAULT if forum else settings.SEARCH_SOUNDS_SORT_DEFAULT]] - return sort - - -def search_filter_make_intersection(query_filter): - # In solr 4, fq="a:1 b:2" will take the AND of these two filters, but in solr 5+, this will use OR - # fq=a:1&fq=b:2 can be used to take an AND, however we don't support this syntax - # The AND behaviour can be approximated by using fq="+a:1 +b:2", therefore we add a + to the beginning of each - # filter item to force AND. Because we use Dismax query parser, if we have a filter like fq="a:1 OR b:2" which will - # be converted to fq="+a:1 OR +b:2" by this function, this will still correctly use the OR operator (this would not - # be the case with standard lucene query parser). - # NOTE: for the filter names we match "a-zA-Z_" instead of using \w as using \w would cause problems for filters - # which have date ranges inside. - # NOTE: in the future filter handling should be refactored and we should use a proper filter parser - # that allows us to define our own filter syntax and then represent filters as some intermediate structure that can later - # be converted to valid lucene/dismax syntax. - query_filter = re.sub(r'\b([a-zA-Z_]+:)', r'+\1', query_filter) - query_filter = re.sub(r"(\+)\1+", r"\1", query_filter) # This is to avoid having multiple + in a row if user already has added them - if len(query_filter) > 0 and query_filter[-1] == '+': - query_filter = query_filter[:-1] - return query_filter - - -def search_process_filter(query_filter, only_sounds_within_ids=False, only_sounds_with_pack=False): - """Process the filter to make a number of adjustments - - 1) Add type suffix to human-readable audio analyzer descriptor names (needed for dynamic solr field names). - 2) If only sounds with pack should be returned, add such a filter. - 3) Add filter for sound IDs if only_sounds_within_ids is passed. - 4) Rewrite geotag bounding box queries to use solr 5+ syntax - - Step 1) is used for the dynamic field names used in Solr (e.g. ac_tonality -> ac_tonality_s, ac_tempo -> - ac_tempo_i). The dynamic field names we define in Solr schema are '*_b' (for bool), '*_d' (for float), - '*_i' (for integer) and '*_s' (for string). At indexing time, we append these suffixes to the analyzer - descriptor names that need to be indexed so Solr can treat the types properly. Now we automatically append the - suffices to the filter names so users do not need to deal with that and Solr understands recognizes the field name. - - Args: - query_filter (str): query filter string. - only_sounds_with_pack (bool, optional): whether to only include sounds that belong to a pack - only_sounds_within_ids (List[int], optional): restrict search results to sounds with these IDs - - Returns: - str: processed filter query string. 
- """ - # Add type suffix to human-readable audio analyzer descriptor names which is needed for solr dynamic fields - query_filter = add_solr_suffix_to_dynamic_fieldnames_in_filter(query_filter) - - # If we only want sounds with packs and there is no pack filter, add one - if only_sounds_with_pack and not 'pack:' in query_filter: - query_filter += ' pack:*' - - if 'geotag:"Intersects(' in query_filter: - # Replace geotag:"Intersects( )" - # with geotag:[", " TO " "] - query_filter = re.sub(r'geotag:"Intersects\((.+?) (.+?) (.+?) (.+?)\)"', r'geotag:["\2,\1" TO "\4,\3"]', query_filter) - - query_filter = search_filter_make_intersection(query_filter) - - # When calculating results form clustering, the "only_sounds_within_ids" argument is passed and we filter - # our query to the sounds in that list of IDs. - if only_sounds_within_ids: - sounds_within_ids_filter = ' OR '.join([f'id:{sound_id}' for sound_id in only_sounds_within_ids]) - if query_filter: - query_filter += f' AND ({sounds_within_ids_filter})' - else: - query_filter = f'({sounds_within_ids_filter})' - - return query_filter - - class FreesoundSoundJsonEncoder(json.JSONEncoder): def default(self, value): if isinstance(value, datetime): @@ -358,10 +137,247 @@ def get_forum_index(self): always_commit=True ) return self.forum_index + + # Util functions + def transform_document_into_update_document(self, document): + """ + In order to update a document in SOLR, we need to send a document with the same ID of the document we want to update and the + list of fields with the values we want to set wrapped in a {'set': value} dictionary. This function transforms a normal solr + document with {key:value} pairs into a document that will update all the fields. This is useful when we only want to update some + fields but not remove those not updated. Using this method we can update similarity-related sound fields and the rest of the + fields independently. + """ + new_document = {'id': document['id']} + new_document.update({key: {'set': value} for key, value in document.items() if key != 'id'}) + return new_document + + def convert_sound_to_search_engine_document(self, sound, fields_to_include=[]): + """ + TODO: Document that this includes remove_control_chars due to originally sending XML. not strictly necessary when submitting + to json (and also, freesound model code fixes this), but keep it in to ensure that docs are clean. + TODO: Assert that sound object is correct? 
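+
+        For orientation, a purely illustrative fragment of the kind of document this returns (hypothetical
+        values, not a real sound): {'id': 123456, 'type': 'wav', 'duration': 2.5, 'tag': ['drum', 'kick']}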
+ """ + # Document ID (same as sound ID) + document = {'id': sound.id} + + # Basic sound fields + keep_fields = ['username', 'created', 'is_explicit', 'is_remix', 'num_ratings', 'channels', 'md5', + 'was_remixed', 'original_filename', 'duration', 'num_downloads', 'filesize'] + for key in keep_fields: + document[key] = getattr(sound, key) + if sound.type == '': + document["type"] = "wav" + else: + document["type"] = sound.type + document["original_filename"] = remove_control_chars(getattr(sound, "original_filename")) + document["description"] = remove_control_chars(getattr(sound, "description")) + document["tag"] = list(set([t.lower() for t in getattr(sound, "tag_array")])) + document["license"] = getattr(sound, "license_name") + + if document["num_ratings"] >= settings.MIN_NUMBER_RATINGS: + document["avg_rating"] = getattr(sound, "avg_rating") + else: + document["avg_rating"] = 0 + + if getattr(sound, "pack_id"): + document["pack"] = remove_control_chars(getattr(sound, "pack_name")) + document["grouping_pack"] = str(getattr(sound, "pack_id")) + "_" + remove_control_chars( + getattr(sound, "pack_name")) + else: + document["grouping_pack"] = str(getattr(sound, "id")) + + document["is_geotagged"] = False + if getattr(sound, "geotag_id"): + document["is_geotagged"] = True + if not math.isnan(getattr(sound, "geotag_lon")) and not math.isnan(getattr(sound, "geotag_lat")): + document["geotag"] = str(getattr(sound, "geotag_lon")) + " " + str(getattr(sound, "geotag_lat")) + + document["in_remix_group"] = getattr(sound, "was_remixed") or getattr(sound, "is_remix") + + document["bitdepth"] = getattr(sound, "bitdepth") if getattr(sound, "bitdepth") else 0 + document["bitrate"] = getattr(sound, "bitrate") if getattr(sound, "bitrate") else 0 + document["samplerate"] = int(getattr(sound, "samplerate")) if getattr(sound, "samplerate") else 0 + + document["comment"] = [remove_control_chars(comment_text) for comment_text in getattr(sound, "comments_array")] + document["comments"] = getattr(sound, "num_comments") + + locations = sound.locations() + document["waveform_path_m"] = locations["display"]["wave"]["M"]["path"] + document["waveform_path_l"] = locations["display"]["wave"]["L"]["path"] + document["spectral_path_m"] = locations["display"]["spectral"]["M"]["path"] + document["spectral_path_l"] = locations["display"]["spectral"]["L"]["path"] + document["preview_path"] = locations["preview"]["LQ"]["mp3"]["path"] + + # Analyzer's output + for analyzer_name, analyzer_info in settings.ANALYZERS_CONFIGURATION.items(): + if 'descriptors_map' in analyzer_info: + query_select_name = analyzer_name.replace('-', '_') + analysis_data = getattr(sound, query_select_name, None) + if analysis_data is not None: + # If analysis is present, index all existing analysis fields using SOLR dynamic fields depending on + # the value type (see SOLR_DYNAMIC_FIELDS_SUFFIX_MAP) so solr knows how to treat when filtering, etc. + for key, value in json.loads(analysis_data).items(): + if isinstance(value, list): + # Make sure that the list is formed by strings + value = [f'{item}' for item in value] + suffix = SOLR_DYNAMIC_FIELDS_SUFFIX_MAP.get(type(value), None) + if suffix: + document[f'{key}{suffix}'] = value + + # Remove fields that should not be included + # Note that we could optimize this by never getting the data for these fields in the first place, but because + # the data is already retrieved in the queryset, that optimization would be negligible so we keep it simple. 
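+        # For example (illustrative): with fields_to_include=['id', 'duration'] only those keys survive,
+        # while with fields_to_include=[] the "or not fields_to_include" clause below keeps every field.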
+        document = {k: v for k, v in document.items() if k in fields_to_include or not fields_to_include}
+
+        return document
+
+    def convert_post_to_search_engine_document(self, post):
+        body = remove_control_chars(post.body)
+        if not body:
+            return None
+
+        document = {
+            "id": post.id,
+            "thread_id": post.thread.id,
+            "thread_title": remove_control_chars(post.thread.title),
+            "thread_author": post.thread.author.username,
+            "thread_created": post.thread.created,
+
+            "forum_name": post.thread.forum.name,
+            "forum_name_slug": post.thread.forum.name_slug,
+
+            "post_author": post.author.username,
+            "post_created": post.created,
+            "post_body": body,
+
+            "num_posts": post.thread.num_posts,
+            "has_posts": False if post.thread.num_posts == 0 else True
+        }
+        return document
+
+    def add_solr_suffix_to_dynamic_fieldname(self, fieldname):
+        """Add the corresponding SOLR dynamic field suffix to the given fieldname. If the fieldname does not correspond
+        to a dynamic field, leave it unchanged. See docstring in 'add_solr_suffix_to_dynamic_fieldnames_in_filter' for
+        more information"""
+        dynamic_fields_map = {}
+        for analyzer, analyzer_data in settings.ANALYZERS_CONFIGURATION.items():
+            if 'descriptors_map' in analyzer_data:
+                descriptors_map = settings.ANALYZERS_CONFIGURATION[analyzer]['descriptors_map']
+                for _, db_descriptor_key, descriptor_type in descriptors_map:
+                    if descriptor_type is not None:
+                        dynamic_fields_map[db_descriptor_key] = '{}{}'.format(
+                            db_descriptor_key, SOLR_DYNAMIC_FIELDS_SUFFIX_MAP[descriptor_type])
+        return dynamic_fields_map.get(fieldname, fieldname)
+
+    def add_solr_suffix_to_dynamic_fieldnames_in_filter(self, query_filter):
+        """Processes a filter string containing field names and replaces the occurrences of fieldnames that match
+        descriptor names from the descriptors_map of the different configured analyzers with updated fieldnames with
+        the required SOLR dynamic field suffix. This is needed because fields from analyzers are indexed as dynamic
+        fields which need to end with a specific suffix that SOLR uses to learn about the type of the field and how it
+        should treat it.
+        """
+        for analyzer, analyzer_data in settings.ANALYZERS_CONFIGURATION.items():
+            if 'descriptors_map' in analyzer_data:
+                descriptors_map = settings.ANALYZERS_CONFIGURATION[analyzer]['descriptors_map']
+                for _, db_descriptor_key, descriptor_type in descriptors_map:
+                    if descriptor_type is not None:
+                        query_filter = query_filter.replace(
+                            f'{db_descriptor_key}:', '{}{}:'.format(
+                                db_descriptor_key, SOLR_DYNAMIC_FIELDS_SUFFIX_MAP[descriptor_type]))
+        return query_filter
+
+    def search_process_sort(self, sort, forum=False):
+        """Translates sorting criteria to solr sort criteria and adds extra criteria if sorting by ratings.
+
+        When sorting by rating, ties are broken by the number of ratings.
+
+        Args:
+            sort (str): sorting criteria as defined in settings.SEARCH_SOUNDS_SORT_OPTIONS_WEB.
+            forum (bool, optional): use the forum sort options map instead of the standard sort map
+
+        Returns:
+            List[str]: list containing the sorting field names for the search engine.
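+
+        Example (hypothetical mapping): if the web sort option "rating desc" were mapped to
+        "avg_rating desc", this would return ["avg_rating desc", "num_ratings desc"], so that
+        ties in rating are broken by the number of ratings.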
+        """
+        search_map = SORT_OPTIONS_MAP_FORUM if forum else SORT_OPTIONS_MAP
+        if sort in [sort_web_name for sort_web_name, _ in search_map.items()]:
+            if search_map[sort] == "avg_rating desc" or search_map[sort] == "avg_rating asc":
+                sort = [search_map[sort], "num_ratings desc"]
+            else:
+                sort = [search_map[sort]]
+        else:
+            sort = [search_map[settings.SEARCH_FORUM_SORT_DEFAULT if forum else settings.SEARCH_SOUNDS_SORT_DEFAULT]]
+        return sort
+
+    def search_filter_make_intersection(self, query_filter):
+        # In solr 4, fq="a:1 b:2" will take the AND of these two filters, but in solr 5+, this will use OR
+        # fq=a:1&fq=b:2 can be used to take an AND, however we don't support this syntax
+        # The AND behaviour can be approximated by using fq="+a:1 +b:2", therefore we add a + to the beginning of each
+        # filter item to force AND. Because we use the Dismax query parser, if we have a filter like fq="a:1 OR b:2" which will
+        # be converted to fq="+a:1 OR +b:2" by this function, this will still correctly use the OR operator (this would not
+        # be the case with the standard lucene query parser).
+        # NOTE: for the filter names we match "a-zA-Z_" instead of using \w as using \w would cause problems for filters
+        # which have date ranges inside.
+        # NOTE: in the future filter handling should be refactored and we should use a proper filter parser
+        # that allows us to define our own filter syntax and then represent filters as some intermediate structure that can later
+        # be converted to valid lucene/dismax syntax.
+        query_filter = re.sub(r'\b([a-zA-Z_]+:)', r'+\1', query_filter)
+        query_filter = re.sub(r"(\+)\1+", r"\1", query_filter)  # Avoid multiple + in a row if the user already added them
+        if len(query_filter) > 0 and query_filter[-1] == '+':
+            query_filter = query_filter[:-1]
+        return query_filter
+
+    def search_process_filter(self, query_filter, only_sounds_within_ids=False, only_sounds_with_pack=False):
+        """Process the filter to make a number of adjustments
+
+        1) Add type suffix to human-readable audio analyzer descriptor names (needed for dynamic solr field names).
+        2) If only sounds with pack should be returned, add such a filter.
+        3) Add filter for sound IDs if only_sounds_within_ids is passed.
+        4) Rewrite geotag bounding box queries to use solr 5+ syntax.
+
+        Step 1) is used for the dynamic field names used in Solr (e.g. ac_tonality -> ac_tonality_s, ac_tempo ->
+        ac_tempo_i). The dynamic field names we define in the Solr schema are '*_b' (for bool), '*_d' (for float),
+        '*_i' (for integer) and '*_s' (for string). At indexing time, we append these suffixes to the analyzer
+        descriptor names that need to be indexed so Solr can treat the types properly. Here we automatically append the
+        suffixes to the filter names so users do not need to deal with that and Solr recognizes the field names.
+
+        Args:
+            query_filter (str): query filter string.
+            only_sounds_with_pack (bool, optional): whether to only include sounds that belong to a pack
+            only_sounds_within_ids (List[int], optional): restrict search results to sounds with these IDs
+
+        Returns:
+            str: processed filter query string.
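+
+        Example (illustrative): the filter 'ac_tonality:"C minor" ac_tempo:[90 TO 120]' becomes
+        '+ac_tonality_s:"C minor" +ac_tempo_i:[90 TO 120]' after suffixing and intersection handling.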
+        """
+        # Add type suffix to human-readable audio analyzer descriptor names which is needed for solr dynamic fields
+        query_filter = self.add_solr_suffix_to_dynamic_fieldnames_in_filter(query_filter)
+
+        # If we only want sounds with packs and there is no pack filter, add one
+        if only_sounds_with_pack and not 'pack:' in query_filter:
+            query_filter += ' pack:*'
+
+        if 'geotag:"Intersects(' in query_filter:
+            # Replace geotag:"Intersects(<min_lon> <min_lat> <max_lon> <max_lat>)"
+            # with geotag:["<min_lat>,<min_lon>" TO "<max_lat>,<max_lon>"]
+            query_filter = re.sub(r'geotag:"Intersects\((.+?) (.+?) (.+?) (.+?)\)"', r'geotag:["\2,\1" TO "\4,\3"]', query_filter)
+
+        query_filter = self.search_filter_make_intersection(query_filter)
+
+        # When calculating results from clustering, the "only_sounds_within_ids" argument is passed and we filter
+        # our query to the sounds in that list of IDs.
+        if only_sounds_within_ids:
+            sounds_within_ids_filter = ' OR '.join(['id:{}'.format(sound_id) for sound_id in only_sounds_within_ids])
+            if query_filter:
+                query_filter += ' AND ({})'.format(sounds_within_ids_filter)
+            else:
+                query_filter = '({})'.format(sounds_within_ids_filter)
+
+        return query_filter
 
     # Sound methods
-    def add_sounds_to_index(self, sound_objects):
-        documents = [convert_sound_to_search_engine_document(s) for s in sound_objects]
+    def add_sounds_to_index(self, sound_objects, update_mode=False, fields_to_include=[]):
+        documents = [self.convert_sound_to_search_engine_document(s, fields_to_include=fields_to_include) for s in sound_objects]
+        if update_mode:
+            documents = [self.transform_document_into_update_document(d) for d in documents]
         try:
             self.get_sounds_index().add(documents)
         except pysolr.SolrError as e:
@@ -394,53 +410,6 @@ def sound_exists_in_index(self, sound_object_or_id):
         response = self.search_sounds(query_filter=f'id:{sound_id}', offset=0, num_sounds=1)
         return response.num_found > 0
 
-    def search_process_filter(self, query_filter, only_sounds_within_ids=False, only_sounds_with_pack=False):
-        """Process the filter to make a number of adjustments
-
-        1) Add type suffix to human-readable audio analyzer descriptor names (needed for dynamic solr field names).
-        2) If only sounds with pack should be returned, add such a filter.
-        3) Add filter for sound IDs if only_sounds_within_ids is passed.
-        4) Rewrite geotag bounding box queries to use solr 5+ syntax
-
-        Step 1) is used for the dynamic field names used in Solr (e.g. ac_tonality -> ac_tonality_s, ac_tempo ->
-        ac_tempo_i). The dynamic field names we define in Solr schema are '*_b' (for bool), '*_d' (for float),
-        '*_i' (for integer) and '*_s' (for string). At indexing time, we append these suffixes to the analyzer
-        descriptor names that need to be indexed so Solr can treat the types properly. Now we automatically append the
-        suffices to the filter names so users do not need to deal with that and Solr understands recognizes the field name.
-
-        Args:
-            query_filter (str): query filter string.
-            only_sounds_with_pack (bool, optional): whether to only include sounds that belong to a pack
-            only_sounds_within_ids (List[int], optional): restrict search results to sounds with these IDs
-
-        Returns:
-            str: processed filter query string.
- """ - # Add type suffix to human-readable audio analyzer descriptor names which is needed for solr dynamic fields - query_filter = add_solr_suffix_to_dynamic_fieldnames_in_filter(query_filter) - - # If we only want sounds with packs and there is no pack filter, add one - if only_sounds_with_pack and not 'pack:' in query_filter: - query_filter += ' pack:*' - - if 'geotag:"Intersects(' in query_filter: - # Replace geotag:"Intersects( )" - # with geotag:[", " TO " "] - query_filter = re.sub('geotag:"Intersects\((.+?) (.+?) (.+?) (.+?)\)"', r'geotag:["\2,\1" TO "\4,\3"]', query_filter) - - query_filter = search_filter_make_intersection(query_filter) - - # When calculating results form clustering, the "only_sounds_within_ids" argument is passed and we filter - # our query to the sounds in that list of IDs. - if only_sounds_within_ids: - sounds_within_ids_filter = ' OR '.join(['id:{}'.format(sound_id) for sound_id in only_sounds_within_ids]) - if query_filter: - query_filter += ' AND ({})'.format(sounds_within_ids_filter) - else: - query_filter = '({})'.format(sounds_within_ids_filter) - - return query_filter - def search_sounds(self, textual_query='', query_fields=None, query_filter='', offset=0, current_page=None, num_sounds=settings.SOUNDS_PER_PAGE, sort=settings.SEARCH_SOUNDS_SORT_OPTION_AUTOMATIC, group_by_pack=False, num_sounds_per_pack_group=1, facets=None, only_sounds_with_pack=False, @@ -454,10 +423,10 @@ def search_sounds(self, textual_query='', query_fields=None, query_filter='', of # If no fields provided, use the default query_fields = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS if isinstance(query_fields, list): - query_fields = [add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)) for field in query_fields] + query_fields = [self.add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)) for field in query_fields] elif isinstance(query_fields, dict): # Also remove fields with weight <= 0 - query_fields = [(add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)), weight) + query_fields = [(self.add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)), weight) for field, weight in query_fields.items() if weight > 0] # Set main query options @@ -475,7 +444,7 @@ def search_sounds(self, textual_query='', query_fields=None, query_filter='', of rows=num_sounds, field_list=["id", "score"], # We only want the sound IDs of the results as we load data from DB filter_query=query_filter, - sort=search_process_sort(sort)) + sort=self.search_process_sort(sort)) # Configure facets if facets is not None: @@ -544,7 +513,7 @@ def get_random_sound_id(self): # Forum posts methods def add_forum_posts_to_index(self, forum_post_objects): - documents = [convert_post_to_search_engine_document(p) for p in forum_post_objects] + documents = [self.convert_post_to_search_engine_document(p) for p in forum_post_objects] documents = [d for d in documents if d is not None] try: self.get_forum_index().add(documents) @@ -609,7 +578,7 @@ def search_forum_posts(self, textual_query='', query_filter='', sort=settings.SE "post_created", "num_posts"], filter_query=query_filter, - sort=search_process_sort(sort, forum=True)) + sort=self.search_process_sort(sort, forum=True)) if group_by_thread: query.set_group_field("thread_title_grouped") diff --git a/utils/search/backends/solr9pysolr.py b/utils/search/backends/solr9pysolr.py index 396b6aa79..8827d8545 100644 --- a/utils/search/backends/solr9pysolr.py +++ b/utils/search/backends/solr9pysolr.py @@ -84,7 +84,7 
@@ def search_process_filter(self, query_filter, only_sounds_within_ids=False, only str: processed filter query string. """ # Add type suffix to human-readable audio analyzer descriptor names which is needed for solr dynamic fields - query_filter = solr555pysolr.add_solr_suffix_to_dynamic_fieldnames_in_filter(query_filter) + query_filter = self.add_solr_suffix_to_dynamic_fieldnames_in_filter(query_filter) # When filtering by the created field, use the `created_range` DateRangeType field instead # which include the ability to filter on exact values and ranges of values. @@ -100,7 +100,7 @@ def search_process_filter(self, query_filter, only_sounds_within_ids=False, only # with geotag:[", " TO " "] query_filter = re.sub('geotag:"Intersects\((.+?) (.+?) (.+?) (.+?)\)"', r'geotag:["\2,\1" TO "\4,\3"]', query_filter) - query_filter = solr555pysolr.search_filter_make_intersection(query_filter) + query_filter = self.search_filter_make_intersection(query_filter) # When calculating results form clustering, the "only_sounds_within_ids" argument is passed and we filter # our query to the sounds in that list of IDs. diff --git a/utils/search/backends/tests/test_solr555pysolr.py b/utils/search/backends/tests/test_solr555pysolr.py index 63a7a5f11..eab0e31ca 100644 --- a/utils/search/backends/tests/test_solr555pysolr.py +++ b/utils/search/backends/tests/test_solr555pysolr.py @@ -6,9 +6,9 @@ class Solr555PySolrTest(TestCase): def test_search_filter_make_intersection(self): filter_query = "username:alastairp" - updated = solr555pysolr.search_filter_make_intersection(filter_query) + updated = solr555pysolr.Solr555PySolrSearchEngine().search_filter_make_intersection(filter_query) self.assertEqual(updated, "+username:alastairp") filter_query = "username:alastairp license:(a OR b)" - updated = solr555pysolr.search_filter_make_intersection(filter_query) + updated = solr555pysolr.Solr555PySolrSearchEngine().search_filter_make_intersection(filter_query) self.assertEqual(updated, "+username:alastairp +license:(a OR b)") diff --git a/utils/search/backends/tests/test_solr_common.py b/utils/search/backends/tests/test_solr_common.py index 337b5aadb..048ce5840 100644 --- a/utils/search/backends/tests/test_solr_common.py +++ b/utils/search/backends/tests/test_solr_common.py @@ -1,15 +1,14 @@ from django.test import TestCase -from utils.search.backends import solr_common from utils.search.backends import solr555pysolr class SolrCommonTest(TestCase): def test_search_filter_make_intersection(self): filter_query = "username:alastairp" - updated = solr555pysolr.search_filter_make_intersection(filter_query) + updated = solr555pysolr.Solr555PySolrSearchEngine().search_filter_make_intersection(filter_query) self.assertEqual(updated, "+username:alastairp") filter_query = "username:alastairp license:(a OR b)" - updated = solr555pysolr.search_filter_make_intersection(filter_query) + updated = solr555pysolr.Solr555PySolrSearchEngine().search_filter_make_intersection(filter_query) self.assertEqual(updated, "+username:alastairp +license:(a OR b)") diff --git a/utils/search/search_sounds.py b/utils/search/search_sounds.py index bea52458f..1ec991203 100644 --- a/utils/search/search_sounds.py +++ b/utils/search/search_sounds.py @@ -379,11 +379,17 @@ def perform_search_engine_query(query_params): return results, paginator -def add_sounds_to_search_engine(sound_objects): +def add_sounds_to_search_engine(sound_objects, fields_to_include=[], update_mode=False): """Add the Sounds from the queryset to the search engine Args: sound_objects 
(list[sounds.models.Sound]): list (or queryset) of Sound objects to index
+        fields_to_include (list[str]): use this list to indicate the specific field names of the sounds
+            that need to be included in the documents that will be indexed. If no fields are specified
+            (fields_to_include=[]), then all available fields will be included.
+        update_mode (bool): if True, the sounds' data will be updated in the index, otherwise it will be
+            replaced by the new generated documents. This is especially useful in combination with
+            fields_to_include so that different fields of the indexed documents can be updated separately.
 
     Returns:
         int: number of sounds added to the index
@@ -395,7 +401,7 @@ def add_sounds_to_search_engine(sound_objects):
     try:
         console_logger.info("Adding %d sounds to the search engine" % num_sounds)
         search_logger.info("Adding %d sounds to the search engine" % num_sounds)
-        get_search_engine().add_sounds_to_index(sound_objects)
+        get_search_engine().add_sounds_to_index(sound_objects, fields_to_include=fields_to_include, update_mode=update_mode)
         return num_sounds
     except SearchEngineException as e:
         console_logger.info(f"Failed to add sounds to search engine index: {str(e)}")

From 9e3b67840a739273d78676b7b9de021f4cfaff5e Mon Sep 17 00:00:00 2001
From: ffont
Date: Tue, 23 Jan 2024 13:11:06 +0100
Subject: [PATCH 02/28] Small fixes in search test command

---
 utils/search/backends/test_search_engine_backend.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/utils/search/backends/test_search_engine_backend.py b/utils/search/backends/test_search_engine_backend.py
index eba5f80cb..99f58b24b 100644
--- a/utils/search/backends/test_search_engine_backend.py
+++ b/utils/search/backends/test_search_engine_backend.py
@@ -235,7 +235,7 @@ def sound_check_group_by_pack(self):
             assert_and_continue('group_name' in result, 'No group_name field in doc from results')
             assert_and_continue('group_docs' in result, 'No group_docs field in doc from results')
             assert_and_continue('n_more_in_group' in result, 'No n_more_in_group field in doc from results')
-            group_sounds = Sound.objects.bulk_query_id(sound_ids=[r['id'] for r in result['group_docs']])
+            group_sounds = Sound.objects.bulk_query_id(sound_ids=[int(r['id']) for r in result['group_docs']])
             first_sound_pack = group_sounds[0].pack
             for sound in group_sounds:
                 assert_and_continue(sound.pack == first_sound_pack, 'Different packs in pack group')
@@ -376,9 +376,7 @@ def test_search_enginge_backend_sounds(self):
         self.sound_check_get_user_tags(sounds[0])
         self.sound_check_get_pack_tags(sounds)
 
-        console_logger.info('Testing of sound search methods finished. You might want to run the '
-                            'reindex_search_engine_sounds -c command to make sure the index is left in a correct '
-                            'state after having run these tests')
+        console_logger.info('Testing of sound search methods finished!')
 
     def forum_check_mandatory_doc_fields(self):
         # Check that returned forum posts (docs) from search engine include the mandatory fields
@@ -519,6 +517,4 @@ def test_search_enginge_backend_forum(self):
         self.forum_check_highlighting()
         self.forum_check_extra_queries()
 
-        console_logger.info('Testing of forum search methods finished. 
You might want to run the ' - 'reindex_search_engine_forum -c command to make sure the index is left in a correct ' - 'state after having run these tests') + console_logger.info('Testing of forum search methods finished!') From 300201eebfebc82327bfc5ab15978ca7a5917922 Mon Sep 17 00:00:00 2001 From: ffont Date: Tue, 23 Jan 2024 13:12:49 +0100 Subject: [PATCH 03/28] Rename comments field to num_comments --- _docs/api/source/resources.rst | 2 +- utils/search/backends/solr555pysolr.py | 2 +- utils/search/solr9/cores/freesound/conf/schema.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/_docs/api/source/resources.rst b/_docs/api/source/resources.rst index bb96e3841..6c5c1259c 100644 --- a/_docs/api/source/resources.rst +++ b/_docs/api/source/resources.rst @@ -80,7 +80,7 @@ Filter name Type Description ``avg_rating`` numerical Average rating for the sound in the range [0, 5]. ``num_ratings`` integer Number of times the sound has been rated. ``comment`` string Textual content of the comments of a sound (tokenized). The filter is satisfied if sound contains the filter value in at least one of its comments. -``comments`` integer Number of times the sound has been commented. +``num_comments`` integer Number of times the sound has been commented. ====================== ============= ==================================================== diff --git a/utils/search/backends/solr555pysolr.py b/utils/search/backends/solr555pysolr.py index 1dd258103..174a0d56f 100644 --- a/utils/search/backends/solr555pysolr.py +++ b/utils/search/backends/solr555pysolr.py @@ -199,7 +199,7 @@ def convert_sound_to_search_engine_document(self, sound, fields_to_include=[]): document["samplerate"] = int(getattr(sound, "samplerate")) if getattr(sound, "samplerate") else 0 document["comment"] = [remove_control_chars(comment_text) for comment_text in getattr(sound, "comments_array")] - document["comments"] = getattr(sound, "num_comments") + document["num_comments"] = getattr(sound, "num_comments") locations = sound.locations() document["waveform_path_m"] = locations["display"]["wave"]["M"]["path"] diff --git a/utils/search/solr9/cores/freesound/conf/schema.xml b/utils/search/solr9/cores/freesound/conf/schema.xml index 4cc4fbccd..cfe8a5f10 100644 --- a/utils/search/solr9/cores/freesound/conf/schema.xml +++ b/utils/search/solr9/cores/freesound/conf/schema.xml @@ -231,7 +231,7 @@ - + From 21349a9a55f3020cc8bd9e2f29e192f2da1d1a67 Mon Sep 17 00:00:00 2001 From: ffont Date: Tue, 23 Jan 2024 14:00:07 +0100 Subject: [PATCH 04/28] Make fields non-required, some cleanups --- .../solr9/cores/freesound/conf/schema.xml | 76 ++++++++----------- 1 file changed, 32 insertions(+), 44 deletions(-) diff --git a/utils/search/solr9/cores/freesound/conf/schema.xml b/utils/search/solr9/cores/freesound/conf/schema.xml index cfe8a5f10..323429efb 100644 --- a/utils/search/solr9/cores/freesound/conf/schema.xml +++ b/utils/search/solr9/cores/freesound/conf/schema.xml @@ -1,15 +1,7 @@ id - - - - - - - - - + @@ -145,7 +137,6 @@ - - - - + - - - - + + + + + @@ -205,52 +195,50 @@ - - - + + + + - + - - - - - - - + + + + + + + - + - - + + - + - - - - - + + + + + - - - - + + + + - - - \ No newline at end of file From 26991bed775176f81b2c38e88be89e881af62633 Mon Sep 17 00:00:00 2001 From: ffont Date: Tue, 23 Jan 2024 14:00:35 +0100 Subject: [PATCH 05/28] Add tests for update/fields_to_include parameters of add_sounds_to_index --- utils/search/__init__.py | 4 +-- utils/search/backends/solr555pysolr.py | 14 +++++----- 
.../backends/test_search_engine_backend.py | 26 ++++++++++++++++++- utils/search/search_sounds.py | 6 ++--- 4 files changed, 38 insertions(+), 12 deletions(-) diff --git a/utils/search/__init__.py b/utils/search/__init__.py index 50fbe5518..697963c8b 100644 --- a/utils/search/__init__.py +++ b/utils/search/__init__.py @@ -184,14 +184,14 @@ class SearchEngineBase: # Sound search related methods - def add_sounds_to_index(self, sound_objects, fields_to_include=[], update_mode=False): + def add_sounds_to_index(self, sound_objects, fields_to_include=[], update=False): """Indexes the provided sound objects in the search index Args: sound_objects (list[sounds.models.Sound]): Sound objects of the sounds to index fields_to_include (list[str]): Specific sound fields that will be included in the document to be indexed. If empty, all available sound fields will be included. - update_mode (bool): Whether to perform an update of the existing documents in the index or to + update (bool): Whether to perform an update of the existing documents in the index or to completely replace them. An update is useful so that fields not included in the document are not removed from the index. """ diff --git a/utils/search/backends/solr555pysolr.py b/utils/search/backends/solr555pysolr.py index 174a0d56f..54f4c8328 100644 --- a/utils/search/backends/solr555pysolr.py +++ b/utils/search/backends/solr555pysolr.py @@ -224,10 +224,12 @@ def convert_sound_to_search_engine_document(self, sound, fields_to_include=[]): if suffix: document[f'{key}{suffix}'] = value - # Remove fields that should not be included - # Note that we could optimize this by never getting the data for these fields in the first place, but because - # the data is already retrieved in the queryset, that optimization would be negligible so we keep it simple. - document = {k: v for k, v in document.items() if k in fields_to_include or not fields_to_include} + if fields_to_include: + # Remove fields that should not be included + # Note that we could optimize this by never getting the data for these fields in the first place, but because + # the data is already retrieved in the queryset, that optimization would be negligible so we keep it simple. + document = {k: v for k, v in document.items() if k in fields_to_include} + document['id'] = sound.id # Make sure we always include the ID return document @@ -374,9 +376,9 @@ def search_process_filter(self, query_filter, only_sounds_within_ids=False, only return query_filter # Sound methods - def add_sounds_to_index(self, sound_objects, update_mode=False, fields_to_include=[]): + def add_sounds_to_index(self, sound_objects, update=False, fields_to_include=[]): documents = [self.convert_sound_to_search_engine_document(s, fields_to_include=fields_to_include) for s in sound_objects] - if update_mode: + if update: documents = [self.transform_document_into_update_document(d) for d in documents] try: self.get_sounds_index().add(documents) diff --git a/utils/search/backends/test_search_engine_backend.py b/utils/search/backends/test_search_engine_backend.py index 99f58b24b..dd6dff4d0 100644 --- a/utils/search/backends/test_search_engine_backend.py +++ b/utils/search/backends/test_search_engine_backend.py @@ -361,7 +361,31 @@ def test_search_enginge_backend_sounds(self): assert_and_continue(self.search_engine.sound_exists_in_index(sound), f'Sound ID {sound.id} should be in search index') - # Re-index all sounds to leave index in "correct" state + # Test the 'update' and 'include_fields' parameters of add_sounds_to_index. 
+        # Start by emptying the index and testing that when adding sounds with update=True, these get created if they don't already exist
+        self.search_engine.remove_all_sounds()
+        self.search_engine.add_sounds_to_index(sounds, update=True)
+        for sound in sounds:
+            assert_and_continue(self.search_engine.sound_exists_in_index(sound),
+                                f'Sound ID {sound.id} should be in the search index')
+
+        # Make a query filtering by a field we know is in the index and check that all results are returned
+        results = self.search_engine.search_sounds(query_filter='duration:[* TO *]')
+        assert_and_continue(len(sounds) == results.num_found, "All sounds should have been returned for this query")
+
+        # Now we index again but only with 2 fields and with update=False. This should replace existing documents and
+        # only index the selected fields. We then repeat the previous query, but because the "duration" field was not included
+        # in the new index, now the query should return no results.
+        self.search_engine.add_sounds_to_index(sounds, update=False, fields_to_include=['id', 'original_filename'])
+        results = self.search_engine.search_sounds(query_filter='duration:[* TO *]')
+        assert_and_continue(0 == results.num_found, "No sounds should have been returned in this query")
+
+        # Now we update the index with the duration field for all sounds and repeat the query, we should get all results again
+        self.search_engine.add_sounds_to_index(sounds, update=True, fields_to_include=['duration'])
+        results = self.search_engine.search_sounds(query_filter='duration:[* TO *]')
+        assert_and_continue(len(sounds) == results.num_found, "All sounds should have been returned for this query")
+
+        # Re-index all sounds to leave index in "correct" state for next tests
         self.search_engine.add_sounds_to_index(sounds)
 
         self.sound_check_mandatory_doc_fields()
diff --git a/utils/search/search_sounds.py b/utils/search/search_sounds.py
index 1ec991203..79d999497 100644
--- a/utils/search/search_sounds.py
+++ b/utils/search/search_sounds.py
@@ -379,7 +379,7 @@ def perform_search_engine_query(query_params):
     return results, paginator
 
 
-def add_sounds_to_search_engine(sound_objects, fields_to_include=[], update_mode=False):
+def add_sounds_to_search_engine(sound_objects, fields_to_include=[], update=False):
     """Add the Sounds from the queryset to the search engine
 
     Args:
@@ -387,7 +387,7 @@ def add_sounds_to_search_engine(sound_objects, fields_to_include=[], update_mode
         fields_to_include (list[str]): use this list to indicate the specific field names of the sounds
             that need to be included in the documents that will be indexed. If no fields are specified
            (fields_to_include=[]), then all available fields will be included.
-        update_mode (bool): if True, the sounds' data will be updated in the index, otherwise it will be
+        update (bool): if True, the sounds' data will be updated in the index, otherwise it will be
            replaced by the new generated documents. This is especially useful in combination with
            fields_to_include so that different fields of the indexed documents can be updated separately.
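+
+            For example (illustrative call), to refresh only the "duration" field of already indexed sounds:
+            add_sounds_to_search_engine(sound_objects, fields_to_include=['duration'], update=True)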
@@ -401,7 +401,7 @@ def add_sounds_to_search_engine(sound_objects, fields_to_include=[], update_mode try: console_logger.info("Adding %d sounds to the search engine" % num_sounds) search_logger.info("Adding %d sounds to the search engine" % num_sounds) - get_search_engine().add_sounds_to_index(sound_objects, fields_to_include=fields_to_include, update_mode=update_mode) + get_search_engine().add_sounds_to_index(sound_objects, fields_to_include=fields_to_include, update=update) return num_sounds except SearchEngineException as e: console_logger.info(f"Failed to add sounds to search engine index: {str(e)}") From 92ea90eaca9c46065ce951c921f23aa0ab53e3c7 Mon Sep 17 00:00:00 2001 From: ffont Date: Tue, 23 Jan 2024 23:47:19 +0100 Subject: [PATCH 06/28] Add solr-based basic similarity search support --- freesound/settings.py | 13 + general/tasks.py | 8 +- sounds/views.py | 12 +- utils/search/__init__.py | 8 + utils/search/backends/solr555pysolr.py | 226 +++++++++++++----- utils/search/backends/solr_common.py | 9 +- .../backends/test_search_engine_backend.py | 5 + utils/search/search_sounds.py | 13 +- .../solr9/cores/freesound/conf/schema.xml | 9 + 9 files changed, 220 insertions(+), 83 deletions(-) diff --git a/freesound/settings.py b/freesound/settings.py index 2223ddd38..3d06c31e8 100644 --- a/freesound/settings.py +++ b/freesound/settings.py @@ -638,6 +638,19 @@ SOLR5_BASE_URL = "http://search:8983/solr" SOLR9_BASE_URL = "http://search:8983/solr" +SEARCH_ENGINE_SIMILARITY_ANALYZERS = { + FSDSINET_ANALYZER_NAME: { + 'vector_property_name': 'embeddings', + 'vector_size': 100, + }, + AUDIOSET_YAMNET_ANALYZER_NAME: { + 'vector_property_name': 'embeddings', + 'vector_size': 100, # Note yamnet has higher dimensionality and here we're cropping dimensions + }, +} +SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER = FSDSINET_ANALYZER_NAME +USE_SEARCH_ENGINE_SIMILARITY = False # Does not currently apply to API + # ------------------------------------------------------------------------------- # Similarity client settings SIMILARITY_ADDRESS = 'similarity' diff --git a/general/tasks.py b/general/tasks.py index 2dea3c913..d7e0209d6 100644 --- a/general/tasks.py +++ b/general/tasks.py @@ -260,10 +260,12 @@ def process_analysis_results(sound_id, analyzer, status, analysis_time, exceptio {'task_name': PROCESS_ANALYSIS_RESULTS_TASK_NAME, 'sound_id': sound_id, 'analyzer': analyzer, 'status': status, 'exception': str(exception), 'work_time': round(time.time() - start_time)})) else: - # Load analysis output to database field (following configuration in settings.ANALYZERS_CONFIGURATION) + # Load analysis output to database field (following configuration in settings.ANALYZERS_CONFIGURATION) a.load_analysis_data_from_file_to_db() - # Set sound to index dirty so that the sound gets reindexed with updated analysis fields - a.sound.mark_index_dirty(commit=True) + + if analyzer in settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS or analyzer in settings.ANALYZERS_CONFIGURATION: + # If the analyzer produces data that should be indexed in the search engine, set sound index to dirty so that the sound gets reindexed soon + a.sound.mark_index_dirty(commit=True) workers_logger.info("Finished processing analysis results (%s)" % json.dumps( {'task_name': PROCESS_ANALYSIS_RESULTS_TASK_NAME, 'sound_id': sound_id, 'analyzer': analyzer, 'status': status, 'work_time': round(time.time() - start_time)})) diff --git a/sounds/views.py b/sounds/views.py index 34ff14fdf..4e86e6cf0 100644 --- a/sounds/views.py +++ b/sounds/views.py @@ -825,8 +825,16 @@ 
def similar(request, username, sound_id): if sound.user.username.lower() != username.lower(): raise Http404 - similarity_results, _ = get_similar_sounds( - sound, request.GET.get('preset', None), settings.NUM_SIMILAR_SOUNDS_PER_PAGE * settings.NUM_SIMILAR_SOUNDS_PAGES) + if not settings.USE_SEARCH_ENGINE_SIMILARITY: + # Get similar sounds from similarity service (gaia) + similarity_results, _ = get_similar_sounds( + sound, request.GET.get('preset', None), settings.NUM_SIMILAR_SOUNDS_PER_PAGE * settings.NUM_SIMILAR_SOUNDS_PAGES) + else: + # Get similar sounds from solr + from utils.search import get_search_engine + results = get_search_engine().search_sounds(similar_to=sound.id) + similarity_results = [(result['id'], result['score']) for result in results.docs] + paginator = paginate(request, [sound_id for sound_id, _ in similarity_results], settings.NUM_SIMILAR_SOUNDS_PER_PAGE) similar_sounds = Sound.objects.ordered_ids(paginator['page'].object_list) tvars = {'similar_sounds': similar_sounds, 'sound': sound} diff --git a/utils/search/__init__.py b/utils/search/__init__.py index 697963c8b..7ee6ad0db 100644 --- a/utils/search/__init__.py +++ b/utils/search/__init__.py @@ -219,6 +219,14 @@ def sound_exists_in_index(self, sound_object_or_id): bool: whether the sound is indexed in the search engine """ raise NotImplementedError + + def get_all_sound_ids_from_index(self): + """Return a list of all sound IDs indexed in the search engine + + Returns: + List[int]: list of all sound IDs indexed in the search engine + """ + raise NotImplementedError def search_sounds(self, textual_query='', query_fields=None, query_filter='', offset=0, current_page=None, num_sounds=settings.SOUNDS_PER_PAGE, sort=settings.SEARCH_SOUNDS_SORT_OPTION_AUTOMATIC, diff --git a/utils/search/backends/solr555pysolr.py b/utils/search/backends/solr555pysolr.py index 54f4c8328..2e9be07db 100644 --- a/utils/search/backends/solr555pysolr.py +++ b/utils/search/backends/solr555pysolr.py @@ -23,12 +23,13 @@ import re import math from datetime import date, datetime +from collections import defaultdict import pysolr from django.conf import settings from forum.models import Post -from sounds.models import Sound +from sounds.models import Sound, SoundAnalysis from utils.text import remove_control_chars from utils.search import SearchEngineBase, SearchResults, SearchEngineException from utils.search.backends.solr_common import SolrQuery, SolrResponseInterpreter @@ -158,7 +159,7 @@ def convert_sound_to_search_engine_document(self, sound, fields_to_include=[]): TODO: Assert that sound object is correct? """ # Document ID (same as sound ID) - document = {'id': sound.id} + document = {'id': sound.id, 'is_sound': True} # Basic sound fields keep_fields = ['username', 'created', 'is_explicit', 'is_remix', 'num_ratings', 'channels', 'md5', @@ -230,6 +231,7 @@ def convert_sound_to_search_engine_document(self, sound, fields_to_include=[]): # the data is already retrieved in the queryset, that optimization would be negligible so we keep it simple. 
             document = {k: v for k, v in document.items() if k in fields_to_include}
             document['id'] = sound.id # Make sure we always include the ID
+            document['is_sound'] = True # Make sure we always mark the document as a sound
 
         return document
 
@@ -378,6 +380,37 @@ def search_process_filter(self, query_filter, only_sounds_within_ids=False, only
         return query_filter
 
     # Sound methods
     def add_sounds_to_index(self, sound_objects, update=False, fields_to_include=[]):
         documents = [self.convert_sound_to_search_engine_document(s, fields_to_include=fields_to_include) for s in sound_objects]
+
+        # If required, collect similarity vectors from all configured analyzers
+        similarity_data = defaultdict(list)
+        if 'similarity_vectors' in fields_to_include or not fields_to_include:
+            sound_ids = [s.id for s in sound_objects]
+            for analyzer_name, config_options in settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS.items():
+                # Map the configured vector size to the corresponding solr dynamic field type
+                vector_solr_field_type = {100: 'sim_vector100'}.get(config_options['vector_size'], None)
+                if vector_solr_field_type is None:
+                    # If the vector size is not supported, then we can't index the vectors generated by the requested analyzer
+                    continue
+                for sa in SoundAnalysis.objects.filter(sound_id__in=sound_ids, analyzer=analyzer_name, analysis_status="OK"):
+                    similarity_vectors_per_analyzer_per_sound = []
+                    data = sa.get_analysis_data_from_file()
+                    if data is not None:
+                        if data[config_options['vector_property_name']] is not None:
+                            similarity_vectors_per_analyzer_per_sound.append({
+                                'is_sound': False,
+                                'analyzer': sa.analyzer,
+                                'timestamp_start': 0,  # This will be used in the future if analyzers generate multiple sound vectors
+                                'timestamp_end': -1,  # This will be used in the future if analyzers generate multiple sound vectors
+                                vector_solr_field_type: data[config_options['vector_property_name']][0:config_options['vector_size']]
+                            })
+                    if similarity_vectors_per_analyzer_per_sound:
+                        similarity_data[sa.sound_id] += similarity_vectors_per_analyzer_per_sound
+
+        # Add collected vectors to the documents created
+        for document in documents:
+            if document['id'] in similarity_data:
+                document['similarity_vectors'] = similarity_data[document['id']]
+
         if update:
             documents = [self.transform_document_into_update_document(d) for d in documents]
         try:
             self.get_sounds_index().add(documents)
         except pysolr.SolrError as e:
@@ -398,7 +431,6 @@ def remove_sounds_from_index(self, sound_objects_or_ids):
             raise SearchEngineException(e)
 
     def remove_all_sounds(self):
-        """Removes all sounds from the search index"""
         try:
             self.get_sounds_index().delete(q="*:*")
         except pysolr.SolrError as e:
@@ -411,6 +443,21 @@ def sound_exists_in_index(self, sound_object_or_id):
         response = self.search_sounds(query_filter=f'id:{sound_id}', offset=0, num_sounds=1)
         return response.num_found > 0
+
+    def get_all_sound_ids_from_index(self):
+        page_size = 2000
+        solr_ids = []
+        solr_count = None
+        current_page = 1
+        while solr_count is None or len(solr_ids) < solr_count:
+            response = self.search_sounds(sort=settings.SEARCH_SOUNDS_SORT_OPTION_DATE_NEW_FIRST,
+                                          offset=(current_page - 1) * page_size,
+                                          num_sounds=page_size)
+            solr_ids += [int(element['id']) for element in response.docs]
+            solr_count = response.num_found
+            current_page += 1
+        return sorted(solr_ids)
 
     def search_sounds(self, textual_query='', query_fields=None, query_filter='', offset=0, current_page=None,
                       num_sounds=settings.SOUNDS_PER_PAGE, sort=settings.SEARCH_SOUNDS_SORT_OPTION_AUTOMATIC,
                       group_by_pack=False, num_sounds_per_pack_group=1, facets=None, only_sounds_with_pack=False,
-                      only_sounds_within_ids=False, group_counts_as_one_in_facets=False):
+                      only_sounds_within_ids=False, group_counts_as_one_in_facets=False, similar_to=None):
 
         query = SolrQuery()
+        if similar_to is None:
+            # Usual search query, no similarity search
+
+            # Process search fields: replace "db" field names by solr field names and set default weights if needed
+            if query_fields is None:
+                # If no fields provided, use the default
+                query_fields = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS
+            if isinstance(query_fields, list):
+                query_fields = [self.add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)) for field in query_fields]
+            elif isinstance(query_fields, dict):
+                # Also remove fields with weight <= 0
+                query_fields = [(self.add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)), weight)
+                                for field, weight in query_fields.items() if weight > 0]
+
+            # Set main query options
+            query.set_dismax_query(textual_query, query_fields=query_fields)
+
+            # Process filter
+            query_filter = self.search_process_filter(query_filter,
+                                                      only_sounds_within_ids=only_sounds_within_ids,
+                                                      only_sounds_with_pack=only_sounds_with_pack)
+
+            # Set other query options
+            if current_page is not None:
+                offset = (current_page - 1) * num_sounds
+            query.set_query_options(start=offset,
+                                    rows=num_sounds,
+                                    field_list=["id", "score"],  # We only want the sound IDs of the results as we load data from DB
+                                    filter_query=query_filter,
+                                    sort=self.search_process_sort(sort))
+
+            # Configure facets
+            if facets is not None:
+                facet_fields = [FIELD_NAMES_MAP[field_name] for field_name, _ in facets.items()]
+                query.add_facet_fields(*facet_fields)
+                query.set_facet_options_default(**SOLR_SOUND_FACET_DEFAULT_OPTIONS)
+                for field_name, extra_options in facets.items():
+                    query.set_facet_options(FIELD_NAMES_MAP[field_name], **extra_options)
+
+            # Configure grouping
+            if group_by_pack:
+                query.set_group_field(group_field="grouping_pack")
+                query.set_group_options(
+                    group_func=None,
+                    group_query=None,
+                    group_rows=10,  # TODO: if limit is lower than rows and start=0, this should probably be equal to limit
+                    group_start=0,
+                    group_limit=num_sounds_per_pack_group,  # This is the number of documents that will be returned for each group.
+                    group_offset=0,
+                    group_sort=None,
+                    group_sort_ingroup=None,
+                    group_format='grouped',
+                    group_main=False,
+                    group_num_groups=True,
+                    group_cache_percent=0,
+                    group_truncate=group_counts_as_one_in_facets)
+        else:
+
+            vector = None
+            if isinstance(similar_to, list):
+                vector = similar_to  # we allow vectors to be passed directly
+            else:
+                # similar_to should be a sound_id
+                sa = SoundAnalysis.objects.filter(sound_id=similar_to, analyzer=settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER, analysis_status="OK")
+                config_options = settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS[settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER]
+                if sa.exists():
+                    data = sa.first().get_analysis_data_from_file()
+                    if data is not None:
+                        vector = data[config_options['vector_property_name']][0:config_options['vector_size']]
+
+            # Set query
+            if vector is not None:
+                max_similar_sounds = 1000000  # TODO: evaluate the performance impact of this so we can set it as high as possible. We want this to be high so that we get a large pool of similar sounds to which the filters can then be applied. Ideally we'd like this to be the whole collection.
+                serialized_vector = ','.join([str(n) for n in vector])
+                query.set_query(f'{{!knn f=sim_vector100 topK={max_similar_sounds}}}[{serialized_vector}]')
+
+                # Process filter
+                query_filter = self.search_process_filter(query_filter,
+                                                          only_sounds_within_ids=only_sounds_within_ids,
+                                                          only_sounds_with_pack=only_sounds_with_pack)
+
+                # Set other query options
+                if current_page is not None:
+                    offset = (current_page - 1) * num_sounds
+
+                filter_query = [f'analyzer:{settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER}']  # Add basic filter to only get similarity vectors from the selected analyzer
+                for part in query_filter.split('+'):
+                    if part:
+                        # Add extra query filters to the search query, but using the appropriate prefix to make sure they are applied to the root documents
+                        filter_query.append('{!child of=\"*:* -_nest_path_:*\"}' + part)
+
+                query.set_query_options(start=offset,
+                                        rows=num_sounds,
+                                        field_list=["id", "score"],  # We only want the sound IDs of the results as we load data from DB. In the future we might add timestamp_start/timestamp_end
+                                        filter_query=filter_query,
+                                        sort=['score desc'])
+
+                # NOTE: ATM we cannot add more query options (faceting, etc.) to similarity search because it does not return
+                # root documents but child documents (so that we can get multiple matches per sound if there are multiple vectors indexed).
+                # If we manage to improve the query so that it returns root documents, we can add more query options here. Or we could also
+                # first do the similarity search query and then do a normal search with the results of the similarity search as a filter...
+            else:
+                query.set_query('')
+                # If there is no vector found we can't do similarity search. Configure the query to return no results
 
-        # Process search fields: replace "db" field names by solr field names and set default weights if needed
-        if query_fields is None:
-            # If no fields provided, use the default
-            query_fields = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS
-        if isinstance(query_fields, list):
-            query_fields = [self.add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)) for field in query_fields]
-        elif isinstance(query_fields, dict):
-            # Also remove fields with weight <= 0
-            query_fields = [(self.add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)), weight)
-                            for field, weight in query_fields.items() if weight > 0]
-
-        # Set main query options
-        query.set_dismax_query(textual_query, query_fields=query_fields)
-
-        # Process filter
-        query_filter = self.search_process_filter(query_filter,
-                                                  only_sounds_within_ids=only_sounds_within_ids,
-                                                  only_sounds_with_pack=only_sounds_with_pack)
-
-        # Set other query options
-        if current_page is not None:
-            offset = (current_page - 1) * num_sounds
-        query.set_query_options(start=offset,
-                                rows=num_sounds,
-                                field_list=["id", "score"],  # We only want the sound IDs of the results as we load data from DB
-                                filter_query=query_filter,
-                                sort=self.search_process_sort(sort))
-
-        # Configure facets
-        if facets is not None:
-            facet_fields = [FIELD_NAMES_MAP[field_name] for field_name, _ in facets.items()]
-            query.add_facet_fields(*facet_fields)
-            query.set_facet_options_default(**SOLR_SOUND_FACET_DEFAULT_OPTIONS)
-            for field_name, extra_options in facets.items():
-                query.set_facet_options(FIELD_NAMES_MAP[field_name], **extra_options)
-
-        # Configure grouping
-        if group_by_pack:
-            query.set_group_field(group_field="grouping_pack")
-            query.set_group_options(
-                group_func=None,
-                group_query=None,
-                group_rows=10, # TODO: if limit is lower than
rows and start=0, this should probably be equal to limit - group_start=0, - group_limit=num_sounds_per_pack_group, # This is the number of documents that will be returned for each group. - group_offset=0, - group_sort=None, - group_sort_ingroup=None, - group_format='grouped', - group_main=False, - group_num_groups=True, - group_cache_percent=0, - group_truncate=group_counts_as_one_in_facets) # Do the query! # Note: we create a SearchResults with the same members as SolrResponseInterpreter (the response from .search()). # We do it in this way to conform to SearchEngine.search_sounds definition which must return SearchResults try: - results = self.get_sounds_index().search(**query.as_kwargs()) + results = self.get_sounds_index().search(**query.as_kwargs(force_sounds=similar_to is None)) # Solr uses a string for the id field, but django uses an int. Convert the id in all results to int # before use to avoid issues - docs = results.docs - for d in docs: - d["id"] = int(d["id"]) + for d in results.docs: + # Get the sound ids from the results + d["id"] = int(d["id"] if similar_to is None else d["id"].split('/')[0]) return SearchResults( docs=results.docs, num_found=results.num_found, @@ -505,7 +602,7 @@ def get_random_sound_id(self): query.set_query("*:*") query.set_query_options(start=0, rows=1, field_list=["id"], filter_query=filter_query, sort=sort) try: - response = self.get_sounds_index().search(search_handler="select", **query.as_kwargs()) + response = self.get_sounds_index().search(search_handler="select", **query.as_kwargs(force_sounds=True)) docs = response.docs if docs: return int(docs[0]['id']) @@ -535,7 +632,6 @@ def remove_forum_posts_from_index(self, forum_post_objects_or_ids): raise SearchEngineException(e) def remove_all_forum_posts(self): - """Removes all forum posts from the search index""" try: self.get_forum_index().delete(q="*:*") except pysolr.SolrError as e: @@ -613,7 +709,7 @@ def get_user_tags(self, username): query.add_facet_fields("tag") query.set_facet_options("tag", limit=10, mincount=1) try: - results = self.get_sounds_index().search(**query.as_kwargs()) + results = self.get_sounds_index().search(**query.as_kwargs(force_sounds=True)) return results.facets['tag'] except pysolr.SolrError as e: raise SearchEngineException(e) @@ -626,7 +722,7 @@ def get_pack_tags(self, username, pack_name): query.add_facet_fields("tag") query.set_facet_options("tag", limit=20, mincount=1) try: - results = self.get_sounds_index().search(**query.as_kwargs()) + results = self.get_sounds_index().search(**query.as_kwargs(force_sounds=True)) return results.facets['tag'] except pysolr.SolrError as e: raise SearchEngineException(e) diff --git a/utils/search/backends/solr_common.py b/utils/search/backends/solr_common.py index 7128d6aa9..1014e9b75 100644 --- a/utils/search/backends/solr_common.py +++ b/utils/search/backends/solr_common.py @@ -268,12 +268,19 @@ def set_group_options(self, group_func=None, group_query=None, group_rows=10, gr self.params['group.truncate'] = group_truncate self.params['group.cache.percent'] = group_cache_percent - def as_kwargs(self): + def as_kwargs(self, force_sounds=False): """Return params in a way that can be passed to pysolr commands as kwargs""" params = {k: v for k, v in self.params.items() if v is not None} for k, v in params.items(): if isinstance(v, bool): params[k] = json.dumps(v) + # If 'force_sounds', we want to make sure we only include sound documents in the query and not any child documents. Add an extra fq to force that. 
+        if force_sounds:
+            current_fq = params['fq']
+            if isinstance(current_fq, list):
+                params.update({'fq': current_fq + ['is_sound:1']})
+            else:
+                params.update({'fq': [current_fq, 'is_sound:1']})
         return params
 
 
diff --git a/utils/search/backends/test_search_engine_backend.py b/utils/search/backends/test_search_engine_backend.py
index dd6dff4d0..707edbc27 100644
--- a/utils/search/backends/test_search_engine_backend.py
+++ b/utils/search/backends/test_search_engine_backend.py
@@ -388,6 +388,11 @@ def test_search_enginge_backend_sounds(self):
         # Re-index all sounds to leave index in "correct" state for next tests
         self.search_engine.add_sounds_to_index(sounds)
 
+        # Test that the method to get all sound IDs works as expected
+        sound_ids = self.search_engine.get_all_sound_ids_from_index()
+        sound_ids_db = sorted([s.id for s in sounds])
+        assert_and_continue(sound_ids_db == sound_ids, 'get_all_sound_ids_from_index returned wrong sound IDs')
+
         self.sound_check_mandatory_doc_fields()
         self.sound_check_random_sound()
         self.sound_check_offsets()
diff --git a/utils/search/search_sounds.py b/utils/search/search_sounds.py
index 79d999497..f56610fa5 100644
--- a/utils/search/search_sounds.py
+++ b/utils/search/search_sounds.py
@@ -446,21 +446,10 @@ def get_all_sound_ids_from_search_engine(page_size=2000):
     """
     console_logger.info("Getting all sound ids from search engine")
     search_engine = get_search_engine()
-    solr_ids = []
-    solr_count = None
-    current_page = 1
     try:
-        while solr_count is None or len(solr_ids) < solr_count:
-            response = search_engine.search_sounds(query_filter="*:*",
-                                                   sort=settings.SEARCH_SOUNDS_SORT_OPTION_DATE_NEW_FIRST,
-                                                   offset=(current_page - 1) * page_size,
-                                                   num_sounds=page_size)
-            solr_ids += [int(element['id']) for element in response.docs]
-            solr_count = response.num_found
-            current_page += 1
+        return search_engine.get_all_sound_ids_from_index()
     except SearchEngineException as e:
         search_logger.info(f"Could not retrieve all sound IDs from search engine: {str(e)}")
-    return sorted(solr_ids)
 
 
 def get_random_sound_id_from_search_engine():
diff --git a/utils/search/solr9/cores/freesound/conf/schema.xml b/utils/search/solr9/cores/freesound/conf/schema.xml
index 323429efb..6625443e3 100644
--- a/utils/search/solr9/cores/freesound/conf/schema.xml
+++ b/utils/search/solr9/cores/freesound/conf/schema.xml
@@ -182,6 +182,7 @@
+    <!-- one field definition lost in extraction; presumably the new is_sound field -->
@@ -238,6 +239,14 @@
+    <!-- eight lines lost in extraction; presumably the nested-document fields and the
+         100-dimension sim_vector100 vector field type and field -->

From 0d793d1b28978c9c98080a624971ec7e48187128 Mon Sep 17 00:00:00 2001
From: ffont
Date: Wed, 24 Jan 2024 09:39:28 +0100
Subject: [PATCH 07/28] Add more tasks to vscode workspace

---
 freesound.code-workspace | 43 ++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/freesound.code-workspace b/freesound.code-workspace
index b159aadd8..4dc94ba62 100644
--- a/freesound.code-workspace
+++ b/freesound.code-workspace
@@ -34,28 +34,23 @@
     "tasks": {
         "version": "2.0.0",
         "tasks": [
-            {
-                "label": "Run web and search",
-                "type": "shell",
-                "command": "docker-compose up web search",
-                "problemMatcher": []
-            },
+
             {
                 "label": "Docker compose build",
                 "type": "shell",
-                "command": "docker-compose build",
+                "command": "docker compose build",
                 "problemMatcher": []
             },
             {
                 "label": "Build static",
                 "type": "shell",
-                "command": "docker-compose run --rm web npm run build && docker-compose run --rm web python manage.py collectstatic --clear --noinput",
+                "command": "docker compose run --rm web npm run build && docker compose run --rm web python manage.py collectstatic --clear --noinput",
                 "problemMatcher": []
            },
            {
"label": "Install static", "type": "shell", - "command": "docker-compose run --rm web npm install --force", + "command": "docker compose run --rm web npm install --force", "problemMatcher": [] }, { @@ -67,37 +62,55 @@ { "label": "Create caches", "type": "shell", - "command": "docker-compose run --rm web python manage.py create_front_page_caches && docker-compose run --rm web python manage.py create_random_sounds && docker-compose run --rm web python manage.py generate_geotags_bytearray", + "command": "docker compose run --rm web python manage.py create_front_page_caches && docker compose run --rm web python manage.py create_random_sounds && docker compose run --rm web python manage.py generate_geotags_bytearray", "problemMatcher": [] }, { "label": "Run tests", "type": "shell", - "command": "docker-compose run --rm web python manage.py test --settings=freesound.test_settings", + "command": "docker compose run --rm web python manage.py test --settings=freesound.test_settings", "problemMatcher": [] }, { "label": "Run tests verbose with warnings", "type": "shell", - "command": "docker-compose run --rm web python -Wa manage.py test -v3 --settings=freesound.test_settings", + "command": "docker compose run --rm web python -Wa manage.py test -v3 --settings=freesound.test_settings", "problemMatcher": [] }, { "label": "Migrate", "type": "shell", - "command": "docker-compose run --rm web python manage.py migrate", + "command": "docker compose run --rm web python manage.py migrate", "problemMatcher": [] }, { "label": "Make migrations", "type": "shell", - "command": "docker-compose run --rm web python manage.py makemigrations", + "command": "docker compose run --rm web python manage.py makemigrations", "problemMatcher": [] }, { "label": "Shell plus", "type": "shell", - "command": "docker-compose run --rm web python manage.py shell_plus", + "command": "docker compose run --rm web python manage.py shell_plus", + "problemMatcher": [] + }, + { + "label": "Reindex search engine", + "type": "shell", + "command": "docker compose run --rm web python manage.py reindex_search_engine_sounds && docker compose run --rm web python manage.py reindex_search_engine_forum", + "problemMatcher": [] + }, + { + "label": "Post dirty sounds to search engine", + "type": "shell", + "command": "docker compose run --rm web python manage.py post_dirty_sounds_to_search_engine", + "problemMatcher": [] + }, + { + "label": "Orchestrate analysis", + "type": "shell", + "command": "docker compose run --rm web python manage.py orchestrate_analysis", "problemMatcher": [] } ] From edddb731a90aa7c15cf3305c2ff63bdb1212d712 Mon Sep 17 00:00:00 2001 From: ffont Date: Wed, 24 Jan 2024 09:40:01 +0100 Subject: [PATCH 08/28] Replace docker-compose by docker compose in docs --- DEVELOPERS.md | 6 +++--- README.md | 34 +++++++++++++++++----------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 3a3a02c8a..7e4a6aa4d 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -144,7 +144,7 @@ If a new search engine backend class is to be implemented, it must closely follo utils.search.SearchEngineBase docstrings. There is a Django management command that can be used in order to test the implementation of a search backend. 
You can run it like:
 
-    docker-compose run --rm web python manage.py test_search_engine_backend -fsw --backend utils.search.backends.solr9pysolr.Solr9PySolrSearchEngine
+    docker compose run --rm web python manage.py test_search_engine_backend -fsw --backend utils.search.backends.solr9pysolr.Solr9PySolrSearchEngine
 
 Please read carefully the documentation of the management command to better understand how it works and how it does the testing.
 
@@ -217,7 +217,7 @@ https://github.com/mtg/freesound-audio-analyzers. The docker compose of the main
 services for the external analyzers which depend on docker images having been previously built from the
 `freesound-audio-analyzers` repository. To build these images you simply need to checkout the code repository and run
 `make`. Once the images are built, Freesound can be run including the external analyzer services by means of the docker compose
-file by running `docker-compose --profile analyzers up`
+file by running `docker compose --profile analyzers up`
 
 The new analysis pipeline uses a job queue based on Celery/RabbitMQ. RabbitMQ console can be accessed at port `5673`
 (e.g. `http://localhost:5673/rabbitmq-admin`) and using `guest` as both username and password. Also, accessing
@@ -231,7 +231,7 @@
 for Freesound async tasks other than analysis).
 
 - Make sure that there are no outstanding deprecation warnings for the version of django that we are upgrading to.
 
-    docker-compose run --rm web python -Wd manage.py test
+    docker compose run --rm web python -Wd manage.py test
 
 Check for warnings of the form `RemovedInDjango110Warning` (TODO: Make tests fail if a warning occurs)
diff --git a/README.md b/README.md
index d637b50f3..9e2ff7a93 100644
--- a/README.md
+++ b/README.md
@@ -65,35 +65,35 @@ Below are instructions for setting up a local Freesound installation for develop
 
 8. Build all Docker containers. The first time you run this command it can take a while as a number of Docker images need to be downloaded and things need to be installed and compiled.
 
-        docker-compose build
+        docker compose build
 
 9. Download the [Freesound development database dump](https://drive.google.com/file/d/11z9s8GyYkVlmWdEsLSwUuz0AjZ8cEvGy/view?usp=share_link) (~6MB), uncompress it and place the resulting `freesound-small-dev-dump-2023-09.sql` in the `freesound-data/db_dev_dump/` directory. Then run the database container and load the data into it using the commands below. You should get permission to download this file from Freesound admins.
 
-        docker-compose up -d db
-        docker-compose run --rm db psql -h db -U freesound -d freesound -f freesound-data/db_dev_dump/freesound-small-dev-dump-2023-09.sql
+        docker compose up -d db
+        docker compose run --rm db psql -h db -U freesound -d freesound -f freesound-data/db_dev_dump/freesound-small-dev-dump-2023-09.sql
         # or if the above command does not work, try this one
-        docker-compose run --rm --no-TTY db psql -h db -U freesound -d freesound < freesound-data/db_dev_dump/freesound-small-dev-dump-2023-09.sql
+        docker compose run --rm --no-TTY db psql -h db -U freesound -d freesound < freesound-data/db_dev_dump/freesound-small-dev-dump-2023-09.sql
 
 10. Update database by running Django migrations
 
-        docker-compose run --rm web python manage.py migrate
+        docker compose run --rm web python manage.py migrate
 
 11. Create a superuser account to be able to log in to the local Freesound website and to the admin site
 
-        docker-compose run --rm web python manage.py createsuperuser
+        docker compose run --rm web python manage.py createsuperuser
 
 12.
Install static build dependencies
 
-        docker-compose run --rm web npm install --force
+        docker compose run --rm web npm install --force
 
 13. Build static files. Note that this step will need to be re-run every time there are changes in Freesound's static code (JS, CSS and static media files).
 
-        docker-compose run --rm web npm run build
-        docker-compose run --rm web python manage.py collectstatic --noinput
+        docker compose run --rm web npm run build
+        docker compose run --rm web python manage.py collectstatic --noinput
 
 14. Run services 🎉
 
-        docker-compose up
+        docker compose up
 
     When running this command, the most important services that make Freesound work will be run locally. This includes the web application and database, but also the search engine, cache manager, queue manager and asynchronous workers, including audio processing.
 
@@ -102,24 +102,24 @@ Below are instructions for setting up a local Freesound installation for develop
 15. Build the search index, so you can search for sounds and forum posts
 
         # Open a new terminal window so the services started in the previous step keep running
-        docker-compose run --rm web python manage.py reindex_search_engine_sounds
-        docker-compose run --rm web python manage.py reindex_search_engine_forum
+        docker compose run --rm web python manage.py reindex_search_engine_sounds
+        docker compose run --rm web python manage.py reindex_search_engine_forum
 
     After following the steps, you'll have a functional Freesound installation up and running, with the most relevant services properly configured. You can run Django's shell plus command like this:
 
-        docker-compose run --rm web python manage.py shell_plus
+        docker compose run --rm web python manage.py shell_plus
 
     Because the `web` container mounts a named volume for the home folder of the user running the shell plus process, command history should be kept between container runs :)
 
-16. (extra step) The steps above will get Freesound running, but to save resources in your local machine some non-essential services will not be started by default. If you look at the `docker-compose.yml` file, you'll see that some services are marked with the profile `analyzers` or `all`. These services include sound similarity, search results clustering and the audio analyzers. To run these services you need to explicitly tell `docker-compose` using the `--profile` (note that some services need additional configuration steps (see *Freesound analysis pipeline* section in `DEVELOPERS.md`):
+16. (extra step) The steps above will get Freesound running, but to save resources in your local machine some non-essential services will not be started by default. If you look at the `docker-compose.yml` file, you'll see that some services are marked with the profile `analyzers` or `all`. These services include sound similarity, search results clustering and the audio analyzers.
To run these services you need to explicitly tell `docker compose` using the `--profile` flag (note that some services need additional configuration steps; see the *Freesound analysis pipeline* section in `DEVELOPERS.md`):
 
-        docker-compose --profile analyzers up  # To run all basic services + sound analyzers
-        docker-compose --profile all up        # To run all services
+        docker compose --profile analyzers up  # To run all basic services + sound analyzers
+        docker compose --profile all up        # To run all services
 
 ### Running tests
 
 You can run tests using the Django test runner in the `web` container like this:
 
-    docker-compose run --rm web python manage.py test --settings=freesound.test_settings
+    docker compose run --rm web python manage.py test --settings=freesound.test_settings

From 475a2221219c15337beb420d926486cd4abb9ce0 Mon Sep 17 00:00:00 2001
From: ffont
Date: Wed, 24 Jan 2024 09:40:29 +0100
Subject: [PATCH 09/28] Add parameter to choose analyzer for similarity

---
 utils/search/backends/solr555pysolr.py | 104 ++++++++++++++-----------
 1 file changed, 58 insertions(+), 46 deletions(-)

diff --git a/utils/search/backends/solr555pysolr.py b/utils/search/backends/solr555pysolr.py
index 2e9be07db..2822431d2 100644
--- a/utils/search/backends/solr555pysolr.py
+++ b/utils/search/backends/solr555pysolr.py
@@ -86,6 +86,11 @@
 }
 
 
+SOLR_VECTOR_FIELDS_DIMENSIONS_MAP = {
+    100: 'sim_vector100',
+}
+
+
 SOLR_SOUND_FACET_DEFAULT_OPTIONS = {
     'limit': 5,
     'sort': True,
@@ -387,7 +392,7 @@ def add_sounds_to_index(self, sound_objects, update=False, fields_to_include=[])
             sound_ids = [s.id for s in sound_objects]
             for analyzer_name, config_options in settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS.items():
                 # If we should index similarity data, add it to the documents
-                vector_solr_field_type = {100: 'sim_vector100'}.get(config_options['vector_size'], None)
+                vector_solr_field_type = SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(config_options['vector_size'], None)
                 if vector_solr_field_type is None:
                     # If the vector size is not supported, then we can't index the vectors generated by the requested analyzer
                     continue
@@ -461,7 +466,8 @@ def get_all_sound_ids_from_index(self):
     def search_sounds(self, textual_query='', query_fields=None, query_filter='', offset=0, current_page=None,
                       num_sounds=settings.SOUNDS_PER_PAGE, sort=settings.SEARCH_SOUNDS_SORT_OPTION_AUTOMATIC,
                       group_by_pack=False, num_sounds_per_pack_group=1, facets=None, only_sounds_with_pack=False,
-                      only_sounds_within_ids=False, group_counts_as_one_in_facets=False, similar_to=None):
+                      only_sounds_within_ids=False, group_counts_as_one_in_facets=False,
+                      similar_to=None, similar_to_analyzer=settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER):
 
         query = SolrQuery()
 
@@ -522,53 +528,59 @@ def search_sounds(self, textual_query='', query_fields=None, query_filter='', of
                     group_cache_percent=0,
                     group_truncate=group_counts_as_one_in_facets)
         else:
+            # Similarity search!
-            vector = None
-            if isinstance(similar_to, list):
-                vector = similar_to  # we allow vectors to be passed directly
-            else:
-                # similar_to should be a sound_id
-                sa = SoundAnalysis.objects.filter(sound_id=similar_to, analyzer=settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER, analysis_status="OK")
-                config_options = settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS[settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER]
-                if sa.exists():
-                    data = sa.first().get_analysis_data_from_file()
-                    if data is not None:
-                        vector = data[config_options['vector_property_name']][0:config_options['vector_size']]
-
-            # Set query
-            if vector is not None:
-                max_similar_sounds = 1000000  # TODO: evaluate the performance impact of this so we can set it as high as possible. We want this to be high so that we get a large pool of similar sounds to which the filters can then be applied. Ideally we'd like this to be the whole collection.
-                serialized_vector = ','.join([str(n) for n in vector])
-                query.set_query(f'{{!knn f=sim_vector100 topK={max_similar_sounds}}}[{serialized_vector}]')
-
-                # Process filter
-                query_filter = self.search_process_filter(query_filter,
-                                                          only_sounds_within_ids=only_sounds_within_ids,
-                                                          only_sounds_with_pack=only_sounds_with_pack)
-
-                # Set other query options
-                if current_page is not None:
-                    offset = (current_page - 1) * num_sounds
+            # We first set an empty query that will return no results and will be used by default if similarity can't be performed
+            query.set_query('')
+            if similar_to_analyzer in settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS:
+                # Get target vector from sound or from parameter
+                vector = None
+                extra_offset = 0
+                if isinstance(similar_to, list):
+                    vector = similar_to  # we allow vectors to be passed directly
+                    vector_field_name = SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(len(vector), None)
+                else:
+                    # similar_to should be a sound_id
+                    extra_offset = 1  # We add 1 to the offset so that we don't get the sound itself as a result
+                    sa = SoundAnalysis.objects.filter(sound_id=similar_to, analyzer=similar_to_analyzer, analysis_status="OK")
+                    config_options = settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS[similar_to_analyzer]
+                    vector_field_name = SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(config_options['vector_size'], None)
+                    if sa.exists():
+                        data = sa.first().get_analysis_data_from_file()
+                        if data is not None:
+                            vector = data[config_options['vector_property_name']][0:config_options['vector_size']]
 
-                filter_query = [f'analyzer:{settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER}']  # Add basic filter to only get similarity vectors from the selected analyzer
-                for part in query_filter.split('+'):
-                    if part:
-                        # Add extra query filters to the search query, but using the appropriate prefix to make sure they are applied to the root documents
-                        filter_query.append('{!child of=\"*:* -_nest_path_:*\"}' + part)
-
-                query.set_query_options(start=offset,
-                                        rows=num_sounds,
-                                        field_list=["id", "score"],  # We only want the sound IDs of the results as we load data from DB. In the future we might add timestamp_start/timestamp_end
-                                        filter_query=filter_query,
-                                        sort=['score desc'])
+                # Set query
+                if vector is not None and vector_field_name is not None:
+                    max_similar_sounds = 1000000  # TODO: evaluate the performance impact of this so we can set it as high as possible. We want this to be high so that we get a large pool of similar sounds to which the filters can then be applied. Ideally we'd like this to be the whole collection.
+                    serialized_vector = ','.join([str(n) for n in vector])
+                    query.set_query(f'{{!knn f={vector_field_name} topK={max_similar_sounds}}}[{serialized_vector}]')
 
-                # NOTE: ATM we cannot add more query options (faceting, etc.) to similarity search because it does not return
-                # root documents but child documents (so that we can get multiple matches per sound if there are multiple vectors indexed).
-                # If we manage to improve the query so that it returns root documents, we can add more query options here. Or we could also
-                # first do the similarity search query and then do a normal search with the results of the similarity search as a filter...
-            else:
-                query.set_query('')
-                # If there is no vector found we can't do similarity search. Configure the query to return no results
+                    # Process filter
+                    query_filter = self.search_process_filter(query_filter,
+                                                              only_sounds_within_ids=only_sounds_within_ids,
+                                                              only_sounds_with_pack=only_sounds_with_pack)
+
+                    # Set other query options
+                    if current_page is not None:
+                        offset = (current_page - 1) * num_sounds
+
+                    filter_query = [f'analyzer:{similar_to_analyzer}']  # Add basic filter to only get similarity vectors from the selected analyzer
+                    for part in query_filter.split('+'):
+                        if part:
+                            # Add extra query filters to the search query, but using the appropriate prefix to make sure they are applied to the root documents
+                            filter_query.append('{!child of=\"is_sound:1\"}' + part)
+
+                    query.set_query_options(start=offset + extra_offset,
+                                            rows=num_sounds,
+                                            field_list=["id", "score"],  # We only want the sound IDs of the results as we load data from DB. In the future we might add timestamp_start/timestamp_end
+                                            filter_query=filter_query,
+                                            sort=['score desc'])
+
+                    # NOTE: ATM we cannot add more query options (faceting, etc.) to similarity search because it does not return
+                    # root documents but child documents (so that we can get multiple matches per sound if there are multiple vectors indexed).
+                    # If we manage to improve the query so that it returns root documents, we can add more query options here. Or we could also
+                    # first do the similarity search query and then do a normal search with the results of the similarity search as a filter...
 
         # Do the query!
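In practice, the hunks above reduce the similarity request to a Solr knn query over child documents plus sound-level filters re-applied through the parent/child join. A minimal sketch of that assembly, assuming a 100-dimensional vector and the `sim_vector100` / `is_sound` / `analyzer` fields introduced earlier in this series; `build_knn_params` is an illustrative helper, not part of the patch:

    # Illustrative sketch only: build Solr request params for a knn similarity
    # query shaped like the one produced by search_sounds() above.
    def build_knn_params(vector, analyzer, extra_filters=None, rows=15, top_k=1000000):
        serialized_vector = ','.join(str(n) for n in vector)
        params = {
            # knn query parser over the 100-dimension child-document vector field
            'q': f'{{!knn f=sim_vector100 topK={top_k}}}[{serialized_vector}]',
            # only match vectors produced by the requested analyzer
            'fq': [f'analyzer:{analyzer}'],
            'fl': 'id,score',
            'sort': 'score desc',
            'rows': rows,
        }
        for part in (extra_filters or []):
            # sound-level filters refer to root document fields, so they are
            # applied through the parent/child join, as in the patch
            params['fq'].append('{!child of="is_sound:1"}' + part)
        return params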
From 442b3a3e2472a7fa5e373d8d238cdb65a370fd2c Mon Sep 17 00:00:00 2001
From: ffont
Date: Wed, 24 Jan 2024 10:47:11 +0100
Subject: [PATCH 10/28] Handle case with no valid embeddings

---
 utils/search/backends/solr555pysolr.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/utils/search/backends/solr555pysolr.py b/utils/search/backends/solr555pysolr.py
index 2822431d2..8858df5be 100644
--- a/utils/search/backends/solr555pysolr.py
+++ b/utils/search/backends/solr555pysolr.py
@@ -548,7 +548,10 @@ def search_sounds(self, textual_query='', query_fields=None, query_filter='', of
                     if sa.exists():
                         data = sa.first().get_analysis_data_from_file()
                         if data is not None:
-                            vector = data[config_options['vector_property_name']][0:config_options['vector_size']]
+                            vector_raw = data[config_options['vector_property_name']]
+                            if vector_raw is not None:
+                                vector = vector_raw[0:config_options['vector_size']]
+
 
                 # Set query
                 if vector is not None and vector_field_name is not None:

From 85fd8d2d2aaba11838d1ca9c3803e459cf247712 Mon Sep 17 00:00:00 2001
From: ffont
Date: Wed, 24 Jan 2024 10:48:41 +0100
Subject: [PATCH 11/28] Get appropriate similarity state when search engine
 similarity is enabled

We used to store the similarity_state field in the sound model, but it is no
longer needed when using the search engine based similarity.

---
 search/tests.py                      | 39 +++++++++++++++++--------
 sounds/models.py                     | 40 +++++++++++++++++++++++++---
 sounds/templatetags/display_sound.py |  6 ++---
 sounds/tests/test_sound.py           |  2 ++
 sounds/views.py                      | 15 ++++++-----
 templates/sounds/player.html         |  2 +-
 templates/sounds/sound.html          |  2 +-
 7 files changed, 80 insertions(+), 26 deletions(-)

diff --git a/search/tests.py b/search/tests.py
index 8f76da1e0..388a55337 100644
--- a/search/tests.py
+++ b/search/tests.py
@@ -20,7 +20,7 @@
 
 from django.core.cache import cache
 from django.test import TestCase
-from django.test.utils import skipIf
+from django.test.utils import skipIf, override_settings
 from django.urls import reverse
 from sounds.models import Sound
 from utils.search import SearchResults, SearchResultsPaginator
@@ -142,6 +142,7 @@ def test_search_page_response_ok(self, perform_search_engine_query):
         self.assertEqual(resp.context['error_text'], None)
         self.assertEqual(len(resp.context['docs']), self.NUM_RESULTS)
 
+
     @mock.patch('search.views.perform_search_engine_query')
     def test_search_page_num_queries(self, perform_search_engine_query):
         perform_search_engine_query.return_value = self.perform_search_engine_query_response
@@ -155,16 +156,32 @@ def test_search_page_num_queries(self, perform_search_engine_query):
         cache.clear()
         with self.assertNumQueries(1):
             self.client.get(reverse('sounds-search') + '?cm=1')
-
-        # Now check number of queries when displaying results as packs (i.e., searching for packs)
-        cache.clear()
-        with self.assertNumQueries(5):
-            self.client.get(reverse('sounds-search') + '?only_p=1')
-
-        # Also check packs when displaying in grid mode
-        cache.clear()
-        with self.assertNumQueries(5):
-            self.client.get(reverse('sounds-search') + '?only_p=1&cm=1')
+
+        with override_settings(USE_SEARCH_ENGINE_SIMILARITY=True):
+            # When using search engine similarity, there'll be one extra query performed to get the similarity status of the sounds
+
+            # Now check number of queries when displaying results as packs (i.e., searching for packs)
+            cache.clear()
+            with self.assertNumQueries(6):
+                self.client.get(reverse('sounds-search') + '?only_p=1')
+
+            # Also check packs when displaying in grid mode
+            cache.clear()
+            with self.assertNumQueries(6):
+                self.client.get(reverse('sounds-search') + '?only_p=1&cm=1')
+
+        with override_settings(USE_SEARCH_ENGINE_SIMILARITY=False):
+            # When not using search engine similarity, there'll be one less query performed as the similarity state is retrieved directly from the sound object
+
+            # Now check number of queries when displaying results as packs (i.e., searching for packs)
+            cache.clear()
+            with self.assertNumQueries(5):
+                self.client.get(reverse('sounds-search') + '?only_p=1')
+
+            # Also check packs when displaying in grid mode
+            cache.clear()
+            with self.assertNumQueries(5):
+                self.client.get(reverse('sounds-search') + '?only_p=1&cm=1')
 
     @mock.patch('search.views.perform_search_engine_query')
     def test_search_page_with_filters(self, perform_search_engine_query):
diff --git a/sounds/models.py b/sounds/models.py
index ed47de7f4..817c4c6dc 100644
--- a/sounds/models.py
+++ b/sounds/models.py
@@ -412,9 +412,15 @@ def get_analyzers_data_left_join_sql(self):
 
     def get_analysis_state_essentia_exists_sql(self):
         """Returns the SQL bits to add analysis_state_essentia_exists to the returned data indicating if there is a
-        SoundAnalysis objects existing for th given sound_id for the essentia analyzer and with status OK"""
+        SoundAnalysis object existing for the given sound_id for the essentia analyzer and with status OK"""
         return f" exists(select 1 from sounds_soundanalysis where sounds_soundanalysis.sound_id = sound.id AND sounds_soundanalysis.analyzer = '{settings.FREESOUND_ESSENTIA_EXTRACTOR_NAME}' AND sounds_soundanalysis.analysis_status = 'OK') as analysis_state_essentia_exists,"
 
+    def get_search_engine_similarity_state_sql(self):
+        """Returns the SQL bits to add search_engine_similarity_state to the returned data indicating if there is a
+        SoundAnalysis object for the default similarity analyzer (settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER),
+        the given sound_id, and with status OK"""
+        return f" exists(select 1 from sounds_soundanalysis where sounds_soundanalysis.sound_id = sound.id AND sounds_soundanalysis.analyzer = '{settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER}' AND sounds_soundanalysis.analysis_status = 'OK') as search_engine_similarity_state,"
+
     def bulk_query_solr(self, sound_ids):
         """For each sound, get all fields needed to index the sound in Solr. Using this custom query to avoid the need
         of having to do some extra queries when displaying some fields related to the sound (e.g. for tags).
Using this
@@ -514,6 +520,7 @@ def bulk_query(self, where, order_by, limit, args, include_analyzers_output=Fals
                 accounts_profile.has_avatar as user_has_avatar,
                 %s
                 %s
+                %s
                 ARRAY(
                     SELECT tags_tag.name
                     FROM tags_tag
@@ -530,7 +537,8 @@ def bulk_query(self, where, order_by, limit, args, include_analyzers_output=Fals
             LEFT JOIN tickets_ticket ON tickets_ticket.sound_id = sound.id
             %s
             LEFT OUTER JOIN sounds_remixgroup_sounds ON sounds_remixgroup_sounds.sound_id = sound.id
-        WHERE %s """ % (self.get_analysis_state_essentia_exists_sql(),
+        WHERE %s """ % (self.get_search_engine_similarity_state_sql(),
+                        self.get_analysis_state_essentia_exists_sql(),
                         self.get_analyzers_data_select_sql() if include_analyzers_output else '',
                         ContentType.objects.get_for_model(Sound).id,
                         self.get_analyzers_data_left_join_sql() if include_analyzers_output else '',
@@ -1350,6 +1358,20 @@ def get_geotag_name(self):
             return f'{self.geotag_lat:.2f}, {self.geotag_lon:.3f}'
         else:
             return f'{self.geotag.lat:.2f}, {self.geotag.lon:.3f}'
+
+    @property
+    def ready_for_similarity(self):
+        # Returns True if the sound has been analyzed for similarity and should be available for similarity queries
+        if settings.USE_SEARCH_ENGINE_SIMILARITY:
+            if hasattr(self, 'search_engine_similarity_state'):
+                # If the attribute was precomputed in the query (because the Sound was retrieved using bulk_query), no need to perform extra queries
+                return self.search_engine_similarity_state
+            else:
+                # Otherwise, check if there is a SoundAnalysis object for this sound with the correct analyzer and status
+                return SoundAnalysis.objects.filter(sound_id=self.id, analyzer=settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER, analysis_status='OK').exists()
+        else:
+            # If not using search engine based similarity, then use the old similarity_state DB field
+            return self.similarity_state == "OK"
 
     class Meta:
         ordering = ("-created", )
@@ -1577,7 +1599,7 @@ def bulk_query_id(self, pack_ids, sound_ids_for_pack_id=dict(), exclude_deleted=
                 selected_sounds_data.append({
                     'id': s.id,
                     'username': p.user.username,  # Packs have same username as sounds inside pack
-                    'similarity_state': s.similarity_state,
+                    'ready_for_similarity': s.similarity_state == "OK" if not settings.USE_SEARCH_ENGINE_SIMILARITY else None,  # If using search engine similarity, this needs to be retrieved later (see below)
                     'duration': s.duration,
                     'preview_mp3': s.locations('preview.LQ.mp3.url'),
                     'preview_ogg': s.locations('preview.LQ.ogg.url'),
                     'spectral': s.locations('display.spectral_bw.L.url'),
                     'num_ratings': s.num_ratings,
                     'avg_rating': s.avg_rating
-                })
+                    })
             p.num_sounds_unpublished_precomputed = p.sounds.count() - p.num_sounds
             p.licenses_data_precomputed = ([lid for _, lid in licenses], [lname for lname, _ in licenses])
             p.pack_tags = [{'name': tag, 'count': count, 'browse_url': p.browse_pack_tag_url(tag)}
@@ -1596,6 +1618,16 @@ def bulk_query_id(self, pack_ids, sound_ids_for_pack_id=dict(), exclude_deleted=
             p.num_ratings_precomputed = len(ratings)
             p.avg_rating_precomputed = sum(ratings) / len(ratings) if len(ratings) else 0.0
 
+        if settings.USE_SEARCH_ENGINE_SIMILARITY:
+            # To save an individual query for each selected sound, we get the similarity state of all selected sounds per pack in one single extra query
+            selected_sounds_ids = []
+            for p in packs:
+                selected_sounds_ids += [s['id'] for s in p.selected_sounds_data]
+            sound_ids_ready_for_similarity = SoundAnalysis.objects.filter(sound_id__in=selected_sounds_ids,
analyzer=settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER, analysis_status="OK").values_list('sound_id', flat=True)
+            for p in packs:
+                for s in p.selected_sounds_data:
+                    s['ready_for_similarity'] = s['id'] in sound_ids_ready_for_similarity
+
         return packs
 
     def dict_ids(self, pack_ids, exclude_deleted=True):
diff --git a/sounds/templatetags/display_sound.py b/sounds/templatetags/display_sound.py
index 71cf40499..4d9cdbd70 100644
--- a/sounds/templatetags/display_sound.py
+++ b/sounds/templatetags/display_sound.py
@@ -200,7 +200,7 @@ def display_sound_no_sound_object(context, file_data, player_size, show_bookmark
             'spectral': sound.locations('display.spectral_bw.L.url'),
             'id': sound.id,  # Only used for sounds that do actually have a sound object so we can display bookmark/similarity buttons
             'username': sound.user.username,  # Only used for sounds that do actually have a sound object so we can display bookmark/similarity/remix buttons
-            'similarity_state': sound.similarity_state,  # Only used for sounds that do actually have a sound object so we can display bookmark/similarity/remix buttons
+            'ready_for_similarity': sound.ready_for_similarity,  # Only used for sounds that do actually have a sound object so we can display bookmark/similarity/remix buttons
             'remixgroup_id': sound.remixgroup_id,  # Only used for sounds that do actually have a sound object so we can display bookmark/similarity/remix buttons
             'num_ratings': sound.num_ratings,  # Used to display rating widget in players
             'avg_rating': sound.avg_rating,  # Used to display rating widget in players
@@ -210,7 +210,7 @@ def display_sound_no_sound_object(context, file_data, player_size, show_bookmark
         'sound': {
             'id': file_data.get('id', file_data['preview_mp3'].split('/')[-2]),  # If no id, use a unique fake ID to avoid caching problems
             'username': file_data.get('username', 'nousername'),
-            'similarity_state': file_data.get('similarity_state', 'FA'),
+            'ready_for_similarity': file_data.get('ready_for_similarity', False),
             'duration': file_data['duration'],
             'samplerate': file_data.get('samplerate', 44100),
             'num_ratings': file_data.get('num_ratings', 0),
@@ -236,7 +236,7 @@ def display_sound_no_sound_object(context, file_data, player_size, show_bookmark
         },
         'show_milliseconds': 'true' if ('big' in player_size) else 'false',
         'show_bookmark_button': show_bookmark and 'id' in file_data,
-        'show_similar_sounds_button': show_similar_sounds and 'similarity_state' in file_data,
+        'show_similar_sounds_button': show_similar_sounds and file_data.get('ready_for_similarity', False),
         'show_remix_group_button': show_remix and 'remixgroup_id' in file_data,
        'show_rate_widget': 'avg_rating' in file_data,
         'player_size': player_size,
diff --git a/sounds/tests/test_sound.py b/sounds/tests/test_sound.py
index 889b82ca5..5b6b411fa 100644
--- a/sounds/tests/test_sound.py
+++ b/sounds/tests/test_sound.py
@@ -793,6 +793,7 @@ def _test_similarity_update(self, cache_keys, expected, request_func, similarity
         self.assertEqual(self.sound.similarity_state, 'OK')
         self.assertContains(request_func(user) if user is not None else request_func(), expected)
 
+    @override_settings(USE_SEARCH_ENGINE_SIMILARITY=False)
     def test_similarity_update_display(self):
         self._test_similarity_update(
             self._get_sound_display_cache_keys(),
@@ -801,6 +802,7 @@
             user=self.user,
         )
 
+    @override_settings(USE_SEARCH_ENGINE_SIMILARITY=False)
     def test_similarity_update_view(self):
         self._test_similarity_update(
             self._get_sound_view_footer_top_cache_keys(),
diff --git
a/sounds/views.py b/sounds/views.py index 4e86e6cf0..8d3b871e8 100644 --- a/sounds/views.py +++ b/sounds/views.py @@ -65,6 +65,7 @@ from utils.nginxsendfile import sendfile, prepare_sendfile_arguments_for_sound_download from utils.pagination import paginate from utils.ratelimit import key_for_ratelimiting, rate_per_ip +from utils.search import get_search_engine, SearchEngineException from utils.search.search_sounds import get_random_sound_id_from_search_engine, perform_search_engine_query from utils.similarity_utilities import get_similar_sounds from utils.sound_upload import create_sound, NoAudioException, AlreadyExistsException, CantMoveException, \ @@ -820,8 +821,7 @@ def similar(request, username, sound_id): sound = get_object_or_404(Sound, id=sound_id, moderation_state="OK", - processing_state="OK", - similarity_state="OK") + processing_state="OK") if sound.user.username.lower() != username.lower(): raise Http404 @@ -831,10 +831,13 @@ def similar(request, username, sound_id): sound, request.GET.get('preset', None), settings.NUM_SIMILAR_SOUNDS_PER_PAGE * settings.NUM_SIMILAR_SOUNDS_PAGES) else: # Get similar sounds from solr - from utils.search import get_search_engine - results = get_search_engine().search_sounds(similar_to=sound.id) - similarity_results = [(result['id'], result['score']) for result in results.docs] - + try: + results = get_search_engine().search_sounds(similar_to=sound.id) + similarity_results = [(result['id'], result['score']) for result in results.docs] + except SearchEngineException: + # Search engine not available, return empty list + similarity_results = [] + paginator = paginate(request, [sound_id for sound_id, _ in similarity_results], settings.NUM_SIMILAR_SOUNDS_PER_PAGE) similar_sounds = Sound.objects.ordered_ids(paginator['page'].object_list) tvars = {'similar_sounds': similar_sounds, 'sound': sound} diff --git a/templates/sounds/player.html b/templates/sounds/player.html index f658834ef..3d6b463eb 100644 --- a/templates/sounds/player.html +++ b/templates/sounds/player.html @@ -5,7 +5,7 @@ data-bookmark="{% if show_bookmark_button %}true{% else %}false{% endif %}" data-bookmark-modal-url="{% if show_bookmark_button %}{% url 'bookmarks-add-form-for-sound' sound.id %}{% endif %}" data-add-bookmark-url="{% if show_bookmark_button %}{% url 'add-bookmark' sound.id %}{% endif %}" - data-similar-sounds="{% if show_similar_sounds_button and sound.similarity_state == 'OK' %}true{% else %}false{% endif %}" + data-similar-sounds="{% if show_similar_sounds_button and sound.ready_for_similarity %}true{% else %}false{% endif %}" data-similar-sounds-modal-url="{% if show_similar_sounds_button %}{% url 'sound-similar' sound.username sound.id %}?ajax=1{% endif %}" data-remix-group="{% if show_remix_group_button and sound.remixgroup_id %}true{% else %}false{% endif %}" data-remix-group-modal-url="{% if show_remix_group_button %}{% url 'sound-remixes' sound.username sound.id %}?ajax=1{% endif %}" diff --git a/templates/sounds/sound.html b/templates/sounds/sound.html index 5e222496a..7193e9529 100644 --- a/templates/sounds/sound.html +++ b/templates/sounds/sound.html @@ -65,7 +65,7 @@

 (hunk body lost in extraction; recoverable fragments: one removed line ending in "{% endif %}" and one added line inside a "{% if not sound %} ... {% endif %}" block, presumably switching the template from sound.similarity_state to sound.ready_for_similarity)
diff --git a/templates/search/search.html b/templates/search/search.html index 5a061ce3a..6ddd8adac 100644 --- a/templates/search/search.html +++ b/templates/search/search.html @@ -372,16 +372,22 @@

     ... No results... 😟
     ... Loading map...
-        {% if paginator.count > max_search_results_map_mode %}
-            {% bw_icon 'notification' %} Note that only the first {{ max_search_results_map_mode|bw_intcomma }} search results are shown on the map
+        {% if paginator.count < max_search_results_map_mode %}
+            {% bw_icon 'notification' %} Note that only the first {{ max_search_results_map_mode|bw_intcomma }} search results are shown on the map
+        {% endif %}
     {% endif %}
-    {% endif %}
 {% endif %}
 (surrounding <div> markup in this hunk was lost in extraction)
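Taken together, the series indexes similarity vectors as child documents, answers `similar_to` queries with Solr knn, and gates the UI on `ready_for_similarity`. A minimal sketch of the resulting end-to-end call, reusing only names introduced in these patches (the wrapper function itself is illustrative, not part of the patch):

    # Illustrative sketch of the search-engine similarity flow added by this series.
    from utils.search import get_search_engine, SearchEngineException

    def similar_sound_ids(sound, max_results=15):
        """Return [(sound_id, score), ...] for sounds similar to `sound`, or [] on failure."""
        if not sound.ready_for_similarity:
            # No similarity vector indexed for this sound (yet)
            return []
        try:
            results = get_search_engine().search_sounds(similar_to=sound.id, num_sounds=max_results)
        except SearchEngineException:
            # Mirror the sounds.views.similar fallback when the engine is unavailable
            return []
        return [(result['id'], result['score']) for result in results.docs]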