From bd37e4caebd0091804548b157a1bf7f8789109e4 Mon Sep 17 00:00:00 2001 From: Daniel Valenzuela Date: Fri, 15 Nov 2024 09:56:07 -0300 Subject: [PATCH] feat: incremental reindex_studio management command --- openedx/core/djangoapps/content/search/api.py | 207 +++++++++++------- .../management/commands/reindex_studio.py | 14 +- .../0002_incrementalindexcompleted.py | 21 ++ .../core/djangoapps/content/search/models.py | 9 + .../content/search/tests/test_api.py | 70 ++++++ 5 files changed, 241 insertions(+), 80 deletions(-) create mode 100644 openedx/core/djangoapps/content/search/migrations/0002_incrementalindexcompleted.py diff --git a/openedx/core/djangoapps/content/search/api.py b/openedx/core/djangoapps/content/search/api.py index b1d224b411e4..9d0a2379a0bb 100644 --- a/openedx/core/djangoapps/content/search/api.py +++ b/openedx/core/djangoapps/content/search/api.py @@ -5,7 +5,7 @@ import logging import time -from contextlib import contextmanager +from contextlib import contextmanager, nullcontext from datetime import datetime, timedelta, timezone from functools import wraps from typing import Callable, Generator @@ -24,7 +24,7 @@ from rest_framework.request import Request from common.djangoapps.student.role_helpers import get_course_roles from openedx.core.djangoapps.content.course_overviews.models import CourseOverview -from openedx.core.djangoapps.content.search.models import get_access_ids_for_request +from openedx.core.djangoapps.content.search.models import get_access_ids_for_request, IncrementalIndexCompleted from openedx.core.djangoapps.content_libraries import api as lib_api from xmodule.modulestore.django import modulestore @@ -217,6 +217,83 @@ def _using_temp_index(status_cb: Callable[[str], None] | None = None) -> Generat _wait_for_meili_task(client.delete_index(temp_index_name)) +def _configure_index(index_name): + """ + Configure the index. The following index settings are best changed on an empty index. + Changing them on a populated index will "re-index all documents in the index", which can take some time. + + Args: + index_name (str): The name of the index to configure + """ + client = _get_meilisearch_client() + + # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): + client.index(index_name).update_distinct_attribute(Fields.usage_key) + # Mark which attributes can be used for filtering/faceted search: + client.index(index_name).update_filterable_attributes([ + # Get specific block/collection using combination of block_id and context_key + Fields.block_id, + Fields.block_type, + Fields.context_key, + Fields.usage_key, + Fields.org, + Fields.tags, + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.collections, + Fields.collections + "." + Fields.collections_display_name, + Fields.collections + "." + Fields.collections_key, + Fields.type, + Fields.access_id, + Fields.last_published, + Fields.content + "." + Fields.problem_types, + ]) + # Mark which attributes are used for keyword search, in order of importance: + client.index(index_name).update_searchable_attributes([ + # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. + Fields.display_name, + Fields.block_id, + Fields.content, + Fields.description, + Fields.tags, + Fields.collections, + # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they + # are searchable only if at least one document in the index has a value. If we didn't list them here and, + # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for + # these sub-fields: "Attribute `tags.level3` is not searchable." + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.collections + "." + Fields.collections_display_name, + Fields.collections + "." + Fields.collections_key, + Fields.published + "." + Fields.display_name, + Fields.published + "." + Fields.published_description, + ]) + # Mark which attributes can be used for sorting search results: + client.index(index_name).update_sortable_attributes([ + Fields.display_name, + Fields.created, + Fields.modified, + Fields.last_published, + ]) + + # Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance. + # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy + client.index(index_name).update_ranking_rules([ + "sort", + "words", + "typo", + "proximity", + "attribute", + "exactness", + ]) + + def _recurse_children(block, fn, status_cb: Callable[[str], None] | None = None) -> None: """ Recurse the children of an XBlock and call the given function for each @@ -279,8 +356,31 @@ def is_meilisearch_enabled() -> bool: return False -# pylint: disable=too-many-statements -def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None: +def reset_index(status_cb: Callable[[str], None] | None = None) -> None: + if status_cb is None: + status_cb = log.info + + status_cb("Creating new empty index...") + with _using_temp_index(status_cb) as temp_index_name: + _configure_index(temp_index_name) + status_cb("Index recreated!") + status_cb("Index reset complete.") + + +def init_index(status_cb: Callable[[str], None] | None = None, warn_cb: Callable[[str], None] | None = None) -> None: + if status_cb is None: + status_cb = log.info + if warn_cb is None: + warn_cb = log.warning + + if _index_exists(STUDIO_INDEX_NAME): + warn_cb("A rebuild of the index is required. Please run ./manage.py cms reindex_studio --experimental [--incremental]") + return + + reset_index(status_cb) + + +def rebuild_index(status_cb: Callable[[str], None] | None = None, incremental=False) -> None: """ Rebuild the Meilisearch index from scratch """ @@ -292,7 +392,14 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None: # Get the lists of libraries status_cb("Counting libraries...") - lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')] + keys_indexed = [] + if incremental: + keys_indexed = list(IncrementalIndexCompleted.objects.values_list('context_key', flat=True)) + lib_keys = [ + lib.library_key + for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug').order_by('-id') + if lib.library_key not in keys_indexed + ] num_libraries = len(lib_keys) # Get the list of courses @@ -305,83 +412,19 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None: num_blocks_done = 0 # How many individual components/XBlocks we've indexed status_cb(f"Found {num_courses} courses, {num_libraries} libraries.") - with _using_temp_index(status_cb) as temp_index_name: + with _using_temp_index(status_cb) if not incremental else nullcontext(STUDIO_INDEX_NAME) as index_name: ############## Configure the index ############## - # The following index settings are best changed on an empty index. - # Changing them on a populated index will "re-index all documents in the index, which can take some time" + # The index settings are best changed on an empty index. + # Changing them on a populated index will "re-index all documents in the index", which can take some time # and use more RAM. Instead, we configure an empty index then populate it one course/library at a time. - - # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): - client.index(temp_index_name).update_distinct_attribute(Fields.usage_key) - # Mark which attributes can be used for filtering/faceted search: - client.index(temp_index_name).update_filterable_attributes([ - # Get specific block/collection using combination of block_id and context_key - Fields.block_id, - Fields.block_type, - Fields.context_key, - Fields.usage_key, - Fields.org, - Fields.tags, - Fields.tags + "." + Fields.tags_taxonomy, - Fields.tags + "." + Fields.tags_level0, - Fields.tags + "." + Fields.tags_level1, - Fields.tags + "." + Fields.tags_level2, - Fields.tags + "." + Fields.tags_level3, - Fields.collections, - Fields.collections + "." + Fields.collections_display_name, - Fields.collections + "." + Fields.collections_key, - Fields.type, - Fields.access_id, - Fields.last_published, - Fields.content + "." + Fields.problem_types, - ]) - # Mark which attributes are used for keyword search, in order of importance: - client.index(temp_index_name).update_searchable_attributes([ - # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. - Fields.display_name, - Fields.block_id, - Fields.content, - Fields.description, - Fields.tags, - Fields.collections, - # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they - # are searchable only if at least one document in the index has a value. If we didn't list them here and, - # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for - # these sub-fields: "Attribute `tags.level3` is not searchable." - Fields.tags + "." + Fields.tags_taxonomy, - Fields.tags + "." + Fields.tags_level0, - Fields.tags + "." + Fields.tags_level1, - Fields.tags + "." + Fields.tags_level2, - Fields.tags + "." + Fields.tags_level3, - Fields.collections + "." + Fields.collections_display_name, - Fields.collections + "." + Fields.collections_key, - Fields.published + "." + Fields.display_name, - Fields.published + "." + Fields.published_description, - ]) - # Mark which attributes can be used for sorting search results: - client.index(temp_index_name).update_sortable_attributes([ - Fields.display_name, - Fields.created, - Fields.modified, - Fields.last_published, - ]) - - # Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance. - # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy - client.index(temp_index_name).update_ranking_rules([ - "sort", - "words", - "typo", - "proximity", - "attribute", - "exactness", - ]) + if not incremental: + _configure_index(index_name) ############## Libraries ############## status_cb("Indexing libraries...") - def index_library(lib_key: str) -> list: + def index_library(lib_key: LibraryLocatorV2) -> list: docs = [] for component in lib_api.get_library_components(lib_key): try: @@ -396,7 +439,7 @@ def index_library(lib_key: str) -> list: if docs: try: # Add all the docs in this library at once (usually faster than adding one at a time): - _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + _wait_for_meili_task(client.index(index_name).add_documents(docs)) except (TypeError, KeyError, MeilisearchError) as err: status_cb(f"Error indexing library {lib_key}: {err}") return docs @@ -416,7 +459,7 @@ def index_collection_batch(batch, num_done, library_key) -> int: if docs: try: # Add docs in batch of 100 at once (usually faster than adding one at a time): - _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + _wait_for_meili_task(client.index(index_name).add_documents(docs)) except (TypeError, KeyError, MeilisearchError) as err: status_cb(f"Error indexing collection batch {p}: {err}") return num_done @@ -439,6 +482,8 @@ def index_collection_batch(batch, num_done, library_key) -> int: num_collections_done, lib_key, ) + if incremental: + IncrementalIndexCompleted.objects.get_or_create(context_key=lib_key) status_cb(f"{num_collections_done}/{num_collections} collections indexed for library {lib_key}") num_contexts_done += 1 @@ -464,7 +509,7 @@ def add_with_children(block): if docs: # Add all the docs in this course at once (usually faster than adding one at a time): - _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + _wait_for_meili_task(client.index(index_name).add_documents(docs)) return docs paginator = Paginator(CourseOverview.objects.only('id', 'display_name'), 1000) @@ -473,10 +518,16 @@ def add_with_children(block): status_cb( f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})" ) + if course.id in keys_indexed: + num_contexts_done += 1 + continue course_docs = index_course(course) + if incremental: + IncrementalIndexCompleted.objects.get_or_create(context_key=course.id) num_contexts_done += 1 num_blocks_done += len(course_docs) + IncrementalIndexCompleted.objects.all().delete() status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses, collections and libraries.") diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index 3767ebcba6c9..9fa2b30ea87a 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -19,7 +19,10 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--experimental', action='store_true') - parser.set_defaults(experimental=False) + parser.add_argument('--reset', action='store_true') + parser.add_argument('--init', action='store_true') + parser.add_argument('--incremental', action='store_true') + parser.set_defaults(experimental=False, reset=False, init=False, incremental=False) def handle(self, *args, **options): """ @@ -34,4 +37,11 @@ def handle(self, *args, **options): "Use the --experimental argument to acknowledge and run it." ) - api.rebuild_index(self.stdout.write) + if options["reset"]: + api.reset_index(self.stdout.write) + elif options["init"]: + api.init_index(self.stdout.write) + elif options["incremental"]: + api.rebuild_index(self.stdout.write, incremental=True) + else: + api.rebuild_index(self.stdout.write) diff --git a/openedx/core/djangoapps/content/search/migrations/0002_incrementalindexcompleted.py b/openedx/core/djangoapps/content/search/migrations/0002_incrementalindexcompleted.py new file mode 100644 index 000000000000..a316c35a7dfe --- /dev/null +++ b/openedx/core/djangoapps/content/search/migrations/0002_incrementalindexcompleted.py @@ -0,0 +1,21 @@ +# Generated by Django 4.2.16 on 2024-11-15 12:40 + +from django.db import migrations, models +import opaque_keys.edx.django.models + + +class Migration(migrations.Migration): + + dependencies = [ + ('search', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='IncrementalIndexCompleted', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('context_key', opaque_keys.edx.django.models.LearningContextKeyField(max_length=255, unique=True)), + ], + ), + ] diff --git a/openedx/core/djangoapps/content/search/models.py b/openedx/core/djangoapps/content/search/models.py index 711c493ff895..91e12affc326 100644 --- a/openedx/core/djangoapps/content/search/models.py +++ b/openedx/core/djangoapps/content/search/models.py @@ -65,3 +65,12 @@ def get_access_ids_for_request(request: Request, omit_orgs: list[str] = None) -> course_clause | library_clause ).order_by('-id').values_list("id", flat=True) ) + + +class IncrementalIndexCompleted(models.Model): + """ + Stores the contex keys of aleady indexed courses and libraries for incremental indexing. + """ + context_key = LearningContextKeyField( + max_length=255, unique=True, null=False, + ) diff --git a/openedx/core/djangoapps/content/search/tests/test_api.py b/openedx/core/djangoapps/content/search/tests/test_api.py index 990226f343cf..3949021953a4 100644 --- a/openedx/core/djangoapps/content/search/tests/test_api.py +++ b/openedx/core/djangoapps/content/search/tests/test_api.py @@ -10,12 +10,15 @@ from opaque_keys.edx.keys import UsageKey import ddt +import pytest from django.test import override_settings from freezegun import freeze_time +from meilisearch.errors import MeilisearchApiError from openedx_learning.api import authoring as authoring_api from organizations.tests.factories import OrganizationFactory from common.djangoapps.student.tests.factories import UserFactory +from openedx.core.djangoapps.content.search.models import IncrementalIndexCompleted from openedx.core.djangoapps.content_libraries import api as library_api from openedx.core.djangoapps.content_tagging import api as tagging_api from openedx.core.djangoapps.content.course_overviews.api import CourseOverview @@ -236,6 +239,73 @@ def test_reindex_meilisearch(self, mock_meilisearch): any_order=True, ) + @override_settings(MEILISEARCH_ENABLED=True) + def test_reindex_meilisearch_incremental(self, mock_meilisearch): + + # Add tags field to doc, since reindex calls includes tags + doc_sequential = copy.deepcopy(self.doc_sequential) + doc_sequential["tags"] = {} + doc_vertical = copy.deepcopy(self.doc_vertical) + doc_vertical["tags"] = {} + doc_problem1 = copy.deepcopy(self.doc_problem1) + doc_problem1["tags"] = {} + doc_problem1["collections"] = {'display_name': [], 'key': []} + doc_problem2 = copy.deepcopy(self.doc_problem2) + doc_problem2["tags"] = {} + doc_problem2["collections"] = {'display_name': [], 'key': []} + doc_collection = copy.deepcopy(self.collection_dict) + doc_collection["tags"] = {} + + api.rebuild_index(incremental=True) + assert mock_meilisearch.return_value.index.return_value.add_documents.call_count == 3 + mock_meilisearch.return_value.index.return_value.add_documents.assert_has_calls( + [ + call([doc_sequential, doc_vertical]), + call([doc_problem1, doc_problem2]), + call([doc_collection]), + ], + any_order=True, + ) + + # Now we simulate interruption by patching _wait_for_meili_task to raise an exception + def simulated_interruption(): + yield + yield + raise Exception("Simulated interruption") + with patch("openedx.core.djangoapps.content.search.api._wait_for_meili_task", side_effect=simulated_interruption()): + with pytest.raises(Exception, match="Simulated interruption"): + api.rebuild_index(incremental=True) + assert IncrementalIndexCompleted.objects.all().count() == 1 + api.rebuild_index(incremental=True) + assert IncrementalIndexCompleted.objects.all().count() == 0 + assert mock_meilisearch.return_value.index.return_value.add_documents.call_count == 7 + + @override_settings(MEILISEARCH_ENABLED=True) + def test_reset_meilisearch_index(self, mock_meilisearch): + api.reset_index() + mock_meilisearch.return_value.swap_indexes.assert_called_once() + mock_meilisearch.return_value.create_index.assert_called_once() + mock_meilisearch.return_value.delete_index.call_count = 2 + api.reset_index() + mock_meilisearch.return_value.delete_index.call_count = 4 + + @override_settings(MEILISEARCH_ENABLED=True) + def test_init_meilisearch_index(self, mock_meilisearch): + api.init_index() + mock_meilisearch.return_value.swap_indexes.assert_not_called() + mock_meilisearch.return_value.create_index.assert_not_called() + mock_meilisearch.return_value.delete_index.assert_not_called() + + mock_meilisearch.return_value.get_index.side_effect = [ + MeilisearchApiError("Testing reindex", Mock(code="index_not_found", text=None)), + MeilisearchApiError("Testing reindex", Mock(code="index_not_found", text=None)), + Mock(), + ] + api.init_index() + mock_meilisearch.return_value.swap_indexes.assert_called_once() + mock_meilisearch.return_value.create_index.assert_called_once() + mock_meilisearch.return_value.delete_index.call_count = 2 + @override_settings(MEILISEARCH_ENABLED=True) @patch( "openedx.core.djangoapps.content.search.api.searchable_doc_for_collection",