From ab1451b78111124dcef138cf081cfc3eca468b12 Mon Sep 17 00:00:00 2001
From: farhan
Date: Tue, 5 Mar 2024 14:49:56 +0500
Subject: [PATCH 1/3] chore: Moved structures.py from tubular repository

---
 .../workflows/units-test-scripts-common.yml | 33 +
 Makefile | 4 +-
 scripts/common/README.rst | 71 ++
 scripts/common/__init__.py | 0
 scripts/common/pytest.ini | 0
 scripts/common/requirements/base.in | 4 +
 scripts/common/requirements/base.txt | 24 +
 scripts/common/requirements/testing.in | 4 +
 scripts/common/requirements/testing.txt | 44 ++
 scripts/common/structures.py | 200 ++++++
 scripts/common/tests/__init__.py | 0
 scripts/common/tests/test_splitmongo.py | 502 +++++++++++++
 scripts/common/utils/splitmongo.py | 679 ++++++++++++++++++
 13 files changed, 1564 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/units-test-scripts-common.yml
 create mode 100644 scripts/common/README.rst
 create mode 100644 scripts/common/__init__.py
 create mode 100644 scripts/common/pytest.ini
 create mode 100644 scripts/common/requirements/base.in
 create mode 100644 scripts/common/requirements/base.txt
 create mode 100644 scripts/common/requirements/testing.in
 create mode 100644 scripts/common/requirements/testing.txt
 create mode 100644 scripts/common/structures.py
 create mode 100644 scripts/common/tests/__init__.py
 create mode 100644 scripts/common/tests/test_splitmongo.py
 create mode 100644 scripts/common/utils/splitmongo.py

diff --git a/.github/workflows/units-test-scripts-common.yml b/.github/workflows/units-test-scripts-common.yml
new file mode 100644
index 000000000000..58b4d6ccd842
--- /dev/null
+++ b/.github/workflows/units-test-scripts-common.yml
@@ -0,0 +1,33 @@
+name: units-test-scripts-common
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        python-version: [ '3.8' ]
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r scripts/common/requirements/testing.txt
+
+      - name: Run pytest
+        run: |
+          pytest scripts/common
diff --git a/Makefile b/Makefile
index 1012e558a7d9..5c58a7ff8d41 100644
--- a/Makefile
+++ b/Makefile
@@ -140,7 +140,9 @@ REQ_FILES = \
 	requirements/edx/semgrep \
 	scripts/xblock/requirements \
 	scripts/user_retirement/requirements/base \
-	scripts/user_retirement/requirements/testing
+	scripts/user_retirement/requirements/testing \
+	scripts/common/requirements/base \
+	scripts/common/requirements/testing
 
 define COMMON_CONSTRAINTS_TEMP_COMMENT
 # This is a temporary solution to override the real common_constraints.txt\n# In edx-lint, until the pyjwt constraint in edx-lint has been removed.\n# See BOM-2721 for more details.\n# Below is the copied and edited version of common_constraints\n
diff --git a/scripts/common/README.rst b/scripts/common/README.rst
new file mode 100644
index 000000000000..0b51b5503a64
--- /dev/null
+++ b/scripts/common/README.rst
@@ -0,0 +1,71 @@
+Common Scripts
+==============
+
+`This `_ directory contains some common Python scripts. Some of them were migrated from other repositories.
+
+These scripts can be invoked from any automation/CD framework.
+
+How to run the scripts
+======================
+
+Download the Scripts
+--------------------
+
+To download the scripts, you can perform a partial clone of the edx-platform repository to obtain only the required scripts. 
The following steps demonstrate how to achieve this. Alternatively, you may choose other utilities or libraries for the partial clone.
+
+.. code-block:: bash
+
+   repo_url=git@github.com:openedx/edx-platform.git
+   branch=master
+   directory=scripts/common
+
+   git clone --branch $branch --single-branch --depth=1 --filter=tree:0 $repo_url
+   cd edx-platform
+   git sparse-checkout init --cone
+   git sparse-checkout set $directory
+
+Create Python Virtual Environment
+---------------------------------
+
+Create a Python virtual environment using Python 3.8:
+
+.. code-block:: bash
+
+   python3.8 -m venv ../venv
+   source ../venv/bin/activate
+
+Install Pip Packages
+--------------------
+
+Install the required pip packages using the provided requirements file:
+
+.. code-block:: bash
+
+   pip install -r scripts/common/requirements/base.txt
+
+
+Execute Script
+--------------
+
+You can execute the Python scripts directly with the python command:
+
+.. code-block:: bash
+
+   python scripts/common/structures.py prune plan_file.json
+
+Feel free to customize these steps according to your specific environment and requirements.
+
+Run Test Cases
+==============
+
+Before running test cases, install the testing requirements:
+
+.. code-block:: bash
+
+   pip install -r scripts/common/requirements/testing.txt
+
+Run the test cases using pytest:
+
+.. code-block:: bash
+
+   pytest scripts/common
diff --git a/scripts/common/__init__.py b/scripts/common/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/scripts/common/pytest.ini b/scripts/common/pytest.ini
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/scripts/common/requirements/base.in b/scripts/common/requirements/base.in
new file mode 100644
index 000000000000..70eb4a70a6b0
--- /dev/null
+++ b/scripts/common/requirements/base.in
@@ -0,0 +1,4 @@
+click
+click-log
+pymongo
+edx-opaque-keys
diff --git a/scripts/common/requirements/base.txt b/scripts/common/requirements/base.txt
new file mode 100644
index 000000000000..ddf6f2021bb3
--- /dev/null
+++ b/scripts/common/requirements/base.txt
@@ -0,0 +1,24 @@
+#
+# This file is autogenerated by pip-compile with Python 3.8
+# by the following command:
+#
+#    make upgrade
+#
+click==8.1.7
+    # via
+    #   -r scripts/common/requirements/base.in
+    #   click-log
+click-log==0.4.0
+    # via -r scripts/common/requirements/base.in
+edx-opaque-keys==2.5.1
+    # via -r scripts/common/requirements/base.in
+pbr==6.0.0
+    # via stevedore
+pymongo==3.13.0
+    # via
+    #   -r scripts/common/requirements/base.in
+    #   edx-opaque-keys
+stevedore==5.2.0
+    # via edx-opaque-keys
+typing-extensions==4.10.0
+    # via edx-opaque-keys
diff --git a/scripts/common/requirements/testing.in b/scripts/common/requirements/testing.in
new file mode 100644
index 000000000000..d1e18b775ad4
--- /dev/null
+++ b/scripts/common/requirements/testing.in
@@ -0,0 +1,4 @@
+-r base.txt
+
+pytest
+ddt
diff --git a/scripts/common/requirements/testing.txt b/scripts/common/requirements/testing.txt
new file mode 100644
index 000000000000..be8d5b77d768
--- /dev/null
+++ b/scripts/common/requirements/testing.txt
@@ -0,0 +1,44 @@
+#
+# This file is autogenerated by pip-compile with Python 3.8
+# by the following command:
+#
+#    make upgrade
+#
+click==8.1.7
+    # via
+    #   -r scripts/common/requirements/base.txt
+    #   click-log
+click-log==0.4.0
+    # via -r scripts/common/requirements/base.txt
+ddt==1.7.2
+    # via -r scripts/common/requirements/testing.in
+edx-opaque-keys==2.5.1
+    # via -r scripts/common/requirements/base.txt
+exceptiongroup==1.2.0
+    # via pytest
+iniconfig==2.0.0
+    # via pytest
+packaging==23.2
+    # via pytest
+pbr==6.0.0
+    # via
+    #   -r scripts/common/requirements/base.txt
+    #   stevedore
+pluggy==1.4.0
+    # via pytest
+pymongo==3.13.0
+    # via
+    #   -r scripts/common/requirements/base.txt
+    #   edx-opaque-keys
+pytest==8.0.2
+    # via -r scripts/common/requirements/testing.in
+stevedore==5.2.0
+    # via
+    #   -r scripts/common/requirements/base.txt
+    #   edx-opaque-keys
+tomli==2.0.1
+    # via pytest
+typing-extensions==4.10.0
+    # via
+    #   -r scripts/common/requirements/base.txt
+    #   edx-opaque-keys
diff --git a/scripts/common/structures.py b/scripts/common/structures.py
new file mode 100644
index 000000000000..7aac79f5ed2a
--- /dev/null
+++ b/scripts/common/structures.py
@@ -0,0 +1,200 @@
+#! /usr/bin/env python3
+"""
+Script to detect and prune old Structure documents from the "Split" Modulestore
+MongoDB (edxapp.modulestore.structures by default). See docstring/help for the
+"make_plan" and "prune" commands for more details.
+"""
+
+import logging
+from os import path
+import sys
+
+import click
+import click_log
+
+# Add top-level project path to sys.path before importing scripts code
+sys.path.append(path.abspath(path.join(path.dirname(__file__), '../..')))
+
+from scripts.common.utils.splitmongo import SplitMongoBackend, ChangePlan
+
+LOG = logging.getLogger('structures')
+click_log.basic_config(LOG)
+
+
+@click.group()
+@click.option(
+    '--connection',
+    default="mongodb://localhost:27017",
+    help=(
+        'Connection string to the target mongo database. This defaults to '
+        'localhost without password (that will work against devstack). '
+        'You may need to use urllib.parse.quote_plus() to percent-escape '
+        'your username and password.'
+    )
+)
+@click.option(
+    '--database-name',
+    default='edxapp',
+    help='Name of the edX Mongo database containing the course structures to prune.'
+)
+@click.pass_context
+def cli(ctx, connection, database_name):
+    """
+    Recover space on MongoDB for edx-platform by deleting unreachable,
+    historical course content data. To use, first make a change plan with the
+    "make_plan" command, and then execute that plan against the database with
+    the "prune" command.
+
+    This script provides logic to clean up old, unused course content data for
+    the DraftVersioningModuleStore modulestore, more commonly referred to as the
+    "Split Mongo" or "Split" modulestore (DraftVersioningModuleStore subclasses
+    SplitMongoModuleStore). All courses and assets that have newer style locator
+    keys use DraftVersioningModuleStore. These keys start with "course-v1:",
+    "ccx-v1:", or "block-v1:". Studio authored content data for this modulestore
+    is saved as immutable data structures. The edx-platform code never cleans up
+    old data however, meaning there is an unbounded history of a course's
+    content revisions stored in MongoDB.
+
+    The older modulestore is DraftModuleStore, sometimes called "Old Mongo".
+    This code does not address that modulestore in any way. That modulestore
+    handles courses that use the old "/" separator, such as
+    "MITx/6.002x/2012_Spring", as well as assets starting with "i4x://". 
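+
+    A typical session first creates a plan file and then executes it. The
+    connection string and file name below are illustrative; adjust them for
+    your environment:
+
+        structures.py --connection "mongodb://localhost:27017" make_plan plan.json
+        structures.py --connection "mongodb://localhost:27017" prune plan.json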
+    """
+    if ctx.obj is None:
+        ctx.obj = dict()
+
+    ctx.obj['BACKEND'] = SplitMongoBackend(connection, database_name)
+
+
+@cli.command("make_plan")
+@click_log.simple_verbosity_option(default='INFO')
+@click.argument('plan_file', type=click.File('w'))
+@click.option(
+    '--details',
+    type=click.File('w'),
+    default=None,
+    help="Name of file to write the human-readable details of the Change Plan."
+)
+@click.option(
+    '--retain',
+    default=2,
+    type=click.IntRange(0, None),
+    help=("The maximum number of intermediate structures to preserve for any "
+          "single branch of an active version. This value does not include the "
+          "active or original structures (those are always preserved). Defaults "
+          "to 2. Put 0 here if you want to prune as much as possible.")
+)
+@click.option(
+    '--delay',
+    default=15000,
+    type=click.IntRange(0, None),
+    help=("Delay in milliseconds between queries to fetch structures from MongoDB "
+          "during plan creation. Tune to adjust load on the database.")
+)
+@click.option(
+    '--batch-size',
+    default=10000,
+    type=click.IntRange(1, None),
+    help="How many Structures do we fetch at a time?"
+)
+@click.option(
+    '--ignore-missing/--no-ignore-missing',
+    default=False,
+    help=("Force plan creation even if missing structures are found. Invalid "
+          "ids should be repaired by re-pointing them to the original. "
+          "Review of the resulting plan is highly recommended.")
+)
+@click.option(
+    '--dump-structures/--no-dump-structures',
+    default=False,
+    help="Dump all structures to stderr for debugging or recording state before cleanup."
+)
+@click.pass_context
+def make_plan(ctx, plan_file, details, retain, delay, batch_size, ignore_missing, dump_structures):
+    """
+    Create a Change Plan JSON file describing the operations needed to prune the
+    database. This command is read-only and does not alter the database.
+
+    The Change Plan JSON is a dictionary with two keys:
+
+    "delete" - A sorted array of Structure document IDs to delete. Since MongoDB
+    object IDs are created in ascending order by timestamp, this means that the
+    oldest documents come earlier in the list.
+
+    "update_parents" - A list of [Structure ID, New Parent/Previous ID] pairs.
+    This is used to re-link the oldest preserved Intermediate Structure back to
+    the Original Structure, so that we don't leave the database in a state where
+    a Structure's "previous_version" points to a deleted Structure.
+
+    Specifying a --details file will generate a more verbose, human-readable
+    text description of the Change Plan for verification purposes. The details
+    file will only display Structures that are reachable from an Active Version,
+    so any Structures that are "orphaned" as a result of partial runs of this
+    script or Studio race conditions will not be reflected. That being said,
+    orphaned Structures are detected and properly noted in the Change Plan JSON.
+    """
+    structures_graph = ctx.obj['BACKEND'].structures_graph(delay / 1000.0, batch_size)
+
+    # This will create the details file as a side-effect, if specified.
+    change_plan = ChangePlan.create(structures_graph, retain, ignore_missing, dump_structures, details)
+    change_plan.dump(plan_file)
+
+
+@cli.command()
+@click_log.simple_verbosity_option(default='INFO')
+@click.argument('plan_file', type=click.File('r'))
+@click.option(
+    '--delay',
+    default=15000,
+    type=click.IntRange(0, None),
+    help=("Delay in milliseconds between batch deletions during pruning. 
Tune to " + "adjust load on the database.") +) +@click.option( + '--batch-size', + default=1000, + type=click.IntRange(1, None), + help=("How many Structures do we delete at a time? Tune to adjust load on " + "the database.") +) +@click.option( + '--start', + default=None, + help=("Structure ID to start deleting from. Specifying a Structure ID that " + "is not in the Change Plan is an error. Specifying a Structure ID that " + "has already been deleted is NOT an error, so it's safe to re-run.") +) +@click.pass_context +def prune(ctx, plan_file, delay, batch_size, start): + """ + Prune the MongoDB database according to a Change Plan file. + + This command tries to be as safe as possible. It executes parent updates + before deletes, so an interruption at any point should be safe in that it + won't leave the structure graphs in an inconsistent state. It should also + be safe to resume pruning with the same Change Plan in the event of an + interruption. + + It's also safe to run while Studio is still operating, though you should be + careful to test and tweak the delay and batch_size options to throttle load + on your database. + """ + change_plan = ChangePlan.load(plan_file) + if start is not None and start not in change_plan.delete: + raise click.BadParameter( + "{} is not in the Change Plan {}".format( + start, click.format_filename(plan_file.name) + ), + param_hint='--start' + ) + ctx.obj['BACKEND'].update(change_plan, delay / 1000.0, batch_size, start) + + +if __name__ == '__main__': + # pylint doesn't grok click magic, but this is straight from their docs... + cli(obj={}) # pylint: disable=no-value-for-parameter, unexpected-keyword-arg diff --git a/scripts/common/tests/__init__.py b/scripts/common/tests/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/scripts/common/tests/test_splitmongo.py b/scripts/common/tests/test_splitmongo.py new file mode 100644 index 000000000000..197ee089ec49 --- /dev/null +++ b/scripts/common/tests/test_splitmongo.py @@ -0,0 +1,502 @@ +""" +Test Structure pruning related Split Mongo code. + +IMPORTANT: If you are making changes to this code, please re-enable the +TestSplitMongoBackend tests and run them locally against the MongoDB instance +in your Docker Devstack. See the TestSplitMongoBackend docstring for more info. +""" +import itertools +import sys +import textwrap +import unittest +from datetime import datetime +from io import StringIO +from os import path +from unittest.mock import patch + +import ddt +from bson.objectid import ObjectId +from opaque_keys.edx.locator import CourseLocator, LibraryLocator +from pymongo import MongoClient + +# Add top-level project path to sys.path before importing scripts code +sys.path.append(path.abspath(path.join(path.dirname(__file__), '../..'))) + +from scripts.common.utils.splitmongo import ( + ActiveVersionBranch, ChangePlan, Structure, SplitMongoBackend, StructuresGraph +) + + +def create_test_graph(*version_histories): + """ + Given any number of lists, where each list represents a history of Structure + IDs from oldest to newest, return a StructureGraph matching that + specification. Course names, branch names, and other attributes that exist + for debugging/reporting but do not change pruning behavior will be + automatically generated with plausible values. 
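+
+    For example (with made-up IDs), create_test_graph(["1", "2", "3"]) returns
+    a graph whose Structures form the chain 1 -> 2 -> 3, where "1" is the
+    Original and "3" is the Active Structure of the single generated branch.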
+ """ + all_structures = {} + all_active_version_branches = [] + + active_id_pool = ("A{:023x}".format(i) for i in itertools.count(1)) + course_key_pool = ( + CourseLocator('edx', 'splitmongo', str(i)) for i in itertools.count(1) + ) + branch_pool = itertools.cycle(['draft-branch', 'published-branch']) + + for version_history in version_histories: + assert version_history # The history can't be empty + structure_ids = [str(version) for version in version_history] + + # Create the Original + original_id = structure_ids[0] + history = [Structure(original_id, original_id, None)] + + # Create all other Structures (if any) + for previous_id, current_id in zip(structure_ids, structure_ids[1:]): + history.append(Structure(current_id, original_id, previous_id)) + + # Add to our overall Structures dict (overwrites should be identical or + # our test data is bad). + for structure in history: + if structure.id in all_structures: + assert structure == all_structures[structure.id] + else: + all_structures[structure.id] = structure + + active_version_id = structure_ids[-1] + all_active_version_branches.append( + ActiveVersionBranch( + id=next(active_id_pool), + branch=next(branch_pool), + structure_id=active_version_id, + key=next(course_key_pool), + edited_on=datetime(2012, 5, 2) + + ) + ) + + return StructuresGraph(all_active_version_branches, all_structures) + + +@ddt.ddt +class TestCourseChangePlan(unittest.TestCase): + """ + ChangePlans for single and multiple courses. + """ + + def test_simple(self): + """Simple happy path ChangePlans.""" + graph = create_test_graph(["1", "2", "3", "4"]) + + # Preserve no intermediate structures -- prune the middle structures. + plan_no_intermediate = ChangePlan.create(graph, 0, False, False) + self.assertEqual(plan_no_intermediate.delete, ["2", "3"]) + self.assertEqual(plan_no_intermediate.update_parents, [("4", "1")]) + + # Preserve one intermediate structure + plan_1_intermediate = ChangePlan.create(graph, 1, False, False) + self.assertEqual(plan_1_intermediate.delete, ["2"]) + self.assertEqual(plan_1_intermediate.update_parents, [("3", "1")]) + + # Preserve two intermediate structures -- Do nothing + plan_2_intermediate = ChangePlan.create(graph, 2, False, False) + self.assertEqual(plan_2_intermediate.delete, []) + self.assertEqual(plan_2_intermediate.update_parents, []) + + @ddt.data( + create_test_graph(["1"]), # Original (is also Active) + create_test_graph(["1", "2"]), # "1" = Original, "2" = Active + ) + def test_no_changes(self, graph): + """These scenarios should result in no Changes.""" + plan_1 = ChangePlan.create(graph, 0, False, False) + plan_2 = ChangePlan.create(graph, 2, False, False) + self.assertEqual(plan_1, plan_2) + self.assertEqual(plan_1.delete, []) + self.assertEqual(plan_1.update_parents, []) + + def test_overlapping_shared_history(self): + """Test multiple branches that overlap in what history to preserve.""" + graph = create_test_graph( + ["1", "2", "3"], + ["1", "2", "3", "4", "5"], + ["1", "2", "3", "6"], + ["1", "2", "7", "8", "9", "10"], + ) + plan = ChangePlan.create(graph, 1, False, False) + + # We specified only one intermediate structure in each branch should be + # preserved. So why do we only delete "7" and "8" here? + # "1" is the original structure, and will always be preserved. + # "2" is the intermediate structure preserved by the first branch. It + # won't be deleted, even if other branches might want to flag it for + # deletion. 
+ # "3" would be deleted by the second branch, but it's Active in the + # first, and so is preserved. Active Structures are never deleted. + # "4" is preserved by the second branch. + # "5" is the Active Structure for the second branch. + # "6" is the Active Structure for the third branch. + # "7" is marked for deletion by the fourth branch. + # "8" is marked for deletion by the fourth branch. + # "9" is preserved by the fourth branch. + # "10" is the Active Structure for the fourth branch. + self.assertEqual(plan.delete, ["7", "8"]) + self.assertEqual(plan.update_parents, [("9", "1")]) + + def test_non_overlapping_shared_history(self): + """Test shared history, preserved intermediate set doesn't overlap.""" + graph = create_test_graph( + ["1", "2", "3"], + ["1", "2", "3", "4", "5", "6"], + ) + plan = ChangePlan.create(graph, 0, False, False) + self.assertEqual(plan.delete, ["2", "4", "5"]) + self.assertEqual(plan.update_parents, [("3", "1"), ("6", "1")]) + + graph_save_1 = create_test_graph( + ["1", "2", "3", "4"], + ["1", "2", "3", "4", "5", "6", "7"], + ) + plan_save_1 = ChangePlan.create(graph_save_1, 1, False, False) + self.assertEqual(plan_save_1.delete, ["2", "5"]) + self.assertEqual(plan_save_1.update_parents, [("3", "1"), ("6", "1")]) + + def test_details_output(self): + """Test our details file output.""" + graph = create_test_graph( + ["1"], + ["2", "3"], + ["4", "5", "6"] + ) + buff = StringIO() + buff.name = "test_file.txt" + plan = ChangePlan.create(graph, 0, False, False, buff) + details_txt = buff.getvalue() + + # pylint: disable=line-too-long + expected_output = textwrap.dedent( + """ + == Summary == + Active Version Branches: 3 + Total Structures: 6 + Structures to Save: 5 + Structures to Delete: 1 + Structures to Rewrite Parent Link: 1 + + == Active Versions == + Active Version A00000000000000000000001 [2012-05-02 00:00:00] draft-branch for course-v1:edx+splitmongo+1 + + 1 (active) (original) + + Active Version A00000000000000000000002 [2012-05-02 00:00:00] published-branch for course-v1:edx+splitmongo+2 + + 3 (active) + + 2 (original) + + Active Version A00000000000000000000003 [2012-05-02 00:00:00] draft-branch for course-v1:edx+splitmongo+3 + + 6 (active) (re-link to original) + - 5 + + 4 (original) + + """ + ).lstrip() + # pylint: enable=line-too-long + self.assertEqual(expected_output, details_txt) + self.assertEqual( + plan, + ChangePlan( + delete=["5"], + update_parents=[("6", "4")] + ) + ) + + +class TestSplitMongoBackendHelpers(unittest.TestCase): + """ + Test the static helper methods of SplitMongoBackend. + + Requires no actual database connection. 
+    """
+
+    def test_parse_structure_doc(self):
+        """Test basic parsing of Structures."""
+        original_structure = SplitMongoBackend.parse_structure_doc(
+            {
+                '_id': obj_id(1),
+                'original_version': obj_id(1),
+                'previous_version': None,
+                'extra_data': "This is ignored"
+            }
+        )
+        self.assertEqual(
+            original_structure,
+            Structure(id=str_id(1), original_id=str_id(1), previous_id=None)
+        )
+        self.assertTrue(original_structure.is_original())
+
+        other_structure = SplitMongoBackend.parse_structure_doc(
+            {
+                '_id': obj_id(2),
+                'original_version': obj_id(1),
+                'previous_version': obj_id(1),
+                'extra_data': "This is ignored"
+            }
+        )
+        self.assertEqual(
+            other_structure,
+            Structure(id=str_id(2), original_id=str_id(1), previous_id=str_id(1))
+        )
+        self.assertFalse(other_structure.is_original())
+
+    def test_batch(self):
+        """Test the batch helper that breaks up iterables for DB operations."""
+        self.assertEqual(
+            list(SplitMongoBackend.batch([], 1)),
+            []
+        )
+        self.assertEqual(
+            list(SplitMongoBackend.batch([1, 2, 3], 1)),
+            [[1], [2], [3]]
+        )
+        self.assertEqual(
+            list(SplitMongoBackend.batch([1, 2, 3], 2)),
+            [[1, 2], [3]]
+        )
+        self.assertEqual(
+            list(SplitMongoBackend.batch([1, 2, 3, 4], 2)),
+            [[1, 2], [3, 4]]
+        )
+
+    def test_iter_from_start(self):
+        """Test what we use to resume deletion from a given Structure ID."""
+        all_ids = [1, 2, 3]
+        self.assertEqual(
+            list(SplitMongoBackend.iter_from_start(all_ids, None)),
+            all_ids
+        )
+        self.assertEqual(
+            list(SplitMongoBackend.iter_from_start(all_ids, 1)),
+            all_ids
+        )
+        self.assertEqual(
+            list(SplitMongoBackend.iter_from_start(all_ids, 2)),
+            [2, 3]
+        )
+        self.assertEqual(
+            list(SplitMongoBackend.iter_from_start(all_ids, 3)),
+            [3]
+        )
+        self.assertEqual(
+            list(SplitMongoBackend.iter_from_start(all_ids, 4)),
+            []
+        )
+
+
+@unittest.skip("Requires local MongoDB instance (run manually).")
+class TestSplitMongoBackend(unittest.TestCase):
+    """
+    Tests the MongoDB-specific portions of the code.
+
+    These tests should be about simple read/write from the database. Complex
+    trees of Structures can be created and tested in TestCourseChangePlan
+    without invoking the database.
+
+    These tests will be disabled by default because I didn't want to add MongoDB
+    as a test-time dependency for tubular, and the only decent looking MongoDB
+    mocking library I could find was no longer being maintained. Given how
+    isolated Split Mongo related code is in tubular (nothing else touches it),
+    the main danger of breakage comes from file format changes in edx-platform,
+    which automated testing at this level wouldn't catch anyway.
+
+    So basically, if you want to work on this code, please run these tests
+    locally by spinning up the MongoDB server used for Docker Devstack and
+    commenting out the unittest.skip decorator above.
+    """
+    CONNECT_STR = "mongodb://localhost:27017"
+    DATABASE_NAME = "splitmongo_test"
+
+    def setUp(self):
+        """Clear our test MongoDB instance of data."""
+        super().setUp()
+
+        self.client = MongoClient(self.CONNECT_STR)
+        database = self.client[self.DATABASE_NAME]
+
+        # Remove anything that might have been there from a previous test.
+        database.drop_collection('modulestore.active_versions')
+        database.drop_collection('modulestore.structures')
+
+        # Convenience pointers to our collections.
+        self.active_versions = database['modulestore.active_versions']
+        self.structures = database['modulestore.structures']
+
+        # The backend we should use in our tests for querying. 
+        self.backend = SplitMongoBackend(self.CONNECT_STR, self.DATABASE_NAME)
+        self.seed_data()
+
+    def seed_data(self):
+        """Create a Course and Library."""
+        structure_docs = [
+            # Branch 1
+            dict(_id=obj_id(1), original_version=obj_id(1), previous_version=None),
+            dict(_id=obj_id(2), original_version=obj_id(1), previous_version=obj_id(1)),
+            dict(_id=obj_id(3), original_version=obj_id(1), previous_version=obj_id(2)),
+            dict(_id=obj_id(4), original_version=obj_id(1), previous_version=obj_id(3)),
+
+            # Branch 2
+            dict(_id=obj_id(10), original_version=obj_id(10), previous_version=None),
+            dict(_id=obj_id(11), original_version=obj_id(10), previous_version=obj_id(10)),
+
+            # Branch 3
+            dict(_id=obj_id(20), original_version=obj_id(20), previous_version=None),
+        ]
+        active_versions_docs = [
+            {
+                '_id': obj_id(100),
+                'edited_on': datetime(2012, 5, 2),
+                'org': 'edx',
+                'course': 'split_course',
+                'run': '2017',
+                'versions': {
+                    'draft-branch': obj_id(4),
+                    'published-branch': obj_id(11)
+                }
+            },
+            {
+                '_id': obj_id(101),
+                'edited_on': datetime(2012, 5, 3),
+                'org': 'edx',
+                'course': 'split_library',
+                'run': 'library',
+                'versions': {
+                    'library': obj_id(20),
+                }
+            }
+        ]
+        self.structures.insert_many(structure_docs)
+        self.active_versions.insert_many(active_versions_docs)
+
+    def test_structures_graph(self):
+        """Test pulling a full graph out."""
+        graph = self.backend.structures_graph(0, 100)
+        self.assertEqual(
+            graph.branches,
+            [
+                ActiveVersionBranch(
+                    id=str_id(100),
+                    branch='draft-branch',
+                    structure_id=str_id(4),
+                    key=CourseLocator('edx', 'split_course', '2017'),
+                    edited_on=datetime(2012, 5, 2),
+                ),
+                ActiveVersionBranch(
+                    id=str_id(100),
+                    branch='published-branch',
+                    structure_id=str_id(11),
+                    key=CourseLocator('edx', 'split_course', '2017'),
+                    edited_on=datetime(2012, 5, 2),
+                ),
+                ActiveVersionBranch(
+                    id=str_id(101),
+                    branch='library',
+                    structure_id=str_id(20),
+                    key=LibraryLocator('edx', 'split_library'),
+                    edited_on=datetime(2012, 5, 3),
+                ),
+            ]
+        )
+        self.assertEqual(
+            list(graph.structures.keys()),
+            [str_id(i) for i in [1, 2, 3, 4, 10, 11, 20]]
+        )
+
+    def test_update(self):
+        """Execute a simple update."""
+        self.backend.update(
+            ChangePlan(
+                delete=[str_id(i) for i in [2, 3]],
+                update_parents=[(str_id(4), str_id(1))]
+            ),
+            delay=0
+        )
+        graph = self.backend.structures_graph(0, 100)
+        self.assertEqual(
+            list(graph.structures.keys()),
+            [str_id(i) for i in [1, 4, 10, 11, 20]]
+        )
+        self.assertEqual(
+            graph.structures,
+            {
+                str_id(1): Structure(id=str_id(1), original_id=str_id(1), previous_id=None),
+                # This one got its previous_id rewritten from 3 -> 1
+                str_id(4): Structure(id=str_id(4), original_id=str_id(1), previous_id=str_id(1)),
+                str_id(10): Structure(id=str_id(10), original_id=str_id(10), previous_id=None),
+                str_id(11): Structure(id=str_id(11), original_id=str_id(10), previous_id=str_id(10)),
+                str_id(20): Structure(id=str_id(20), original_id=str_id(20), previous_id=None),
+            }
+        )
+
+    def test_race_condition(self):
+        """Create new Structures during ChangePlan creation."""
+        # Get the real method before we patch it... 
+ real_all_structures_fn = SplitMongoBackend._all_structures # pylint: disable=protected-access + + def add_structures(backend, delay, batch_size): + """Do what _all_structures() would do, then add new Structures.""" + structures = real_all_structures_fn(backend, delay, batch_size) + + # Create new Structures + self.structures.insert_one( + dict(_id=obj_id(5), original_version=obj_id(1), previous_version=obj_id(4)), + ) + self.structures.insert_one( + dict(_id=obj_id(6), original_version=obj_id(1), previous_version=obj_id(5)), + ) + self.structures.insert_one( + dict(_id=obj_id(7), original_version=obj_id(1), previous_version=obj_id(6)), + ) + + # Update the Draft branch of course-v1:edx+split_course+2017 to + # point to one of the new Structures + self.active_versions.update_one( + {'_id': obj_id(100)}, + {'$set': {'versions.draft-branch': obj_id(5)}} + ) + + # Create an entirely new ActiveVersion and point it to the newest + # Structure. + self.active_versions.insert_one( + { + '_id': obj_id(102), + 'edited_on': datetime(2012, 5, 3), + 'org': 'edx', + 'course': 'split_library_race', + 'run': 'library', + 'versions': { + 'library': obj_id(7), + } + } + ) + + return structures + + with patch.object(SplitMongoBackend, '_all_structures', autospec=True) as all_structures_mock: + all_structures_mock.side_effect = add_structures + graph = self.backend.structures_graph(0, 100) + self.assertEqual(len(graph.structures), 10) + self.assertEqual(len(graph.branches), 4) + + plan = ChangePlan.create(graph, 0, False, False) + self.assertNotIn(str_id(5), plan.delete) # Active updated to this for our course. + self.assertNotIn(str_id(7), plan.delete) # Active for our new Library + self.assertIn(str_id(4), plan.delete) # Was our Active before + self.assertIn(str_id(6), plan.delete) # Intermediate structure to new Library + + +def str_id(int_id): + """Return the string version of Object IDs that PyMongo will accept.""" + return "{:024}".format(int_id) + + +def obj_id(int_id): + """Helper to create Object IDs that PyMongo will accept.""" + return ObjectId(str_id(int_id)) diff --git a/scripts/common/utils/splitmongo.py b/scripts/common/utils/splitmongo.py new file mode 100644 index 000000000000..69e1caf29185 --- /dev/null +++ b/scripts/common/utils/splitmongo.py @@ -0,0 +1,679 @@ +""" +This module provides logic to clean up old, unused course content data for the +DraftVersioningModuleStore modulestore, more commonly referred to as the "Split +Mongo" or "Split" modulestore (DraftVersioningModuleStore subclasses +SplitMongoModuleStore). All courses and assets that have newer style locator +keys use DraftVersioningModuleStore. These keys start with "course-v1:", +"ccx-v1:", or "block-v1:". + +The older modulestore is DraftModuleStore, sometimes called "Old Mongo". This +code does not address that modulestore in any way. That modulestore handles +courses that use the old "/" separator, such as "MITx/6.002x/2012_Spring", as +well as assets starting with "i4x://". + +"Split" gets its name from the fact that it separates the Structure of a course +from the content in the leaf nodes. In theory, the Structure is an outline of +the course that contains all the parent/child relations for different content +blocks (chapters, sections, sub-sections, verticals, videos, etc.), as well as +small, commonly inherited metadata like due dates. More detailed information +about any particular block of content is stored in a separate collection as +Definitions. + +Both Structures and Definitions are immutable in Split. 
When a course is edited, a new Structure is created, and the Active Versions
+entry for a course is updated to point to that new Structure. In that way, we
+never get a partially applied edit -- it either succeeds or fails atomically.
+The Active Versions entry for a Course has pointers to "published" and "draft"
+Structures. There is also a special "library" pointer that is only used by
+Content Libraries. We do not need to distinguish between these for the purposes
+of cleanup.
+
+The problem is that Structure documents have become far larger than they were
+intended to be, and we never created code to properly clean them up. As such, it
+is not uncommon for the majority of Mongo storage space to be used by old
+Structure documents that are completely unused (and are unreachable) by LMS or
+Studio.
+
+This module provides cleanup functionality with various tweakable options for
+how much history to preserve. For simplicity, it reads all Structure IDs into
+memory instead of working on subsets of the data. As a practical matter, this
+means that it will work for databases with up to about 10 million Structures
+before RAM usage starts to become a problem.
+"""
+from collections import deque, namedtuple
+from itertools import count, takewhile
+import json
+import logging
+import os
+import sys
+import time
+
+from bson.objectid import ObjectId
+from pymongo import MongoClient, UpdateOne
+from opaque_keys.edx.locator import CourseLocator, LibraryLocator
+
+LOG = logging.getLogger('structures')
+
+
+class StructuresGraph(namedtuple('StructuresGraph', 'branches structures')):
+    """
+    This summarizes the entire set of Structure relationships in a database.
+
+    Each Structure represents a saved state for the Course or Content Library.
+    For each branch ("published", "draft", or "library"), there is a sequence of
+    Structures that starts with an Original and ends in an Active Structure::
+
+        Original -> (Intermediate 1) -> (Intermediate 2) -> ... -> Active
+
+    `branches` is a list of ActiveVersionBranch objects representing what's
+    currently live on the LMS and Studio. Active Structures referenced in this
+    list cannot be removed because it would break the site for users.
+
+    `structures` is a dict of Structure IDs (Strings) to Structure objects
+    (described above). All the Structure objects store ID locations to their
+    parent and original Structures rather than having direct references to them.
+    This is partly because we don't really need to traverse the vast majority of
+    the graph. Look at `ChangePlan` for details on why that is.
+    """
+    def traverse_ids(self, start_id, limit=None, include_start=False):
+        """
+        Given a Structure ID to start from, this will iterate through the
+        previous_id chain for up to `limit` parent relationships. If `limit` is
+        None, it will keep going until it gets through the Original.
+        """
+        if include_start:
+            yield start_id
+
+        current_id = start_id
+        i = 0
+        while current_id in self.structures:
+            if limit is not None and i >= limit:
+                return
+
+            current_id = self.structures[current_id].previous_id
+            if current_id is None:
+                return
+
+            yield current_id
+            i += 1
+
+
+class ActiveVersionBranch(namedtuple('ActiveVersionBranch', 'id branch structure_id key edited_on')):
+    """
+    An Active Version document can point to multiple branches (e.g. "published",
+    "draft"). This object represents one of those branches.
+
+    The value for `branch` can be "draft-branch", "published-branch", or
+    "library". All Courses have a draft-branch and a published-branch. 
Content Libraries have only a "library" branch.
+
+    The value for `key` is the Opaque Key representing the Course or Library,
+    mostly for debugging purposes (they're not a part of the plan file).
+
+    The value for `edited_on` is a timestamp showing the last time the Active
+    Version document was modified -- for a Course, this means when *either* the
+    published-branch or draft-branch was most recently modified. Again, this is
+    not used for pruning, but just provides debug information.
+    """
+    def __str__(self):
+        return "Active Version {} [{}] {} for {}".format(
+            self.id,
+            self.edited_on.strftime('%Y-%m-%d %H:%M:%S'),
+            self.branch,
+            self.key,
+        )
+
+
+class Structure(namedtuple('Structure', 'id original_id previous_id')):
+    """
+    The parts of a SplitMongo Structure document that we care about, namely the
+    ID (str'd version of the ObjectID), and the IDs of the Original and Previous
+    structure documents. The previous_id may be None (Original Structures have
+    no previous version).
+
+    We use a namedtuple for this specifically because it's more space efficient
+    than a dict, and we can have millions of Structures.
+    """
+    def is_original(self):
+        """Is this Structure an original (i.e. should never be deleted)?"""
+        return self.previous_id is None
+
+
+class ChangePlan(namedtuple('ChangePlan', 'delete update_parents')):
+    """
+    Summary of the pruning actions we want a Backend to take.
+
+    The idea of having this data structure and being able to serialize it is so
+    that we can save our plan of action somewhere for debugging, failure
+    recovery, and batching updates.
+
+    `delete` is a list of Structure IDs we want to delete.
+
+    `update_parents` is a list of (structure_id, new_previous_id) tuples
+    representing the previous_id updates we need to make.
+
+    A ChangePlan is purely declarative. It is the responsibility of the
+    Backend to figure out how to implement a ChangePlan safely and efficiently
+    in order to do the actual updates.
+    """
+    def dump(self, file_obj):
+        """Serialize ChangePlan to a file (JSON format)."""
+        json.dump(
+            {
+                "delete": self.delete,
+                "update_parents": self.update_parents,
+            },
+            file_obj,
+            indent=2,
+        )
+        LOG.info(
+            "Wrote Change Plan: %s (%s deletions, %s parent updates)",
+            os.path.realpath(file_obj.name),
+            len(self.delete),
+            len(self.update_parents)
+        )
+
+    @classmethod
+    def load(cls, file_obj):
+        """Load a ChangePlan from a JSON file. Takes a file object."""
+        data = json.load(file_obj)
+        return cls(
+            delete=data["delete"], update_parents=data["update_parents"]
+        )
+
+    @classmethod
+    def create(cls, structures_graph, num_intermediate_structures, ignore_missing, dump_structures, details_file=None):
+        """
+        Given a StructuresGraph and a target number for intermediate Structures
+        to preserve, return a ChangePlan that represents the changes needed to
+        prune the database. The overall strategy is to iterate through all
+        Active Structures, walk back through the ancestors, and add all the
+        Structure IDs we should save to a set. After we have our save set, we
+        know that we can delete all other structures without worrying about
+        whether those Structures are reachable or knowing what their
+        relationships are. This keeps things simpler, and means that we should
+        be more resilient to failures when pruning.
+
+        Structure documents exist in chains of parent/child relationships,
+        starting with an Original Structure, having some number of Intermediate
+        Structures, and ending in an Active Structure::
+
+            Original -> (Intermediate 1) -> (Intermediate 2) -> ... -> Active
+
+        Pruning Rules:
+
+        1. 
All Active Structures must be preserved, as those are being used by + the LMS and Studio to serve course content. + + 2. All Original Structures should be preserved, since those are used by + the LMS and Studio to determine common shared ancestry between + Structures. + + 3. Up to `num_intermediate_structures` Intermediate Structures will be + kept. These Structures are not actually used in edx-platform code, + but they are sometimes used by developers to allow emergency reverts + in course team support situations (e.g. someone accidentally wiped + out their course with a bad import). + + 4. The oldest preserved Intermediate Structure will be modified so that + its `previous_id` is updated to point to the Original Structure. That + way, we're not preserving references to the IDs of Structures that + have been pruned. + + """ + structure_ids_to_save = set() + set_parent_to_original = set() + + branches, structures = structures_graph + + # Figure out which Structures to save... + for branch in branches: + # Anything that's actively being pointed to (is the head of a branch) + # must be preserved. This is what's being served by Studio and LMS. + active_structure_id = branch.structure_id + structure_ids_to_save.add(active_structure_id) + + # All originals will be saved. + structure_ids_to_save.add(structures[active_structure_id].original_id) + + # Save up to `num_intermediate_structures` intermediate nodes + int_structure_ids_to_save = structures_graph.traverse_ids( + active_structure_id, limit=num_intermediate_structures + ) + for int_structure_id in int_structure_ids_to_save: + structure_ids_to_save.add(int_structure_id) + + missing_structure_ids = structure_ids_to_save - structures.keys() + + if ignore_missing: + # Remove missing structures since we can't save them + structure_ids_to_save -= missing_structure_ids + elif len(missing_structure_ids) > 0: + LOG.error("Missing structures detected") + sys.exit(1) + + # Figure out what links to rewrite -- the oldest structure to save that + # isn't an original. + for branch in branches: + rewrite_candidates = takewhile( + lambda s: s in structure_ids_to_save and not structures[s].is_original(), + structures_graph.traverse_ids(branch.structure_id, include_start=True) + ) + # `last_seen` will have the last structure_id from the + # `rewrite_candidates` iterable. + last_seen = deque(rewrite_candidates, 1) + if last_seen: + structure = structures[last_seen.pop()] + # Don't do a rewrite if it's just a no-op... + if structure.original_id != structure.previous_id: + set_parent_to_original.add(structure.id) + + # Sort the items in the ChangePlan. This might not be helpful, but I'm + # hoping that it will keep disk changes more localized and not thrash + # things as much as randomly distributed deletes. Mongo ObjectIDs are + # ordered (they have a timestamp component). 
+        change_plan = cls(
+            delete=sorted(structures.keys() - structure_ids_to_save),
+            update_parents=sorted(
+                (s_id, structures[s_id].original_id)
+                for s_id in set_parent_to_original
+            )
+        )
+
+        if details_file:
+            change_plan.write_details(
+                details_file, structures_graph, structure_ids_to_save, set_parent_to_original
+            )
+
+        if dump_structures:
+            active_structure_ids = {branch.structure_id for branch in branches}
+            for sid in structures:
+                save = sid in structure_ids_to_save
+                active = sid in active_structure_ids
+                relink = sid in set_parent_to_original
+                prev_missing = structures[sid].previous_id is not None and structures[sid].previous_id not in structures
+                LOG.info(f"DUMP id: {sid}, original_id: {structures[sid].original_id}, previous_id: {structures[sid].previous_id}, save: {save}, active: {active}, prev_missing: {prev_missing}, rewrite_previous_to_original: {relink}")
+
+        active_structure_ids = {branch.structure_id for branch in branches}
+        for missing_structure_id in missing_structure_ids:
+            LOG.error(f"Missing structure ID: {missing_structure_id}")
+            original_ids = set()
+            for structure in structures.values():
+                if structure.previous_id == missing_structure_id:
+                    LOG.info(f"Structure {structure.id} points to missing structure with ID: {structure.previous_id}")
+                    original_ids.add(structure.original_id)
+
+            branches_to_log = []
+
+            LOG.info(f"Looking for branches that lead to missing ID {missing_structure_id}")
+            for branch in branches:
+                structure = structures[branch.structure_id]
+                if structure.original_id in original_ids:
+                    for sid in structures_graph.traverse_ids(branch.structure_id):
+                        if sid not in structures:
+                            branches_to_log.append(branch)
+
+            for branch in branches_to_log:
+                LOG.info(f"Branch: {branch}")
+
+                for sid in structures_graph.traverse_ids(branch.structure_id, include_start=True):
+                    if sid in structures:
+                        save = sid in structure_ids_to_save
+                        active = sid in active_structure_ids
+                        relink = sid in set_parent_to_original
+                        prev_missing = structures[sid].previous_id is not None and structures[sid].previous_id not in structures
+                        LOG.info(f"id: {sid}, original_id: {structures[sid].original_id}, previous_id: {structures[sid].previous_id}, save: {save}, active: {active}, prev_missing: {prev_missing}, rewrite_previous_to_original: {relink}")
+
+        return change_plan
+
+    @staticmethod
+    def write_details(details_file, structures_graph, structure_ids_to_save, set_parent_to_original):
+        """
+        Simple dump of the changes we're going to make to the database.
+
+        This method requires information that we don't actually keep in the
+        ChangePlan file, such as the Course IDs and edit times. Because of this,
+        it can only be created at the time the ChangePlan is being generated,
+        and cannot be derived from an existing ChangePlan. The goal was to
+        provide this debug information while keeping the ChangePlan file format
+        as stupidly simple as possible. 
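+
+        A complete sample of the generated format can be seen in the
+        test_details_output test in tests/test_splitmongo.py.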
+ """ + branches, structures = structures_graph + active_structure_ids = {branch.structure_id for branch in branches} + + def text_for(s_id): + """Helper method to format Structures consistently.""" + action = "+" if s_id in structure_ids_to_save else "-" + notes = [] + if s_id in active_structure_ids: + notes.append("(active)") + if s_id in set_parent_to_original: + notes.append("(re-link to original)") + if s_id in structures and structures[s_id].is_original(): + notes.append("(original)") + + if notes: + return "{} {} {}".format(action, s_id, " ".join(notes)) + + return "{} {}".format(action, s_id) + + print("== Summary ==", file=details_file) + print("Active Version Branches: {}".format(len(branches)), file=details_file) + print("Total Structures: {}".format(len(structures)), file=details_file) + print("Structures to Save: {}".format(len(structure_ids_to_save)), file=details_file) + print("Structures to Delete: {}".format(len(structures) - len(structure_ids_to_save)), file=details_file) + print("Structures to Rewrite Parent Link: {}".format(len(set_parent_to_original)), file=details_file) + print("\n== Active Versions ==", file=details_file) + + for branch in branches: + print("{}".format(branch), file=details_file) + for structure_id in structures_graph.traverse_ids(branch.structure_id, include_start=True): + print(text_for(structure_id), file=details_file) + print("", file=details_file) + + LOG.info( + "Wrote Change Details File: %s", os.path.realpath(details_file.name) + ) + + +class SplitMongoBackend: + """ + Interface to the MongoDB backend. This is currently the only supported KV + store for the Split(DraftVersioning)ModuleStore, but having this as a + separate class makes it easier to stub in test data. + + The methods on this class should accept and return backend-agnostic data + structures, so no BSON details should leak out. + """ + def __init__(self, mongo_connection_str, db_name): + self._db = MongoClient( + mongo_connection_str, + connectTimeoutMS=2000, + socketTimeoutMS=300000, # *long* operations + serverSelectionTimeoutMS=2000 + ) + self._active_versions = self._db[db_name].modulestore.active_versions + self._structures = self._db[db_name].modulestore.structures + + def structures_graph(self, delay, batch_size): + """ + Return StructuresGraph for the entire modulestore. + + `batch_size` is the number of structure documents we pull at a time. + `delay` is the delay in seconds between batch queries. + + This has one slight complication. A StructuresGraph is expected to be a + consistent view of the database, but MongoDB doesn't offer a "repeatable + read" transaction isolation mode. That means that Structures may be + added at any time between our database calls. Because of this, we have + to be careful in stitching together something that is safe. The + guarantees we try to make about the StructuresGraph being returned are: + + 1. Every Structure ID in `active_structure_ids` is also in `structures` + 2. If `branches` is stale and there is a new Structure that is Active + in the database, it is *not* in `structures`. + + Scenario A: We fetch branches, then structures + 1. Get Branches (and thus Active Structure IDs) + 2. New Structures created by Studio + 3. Get all Structures + + It is almost certainly the case that the new Structures created in (2) + should be active. Our algorithm works by starting from the Active + Structure IDs that we know about, making a "save" list, and then + deleting all other Structures. 
The problem in this scenario is that we + fetch the new Structures in (3), but we don't know that they're Active + because our `active_structure_ids` comes from (1) and is stale. So we + would in fact delete what should be Active Structures. + + Scenario B: We fetch structures, then branches + 1. Get all Structures + 2. New Structures created by Studio + 3. Get Branches (and thus Active Structure IDs) + + In this scenario, we may see Active Structure IDs that are not in + our Structures dict. This is bad because we won't know how to crawl + their ancestry and mark the appropriate Structure IDs to be saved. + + So the approach we take is Scenario B with a fallback. After we fetch + everything, we go through the Active Structure IDs and make sure that + those Structures and their ancestors exist in `structures`. If they + don't, we make extra fetches to get them. Misses should be rare, so it + shouldn't have a drastic performance impact overall. + + Note that it's safe if the ChangePlan as a whole is a little stale, so + long as it's internally consistent. We only ever delete Structures that + are in the `structures` doc, so a new Active Version that we're + completely unaware of will be left alone. + """ + structures = self._all_structures(delay, batch_size) + branches = self._all_branches() + + # Guard against the race condition that branch.structure_id or its + # ancestors are not in `structures`. Make sure that we add those. + LOG.info( + "Checking for missing Structures (a small number are expected " + "unless edits are disabled during change plan creation)." + ) + missing_count = 0 + for branch in branches: + structure_id = branch.structure_id + while structure_id and (structure_id not in structures): + structures[structure_id] = self._get_structure(structure_id) + missing_count += 1 + LOG.warning( + "Structure %s linked from Active Structure %s (%s) fetched.", + structure_id, + branch.structure_id, + branch.key, + ) + structure_id = structures[structure_id].previous_id + + LOG.info("Finished checking for missing Structures, found %s", missing_count) + + return StructuresGraph(branches, structures) + + def _all_structures(self, delay, batch_size): + """ + Return a dict mapping Structure IDs to Structures for all Structures in + the database. + + `batch_size` is the number of structure documents we pull at a time. + `delay` is the delay in seconds between batch queries. + """ + LOG.info("Fetching all known Structures (this might take a while)...") + LOG.info("Delay in seconds: %s, Batch size: %s", delay, batch_size) + + # Important to keep this as a generator to limit memory usage. + parsed_docs = ( + self.parse_structure_doc(doc) + for doc + in self._structures_from_db(delay, batch_size) + ) + structures = {structure.id: structure for structure in parsed_docs} + LOG.info("Fetched %s Structures", len(structures)) + + return structures + + def _structures_from_db(self, delay, batch_size): + """ + Iterate through all Structure documents in the database. + + `batch_size` is the number of structure documents we pull at a time. + `delay` is the delay in seconds between batch queries. 
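+
+        As a rough sizing example: with batch_size=10000 and delay=15, a
+        database of 1,000,000 Structures is fetched in 100 batches, spending
+        about 100 * 15 = 1500 seconds (25 minutes) sleeping between queries.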
+ """ + cursor = self._structures.find( + projection=['original_version', 'previous_version'] + ) + cursor.batch_size(batch_size) + for i, structure_doc in enumerate(cursor, start=1): + yield structure_doc + if i % batch_size == 0: + LOG.info("Structure Cursor at %s (%s)", i, structure_doc['_id']) + time.sleep(delay) + + def _all_branches(self): + """Retrieve list of all ActiveVersionBranch objects in the database.""" + branches = [] + LOG.info("Fetching all Active Version Branches...") + + for av_doc in self._active_versions.find(): + for branch, obj_id in av_doc['versions'].items(): + structure_id = str(obj_id) + if branch == 'library': + key = LibraryLocator(av_doc['org'], av_doc['course']) + else: + key = CourseLocator(av_doc['org'], av_doc['course'], av_doc['run']) + + branches.append( + ActiveVersionBranch( + str(av_doc['_id']), + branch, + structure_id, + key, + av_doc['edited_on'], + ) + ) + + LOG.info("Fetched %s Active Version Branches", len(branches)) + + return sorted(branches) + + def _get_structure(self, structure_id): + """Get an individual Structure from the database.""" + structure_doc = self._structures.find_one( + {'_id': ObjectId(structure_id)}, + projection=['original_version', 'previous_version'] + ) + return self.parse_structure_doc(structure_doc) + + def update(self, change_plan, delay=1000, batch_size=1000, start=None): + """ + Update the backend according to the relinking and deletions specified in + the change_plan. + """ + # Step 1: Relink - Change the previous pointer for the oldest structure + # we want to keep, so that it points back to the original. We never + # delete the original. Relinking happens before deletion so that we + # never leave our course in a broken state (at worst, parts of it + # become unreachable). + self._update_parents(change_plan.update_parents, delay, batch_size) + + # Step 2: Delete unused Structures + self._delete(change_plan.delete, delay, batch_size, start) + + def _update_parents(self, id_parent_pairs, delay, batch_size): + """ + Update Structure parent relationships. + + `id_parent_pairs` is a list of tuples, where the first element of each + tuple is a Structure ID (str) to target, and the second element is the + Structure ID that will be the new parent of the first element. + """ + for id_parent_pairs_batch in self.batch(id_parent_pairs, batch_size): + updates = [ + UpdateOne( + {'_id': ObjectId(structure_id)}, + {'$set': {'previous_version': ObjectId(previous_id)}} + ) + for structure_id, previous_id in id_parent_pairs_batch + ] + result = self._structures.bulk_write(updates) + LOG.info( + "Updated %s/%s parent relationships.", + result.bulk_api_result['nModified'], + result.bulk_api_result['nMatched'], + ) + time.sleep(delay) + + def _delete(self, structure_ids, delay, batch_size, start=None): + """ + Delete old structures in batches. + + `structure_ids` is a list of Structure IDs to delete. + `delay` is the delay in seconds (floats are ok) between batch deletes. + `batch_size` is how many we try to delete in each batch statement. 
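+
+        If a previous run was interrupted, passing the last deleted Structure
+        ID as `start` resumes from that point; IDs that were already deleted
+        are skipped harmlessly, since delete_many simply matches fewer
+        documents.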
+        """
+        s_ids_with_offset = self.iter_from_start(structure_ids, start)
+        for structure_ids_batch in self.batch(s_ids_with_offset, batch_size):
+            result = self._structures.delete_many(
+                {
+                    '_id': {
+                        '$in': [ObjectId(s_id) for s_id in structure_ids_batch]
+                    }
+                }
+            )
+            LOG.info(
+                "Deleted %s/%s Structures: %s - %s",
+                result.deleted_count,
+                len(structure_ids_batch),
+                structure_ids_batch[0],
+                structure_ids_batch[-1],
+            )
+            time.sleep(delay)
+
+    @staticmethod
+    def parse_structure_doc(structure_doc):
+        """
+        Structure docs are pretty big, but we only care about three top level
+        fields, all of which are ObjectIds:
+
+        _id: The Structure ID
+
+        previous_version: The Structure ID for the parent. An Original
+                          Structure will have None for this field.
+
+        original_version: The Original Structure that this Structure and all
+                          its ancestors are ultimately derived from. An
+                          Original Structure points to itself with this field.
+        """
+        _id = str(structure_doc['_id'])
+        original_id = str(structure_doc['original_version'])
+        previous_id = structure_doc['previous_version']
+        if previous_id is not None:
+            previous_id = str(previous_id)
+        return Structure(_id, original_id, previous_id)
+
+    @staticmethod
+    def batch(iterable, batch_size):
+        """Yield lists of up to `batch_size` in length from `iterable`."""
+        iterator = iter(iterable)
+        curr_batch = []
+        for i in count(1):
+            try:
+                curr_batch.append(next(iterator))
+                if i % batch_size == 0:
+                    yield curr_batch
+                    curr_batch = []
+            except StopIteration:
+                break
+        if curr_batch:
+            yield curr_batch
+
+    @staticmethod
+    def iter_from_start(structure_ids, start=None):
+        """
+        Yields from an iterable once it encounters the `start` value. If `start`
+        is None, just yields from the beginning.
+        """
+        if start is None:
+            for structure_id in structure_ids:
+                yield structure_id
+            return
+
+        for structure_id in structure_ids:
+            if structure_id < start:
+                continue
+            yield structure_id

From 4822e959f7524864af1be1d2f1ee1ba895f39ead Mon Sep 17 00:00:00 2001
From: farhan
Date: Tue, 12 Mar 2024 12:22:10 +0500
Subject: [PATCH 2/3] chore: refactor common to structures_pruning

---
 ...units-test-scripts-structures-pruning.yml} | 4 +-
 Makefile | 4 +-
 scripts/common/requirements/testing.txt | 44 ------------------
 .../{common => structures_pruning}/README.rst | 20 +++++----
 .../__init__.py | 0
 .../{common => structures_pruning}/pytest.ini | 0
 .../requirements/base.in | 2 +-
 .../requirements/base.txt | 8 ++--
 .../requirements/testing.in | 2 +-
 .../requirements/testing.txt | 44 +++++++++++++++++++
 .../structures.py | 2 +-
 .../tests/__init__.py | 0
 .../tests/test_splitmongo.py | 2 +-
 .../utils/splitmongo.py | 0
 14 files changed, 67 insertions(+), 65 deletions(-)
 rename .github/workflows/{units-test-scripts-common.yml => units-test-scripts-structures-pruning.yml} (81%)
 delete mode 100644 scripts/common/requirements/testing.txt
 rename scripts/{common => structures_pruning}/README.rst (71%)
 rename scripts/{common => structures_pruning}/__init__.py (100%)
 rename scripts/{common => structures_pruning}/pytest.ini (100%)
 rename scripts/{common => structures_pruning}/requirements/base.in (100%)
 rename scripts/{common => structures_pruning}/requirements/base.txt (60%)
 rename scripts/{common => structures_pruning}/requirements/testing.in (100%)
 create mode 100644 scripts/structures_pruning/requirements/testing.txt
 rename scripts/{common => structures_pruning}/structures.py (98%)
 rename scripts/{common => structures_pruning}/tests/__init__.py (100%)
 rename scripts/{common => 
 rename scripts/{common => structures_pruning}/utils/splitmongo.py (100%)

diff --git a/.github/workflows/units-test-scripts-common.yml b/.github/workflows/units-test-scripts-structures-pruning.yml
similarity index 81%
rename from .github/workflows/units-test-scripts-common.yml
rename to .github/workflows/units-test-scripts-structures-pruning.yml
index 58b4d6ccd842..fad1802f412a 100644
--- a/.github/workflows/units-test-scripts-common.yml
+++ b/.github/workflows/units-test-scripts-structures-pruning.yml
@@ -26,8 +26,8 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -r scripts/common/requirements/testing.txt
+          pip install -r scripts/structures_pruning/requirements/testing.txt

       - name: Run pytest
         run: |
-          pytest scripts/common
+          pytest scripts/structures_pruning
diff --git a/Makefile b/Makefile
index 5c58a7ff8d41..a6a5222579bc 100644
--- a/Makefile
+++ b/Makefile
@@ -141,8 +141,8 @@ REQ_FILES = \
 	scripts/xblock/requirements \
 	scripts/user_retirement/requirements/base \
 	scripts/user_retirement/requirements/testing \
-	scripts/common/requirements/base \
-	scripts/common/requirements/testing
+	scripts/structures_pruning/requirements/base \
+	scripts/structures_pruning/requirements/testing

 define COMMON_CONSTRAINTS_TEMP_COMMENT
 # This is a temporary solution to override the real common_constraints.txt\n# In edx-lint, until the pyjwt constraint in edx-lint has been removed.\n# See BOM-2721 for more details.\n# Below is the copied and edited version of common_constraints\n
diff --git a/scripts/common/requirements/testing.txt b/scripts/common/requirements/testing.txt
deleted file mode 100644
index be8d5b77d768..000000000000
--- a/scripts/common/requirements/testing.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-#
-# This file is autogenerated by pip-compile with Python 3.8
-# by the following command:
-#
-#    make upgrade
-#
-click==8.1.7
-    # via
-    #   -r scripts/common/requirements/base.txt
-    #   click-log
-click-log==0.4.0
-    # via -r scripts/common/requirements/base.txt
-ddt==1.7.2
-    # via -r scripts/common/requirements/testing.in
-edx-opaque-keys==2.5.1
-    # via -r scripts/common/requirements/base.txt
-exceptiongroup==1.2.0
-    # via pytest
-iniconfig==2.0.0
-    # via pytest
-packaging==23.2
-    # via pytest
-pbr==6.0.0
-    # via
-    #   -r scripts/common/requirements/base.txt
-    #   stevedore
-pluggy==1.4.0
-    # via pytest
-pymongo==3.13.0
-    # via
-    #   -r scripts/common/requirements/base.txt
-    #   edx-opaque-keys
-pytest==8.0.2
-    # via -r scripts/common/requirements/testing.in
-stevedore==5.2.0
-    # via
-    #   -r scripts/common/requirements/base.txt
-    #   edx-opaque-keys
-tomli==2.0.1
-    # via pytest
-typing-extensions==4.10.0
-    # via
-    #   -r scripts/common/requirements/base.txt
-    #   edx-opaque-keys
diff --git a/scripts/common/README.rst b/scripts/structures_pruning/README.rst
similarity index 71%
rename from scripts/common/README.rst
rename to scripts/structures_pruning/README.rst
index 0b51b5503a64..c16a837a93af 100644
--- a/scripts/common/README.rst
+++ b/scripts/structures_pruning/README.rst
@@ -1,9 +1,11 @@
-Common Scripts
-==============
+Structures Pruning Scripts
+==========================
+
+`This `_ directory contains the MongoDB structures pruning script, which was migrated from the
+`tubular `_ repository.
-`This `_ directory contains some common python scripts. Some of them are migrated from the other repositories.

-These scripts could be called from any automation/CD framework.
+This script can be called from any automation/CD framework.

 How to run the scripts
 ======================
@@ -17,7 +19,7 @@ To download the scripts, you can perform a partial clone of the edx-platform rep

     repo_url=git@github.com:openedx/edx-platform.git
     branch=master
-    directory=scripts/common
+    directory=scripts/structures_pruning

     git clone --branch $branch --single-branch --depth=1 --filter=tree:0 $repo_url
     cd edx-platform
@@ -41,7 +43,7 @@ Install the required pip packages using the provided requirements file:

 .. code-block:: bash

-    pip install -r scripts/common/requirements/base.txt
+    pip install -r scripts/structures_pruning/requirements/base.txt


 Execute Script
 --------------
@@ -51,7 +53,7 @@ You can simply execute Python scripts with python command

 .. code-block:: bash

-    python scripts/common/structures.py prune plan_file.json
+    python scripts/structures_pruning/structures.py prune plan_file.json

 Feel free to customize these steps according to your specific environment and requirements.
@@ -62,10 +64,10 @@ Before running test cases, install the testing requirements:

 .. code-block:: bash

-    pip install -r scripts/common/requirements/testing.txt
+    pip install -r scripts/structures_pruning/requirements/testing.txt

 Run the test cases using pytest:

 .. code-block:: bash

-    pytest scripts/common
+    pytest scripts/structures_pruning
diff --git a/scripts/common/__init__.py b/scripts/structures_pruning/__init__.py
similarity index 100%
rename from scripts/common/__init__.py
rename to scripts/structures_pruning/__init__.py
diff --git a/scripts/common/pytest.ini b/scripts/structures_pruning/pytest.ini
similarity index 100%
rename from scripts/common/pytest.ini
rename to scripts/structures_pruning/pytest.ini
diff --git a/scripts/common/requirements/base.in b/scripts/structures_pruning/requirements/base.in
similarity index 100%
rename from scripts/common/requirements/base.in
rename to scripts/structures_pruning/requirements/base.in
index 70eb4a70a6b0..0eb09f693e17 100644
--- a/scripts/common/requirements/base.in
+++ b/scripts/structures_pruning/requirements/base.in
@@ -1,4 +1,4 @@
 click
 click-log
-pymongo
 edx-opaque-keys
+pymongo
diff --git a/scripts/common/requirements/base.txt b/scripts/structures_pruning/requirements/base.txt
similarity index 60%
rename from scripts/common/requirements/base.txt
rename to scripts/structures_pruning/requirements/base.txt
index ddf6f2021bb3..52e37de32bd2 100644
--- a/scripts/common/requirements/base.txt
+++ b/scripts/structures_pruning/requirements/base.txt
@@ -6,17 +6,17 @@
 #
 click==8.1.7
     # via
-    #   -r scripts/common/requirements/base.in
+    #   -r scripts/structures_pruning/requirements/base.in
     #   click-log
 click-log==0.4.0
-    # via -r scripts/common/requirements/base.in
+    # via -r scripts/structures_pruning/requirements/base.in
 edx-opaque-keys==2.5.1
-    # via -r scripts/common/requirements/base.in
+    # via -r scripts/structures_pruning/requirements/base.in
 pbr==6.0.0
     # via stevedore
 pymongo==3.13.0
     # via
-    #   -r scripts/common/requirements/base.in
+    #   -r scripts/structures_pruning/requirements/base.in
     #   edx-opaque-keys
 stevedore==5.2.0
     # via edx-opaque-keys
diff --git a/scripts/common/requirements/testing.in b/scripts/structures_pruning/requirements/testing.in
similarity index 100%
rename from scripts/common/requirements/testing.in
rename to scripts/structures_pruning/requirements/testing.in
index d1e18b775ad4..7066f2bc5e9d 100644
--- a/scripts/common/requirements/testing.in
+++ b/scripts/structures_pruning/requirements/testing.in
@@ -1,4 +1,4 @@
 -r base.txt

-pytest
 ddt
+pytest
diff --git a/scripts/structures_pruning/requirements/testing.txt b/scripts/structures_pruning/requirements/testing.txt
new file mode 100644
index 000000000000..e8b9d3bd7871
--- /dev/null
+++ b/scripts/structures_pruning/requirements/testing.txt
@@ -0,0 +1,44 @@
+#
+# This file is autogenerated by pip-compile with Python 3.8
+# by the following command:
+#
+#    make upgrade
+#
+click==8.1.7
+    # via
+    #   -r scripts/structures_pruning/requirements/base.txt
+    #   click-log
+click-log==0.4.0
+    # via -r scripts/structures_pruning/requirements/base.txt
+ddt==1.7.2
+    # via -r scripts/structures_pruning/requirements/testing.in
+edx-opaque-keys==2.5.1
+    # via -r scripts/structures_pruning/requirements/base.txt
+exceptiongroup==1.2.0
+    # via pytest
+iniconfig==2.0.0
+    # via pytest
+packaging==24.0
+    # via pytest
+pbr==6.0.0
+    # via
+    #   -r scripts/structures_pruning/requirements/base.txt
+    #   stevedore
+pluggy==1.4.0
+    # via pytest
+pymongo==3.13.0
+    # via
+    #   -r scripts/structures_pruning/requirements/base.txt
+    #   edx-opaque-keys
+pytest==8.1.1
+    # via -r scripts/structures_pruning/requirements/testing.in
+stevedore==5.2.0
+    # via
+    #   -r scripts/structures_pruning/requirements/base.txt
+    #   edx-opaque-keys
+tomli==2.0.1
+    # via pytest
+typing-extensions==4.10.0
+    # via
+    #   -r scripts/structures_pruning/requirements/base.txt
+    #   edx-opaque-keys
diff --git a/scripts/common/structures.py b/scripts/structures_pruning/structures.py
similarity index 98%
rename from scripts/common/structures.py
rename to scripts/structures_pruning/structures.py
index 7aac79f5ed2a..6b3f99182f78 100644
--- a/scripts/common/structures.py
+++ b/scripts/structures_pruning/structures.py
@@ -15,7 +15,7 @@
 # Add top-level project path to sys.path before importing scripts code
 sys.path.append(path.abspath(path.join(path.dirname(__file__), '../..')))

-from scripts.common.utils.splitmongo import SplitMongoBackend, ChangePlan
+from scripts.structures_pruning.utils.splitmongo import SplitMongoBackend, ChangePlan

 # Add top-level module path to sys.path before importing tubular code.
 # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
diff --git a/scripts/common/tests/__init__.py b/scripts/structures_pruning/tests/__init__.py
similarity index 100%
rename from scripts/common/tests/__init__.py
rename to scripts/structures_pruning/tests/__init__.py
diff --git a/scripts/common/tests/test_splitmongo.py b/scripts/structures_pruning/tests/test_splitmongo.py
similarity index 99%
rename from scripts/common/tests/test_splitmongo.py
rename to scripts/structures_pruning/tests/test_splitmongo.py
index 197ee089ec49..706afd800778 100644
--- a/scripts/common/tests/test_splitmongo.py
+++ b/scripts/structures_pruning/tests/test_splitmongo.py
@@ -22,7 +22,7 @@
 # Add top-level project path to sys.path before importing scripts code
 sys.path.append(path.abspath(path.join(path.dirname(__file__), '../..')))

-from scripts.common.utils.splitmongo import (
+from scripts.structures_pruning.utils.splitmongo import (
     ActiveVersionBranch, ChangePlan, Structure, SplitMongoBackend, StructuresGraph
 )
diff --git a/scripts/common/utils/splitmongo.py b/scripts/structures_pruning/utils/splitmongo.py
similarity index 100%
rename from scripts/common/utils/splitmongo.py
rename to scripts/structures_pruning/utils/splitmongo.py

From c11e002b289428521269e62d4d1e1fe8f24bdc8e Mon Sep 17 00:00:00 2001
From: farhan
Date: Tue, 12 Mar 2024 14:46:28 +0500
Subject: [PATCH 3/3] chore: Add support for Python 3.12

---
 .github/workflows/units-test-scripts-structures-pruning.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/units-test-scripts-structures-pruning.yml b/.github/workflows/units-test-scripts-structures-pruning.yml
index fad1802f412a..434b617c1736 100644
--- a/.github/workflows/units-test-scripts-structures-pruning.yml
+++ b/.github/workflows/units-test-scripts-structures-pruning.yml
@@ -12,7 +12,7 @@ jobs:

     strategy:
       matrix:
-        python-version: [ '3.8' ]
+        python-version: [ '3.8', '3.12' ]

     steps:
       - name: Checkout code
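To see how the `batch` and `iter_from_start` helpers added to `utils/splitmongo.py` in the first patch cooperate to resume an interrupted delete pass, here is a minimal standalone sketch. The `fake_ids` values are hypothetical (real IDs are 24-character hex ObjectId strings), and the helpers are copied out of the class as plain functions, with `iter_from_start` simplified to a single filtering loop.

.. code-block:: python

    from itertools import count


    def batch(iterable, batch_size):
        """Yield lists of up to `batch_size` in length from `iterable`."""
        iterator = iter(iterable)
        curr_batch = []
        for i in count(1):
            try:
                curr_batch.append(next(iterator))
                if i % batch_size == 0:
                    yield curr_batch
                    curr_batch = []
            except StopIteration:
                break
        if curr_batch:
            yield curr_batch


    def iter_from_start(structure_ids, start=None):
        """Yield IDs, skipping anything that sorts before `start`."""
        for structure_id in structure_ids:
            if start is None or structure_id >= start:
                yield structure_id


    # Hypothetical structure IDs, kept short for readability.
    fake_ids = ['000a', '000b', '000c', '000d', '000e']

    # Resume a previously interrupted delete pass at '000c', two IDs per
    # batch, mirroring `_delete(structure_ids, delay, batch_size, start)`.
    print(list(batch(iter_from_start(fake_ids, start='000c'), batch_size=2)))
    # -> [['000c', '000d'], ['000e']]

Because the resume point is an ordinary string comparison against sorted IDs, a crashed prune run can be restarted by passing the last logged ID as `start` without re-deleting earlier batches.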