From 9cbe1227905d3cb86de9c7512cef84ee5bf2a52c Mon Sep 17 00:00:00 2001 From: Adrian Date: Fri, 24 May 2024 10:38:03 -0600 Subject: [PATCH] Rework backfill task to create sub-tasks (#465) --- celery_config.py | 2 + tasks/__init__.py | 2 + .../backfill_existing_gh_app_installations.py | 113 ++++++++++-------- ...ill_owners_without_gh_app_installations.py | 111 ++++++++++++++--- ..._backfill_existing_gh_app_installations.py | 20 ++-- ...ill_owners_without_gh_app_installations.py | 83 +------------ 6 files changed, 178 insertions(+), 153 deletions(-) diff --git a/celery_config.py b/celery_config.py index 05692c086..b90de9b5d 100644 --- a/celery_config.py +++ b/celery_config.py @@ -107,7 +107,9 @@ def init_celery_tracing(*args, **kwargs): # Backfill GH Apps backfill_existing_gh_app_installations_name = "app.tasks.backfill_existing_gh_app_installations.BackfillExistingGHAppInstallationsTask" +backfill_existing_individual_gh_app_installation_name = "app.tasks.backfill_existing_individual_gh_app_installation.BackfillExistingIndividualGHAppInstallationTask" backfill_owners_without_gh_app_installations_name = "app.tasks.backfill_owners_without_gh_app_installations.BackfillOwnersWithoutGHAppInstallationsTask" +backfill_owners_without_gh_app_installation_individual_name = "app.tasks.backfill_owners_without_gh_app_installation_individual.BackfillOwnersWithoutGHAppInstallationIndividualTask" trial_expiration_task_name = "app.tasks.plan.TrialExpirationTask" trial_expiration_cron_task_name = "app.cron.plan.TrialExpirationCronTask" diff --git a/tasks/__init__.py b/tasks/__init__.py index f5f86d124..29af1ad74 100644 --- a/tasks/__init__.py +++ b/tasks/__init__.py @@ -4,8 +4,10 @@ from tasks.backfill_commit_data_to_storage import backfill_commit_data_to_storage_task from tasks.backfill_existing_gh_app_installations import ( backfill_existing_gh_app_installations_name, + backfill_existing_individual_gh_app_installation_name, ) from tasks.backfill_owners_without_gh_app_installations import ( + backfill_owners_without_gh_app_installation_individual_name, backfill_owners_without_gh_app_installations_name, ) from tasks.brolly_stats_rollup import brolly_stats_rollup_task diff --git a/tasks/backfill_existing_gh_app_installations.py b/tasks/backfill_existing_gh_app_installations.py index 74191f858..f519bc343 100644 --- a/tasks/backfill_existing_gh_app_installations.py +++ b/tasks/backfill_existing_gh_app_installations.py @@ -4,7 +4,10 @@ from sqlalchemy.orm.session import Session from app import celery_app -from celery_config import backfill_existing_gh_app_installations_name +from celery_config import ( + backfill_existing_gh_app_installations_name, + backfill_existing_individual_gh_app_installation_name, +) from database.models.core import GithubAppInstallation, Owner from helpers.backfills import ( add_repos_service_ids_from_provider, @@ -19,13 +22,19 @@ class BackfillExistingGHAppInstallationsTask( BaseCodecovTask, name=backfill_existing_gh_app_installations_name ): - def backfill_existing_gh_apps( + def run_impl( self, db_session: Session, - owner_ids: List[int] = None, - missed_owner_ids=[], + owner_ids: Optional[List[int]] = None, yield_amount: int = 1000, + *args, + **kwargs, ): + log.info( + "Starting Existing GH App backfill task", + ) + + # Backfill gh apps we already have # Get owners that have installations, and installations queries owners_query = ( db_session.query(Owner) @@ -48,70 +57,72 @@ def backfill_existing_gh_apps( ) for gh_app_installation in gh_app_installations: - # Check if gh app has 'all' repositories selected - owner = gh_app_installation.owner - ownerid = gh_app_installation.owner.ownerid + self.app.tasks[ + backfill_existing_individual_gh_app_installation_name + ].apply_async(kwargs=dict(gh_app_installation_id=gh_app_installation.id)) - try: - owner_service = get_owner_provider_service(owner=owner) - is_selection_all = maybe_set_installation_to_all_repos( - db_session=db_session, - owner_service=owner_service, - gh_app_installation=gh_app_installation, - ) + return {"successful": True, "reason": "backfill tasks queued"} + + +RegisteredBackfillExistingGHAppInstallationsTask = celery_app.register_task( + BackfillExistingGHAppInstallationsTask() +) +backfill_existing_gh_app_installations_task = celery_app.tasks[ + RegisteredBackfillExistingGHAppInstallationsTask.name +] - if not is_selection_all: - # Find and add all repos the gh app has access to - add_repos_service_ids_from_provider( - db_session=db_session, - ownerid=ownerid, - owner_service=owner_service, - gh_app_installation=gh_app_installation, - ) - log.info("Successful backfill", extra=dict(ownerid=ownerid)) - except: - log.info( - "Backfill unsuccessful for this owner", extra=dict(ownerid=ownerid) - ) - missed_owner_ids.append(ownerid) - continue - del gh_app_installations +class BackfillExistingIndividualGHAppInstallationTask( + BaseCodecovTask, name=backfill_existing_individual_gh_app_installation_name +): def run_impl( self, db_session: Session, - owner_ids: Optional[List[int]] = None, + gh_app_installation_id: int, *args, **kwargs, ): - log.info( - "Starting Existing GH App backfill task", + gh_app_installation = db_session.query(GithubAppInstallation).get( + gh_app_installation_id ) - missed_owner_ids = [] - - # Backfill gh apps we already have - self.backfill_existing_gh_apps( - db_session=db_session, - owner_ids=owner_ids, - missed_owner_ids=missed_owner_ids, - ) + # Check if gh app has 'all' repositories selected + owner = gh_app_installation.owner + ownerid = gh_app_installation.owner.ownerid log.info( - "Backfill for existing gh apps completed", + "Attempt to backfill gh_app_installation", + extra=dict(owner_id=ownerid, parent_id=self.request.parent_id), ) - log.info( - "Potential owner ids that didn't backfill", - extra=dict(missed_owner_ids=missed_owner_ids), - ) + try: + owner_service = get_owner_provider_service(owner=owner) + is_selection_all = maybe_set_installation_to_all_repos( + db_session=db_session, + owner_service=owner_service, + gh_app_installation=gh_app_installation, + ) - return {"successful": True, "reason": "backfill task finished"} + if not is_selection_all: + # Find and add all repos the gh app has access to + add_repos_service_ids_from_provider( + db_session=db_session, + ownerid=ownerid, + owner_service=owner_service, + gh_app_installation=gh_app_installation, + ) + log.info("Successful backfill", extra=dict(ownerid=ownerid)) + return {"successful": True, "reason": "backfill task finished"} + except: + log.info( + "Backfill unsuccessful for this owner", extra=dict(ownerid=ownerid) + ) + return {"successful": False, "reason": "backfill unsuccessful"} -RegisteredBackfillExistingGHAppInstallationsTask = celery_app.register_task( - BackfillExistingGHAppInstallationsTask() +RegisteredBackfillExistingIndividualGHAppInstallationTask = celery_app.register_task( + BackfillExistingIndividualGHAppInstallationTask() ) -backfill_existing_gh_app_installations_task = celery_app.tasks[ - RegisteredBackfillExistingGHAppInstallationsTask.name +backfill_existing_individual_gh_app_installation_task = celery_app.tasks[ + RegisteredBackfillExistingIndividualGHAppInstallationTask.name ] diff --git a/tasks/backfill_owners_without_gh_app_installations.py b/tasks/backfill_owners_without_gh_app_installations.py index 9e500ddf6..8943ffa78 100644 --- a/tasks/backfill_owners_without_gh_app_installations.py +++ b/tasks/backfill_owners_without_gh_app_installations.py @@ -5,7 +5,10 @@ from sqlalchemy.orm.session import Session from app import celery_app -from celery_config import backfill_owners_without_gh_app_installations_name +from celery_config import ( + backfill_owners_without_gh_app_installation_individual_name, + backfill_owners_without_gh_app_installations_name, +) from database.models.core import ( GITHUB_APP_INSTALLATION_DEFAULT_NAME, GithubAppInstallation, @@ -28,7 +31,6 @@ def backfill_owners_with_integration_without_gh_app( self, db_session: Session, owner_ids: List[int] = None, - missed_owner_ids=[], yield_amount: int = 1000, ): owners_with_integration_id_without_gh_app_query = ( @@ -92,13 +94,12 @@ def backfill_owners_with_integration_without_gh_app( log.info( "Backfill unsuccessful for this owner", extra=dict(ownerid=ownerid) ) - missed_owner_ids.append(ownerid) - continue def run_impl( self, db_session: Session, owner_ids: Optional[List[int]] = None, + yield_amount: int = 1000, *args, **kwargs, ): @@ -106,25 +107,37 @@ def run_impl( "Starting backfill for owners without gh app task", ) - missed_owner_ids = [] - # Backfill owners with legacy integration + adding new gh app - self.backfill_owners_with_integration_without_gh_app( - db_session=db_session, - owner_ids=owner_ids, - missed_owner_ids=missed_owner_ids, + owners_with_integration_id_without_gh_app_query = ( + db_session.query(Owner) + .outerjoin( + GithubAppInstallation, + Owner.ownerid == GithubAppInstallation.ownerid, + ) + .filter( + GithubAppInstallation.ownerid == None, + Owner.integration_id.isnot(None), + Owner.service == "github", + ) ) - log.info( - "Backfill for owners without apps finished", - ) + if owner_ids: + owners_with_integration_id_without_gh_app_query = ( + owners_with_integration_id_without_gh_app_query.filter( + Owner.ownerid.in_(owner_ids) + ) + ) - log.info( - "Potential owner ids that didn't backfill", - extra=dict(missed_owner_ids=missed_owner_ids), + owners: List[Owner] = owners_with_integration_id_without_gh_app_query.yield_per( + yield_amount ) - return {"successful": True, "reason": "backfill task finished"} + for owner in owners: + self.app.tasks[ + backfill_owners_without_gh_app_installation_individual_name + ].apply_async(kwargs=dict(ownerid=owner.ownerid)) + + return {"successful": True, "reason": "backfill tasks queued"} RegisterOwnersWithoutGHAppInstallations = celery_app.register_task( @@ -133,3 +146,67 @@ def run_impl( backfill_owners_without_gh_app_installations = celery_app.tasks[ RegisterOwnersWithoutGHAppInstallations.name ] + + +class BackfillOwnersWithoutGHAppInstallationIndividual( + BaseCodecovTask, name=backfill_owners_without_gh_app_installation_individual_name +): + def run_impl( + self, + db_session: Session, + ownerid: int, + *args, + **kwargs, + ): + owner = db_session.query(Owner).get(ownerid) + + log.info( + "Attempt to create GH App", + extra=dict(owner_id=ownerid, parent_id=self.request.parent_id), + ) + + try: + owner_service = get_owner_provider_service(owner=owner) + + # Create new GH app installation and add all repos the gh app has access to + log.info( + "This owner has no Github App Installation", + extra=dict(ownerid=ownerid), + ) + gh_app_installation = GithubAppInstallation( + owner=owner, + installation_id=owner.integration_id, + app_id=get_config("github", "integration", "id"), + name=GITHUB_APP_INSTALLATION_DEFAULT_NAME, + ) + db_session.add(gh_app_installation) + + is_selection_all = maybe_set_installation_to_all_repos( + db_session=db_session, + owner_service=owner_service, + gh_app_installation=gh_app_installation, + ) + + if not is_selection_all: + # Find and add all repos the gh app has access to + add_repos_service_ids_from_provider( + db_session=db_session, + ownerid=ownerid, + owner_service=owner_service, + gh_app_installation=gh_app_installation, + ) + log.info("Successful backfill", extra=dict(ownerid=ownerid)) + return {"successful": True, "reason": "backfill task finished"} + except: + log.info( + "Backfill unsuccessful for this owner", extra=dict(ownerid=ownerid) + ) + return {"successful": False, "reason": "backfill unsuccessful"} + + +RegisterOwnersWithoutGHAppInstallationIndividual = celery_app.register_task( + BackfillOwnersWithoutGHAppInstallationIndividual() +) +backfill_owners_without_gh_app_installation_individual = celery_app.tasks[ + RegisterOwnersWithoutGHAppInstallationIndividual.name +] diff --git a/tasks/tests/unit/test_backfill_existing_gh_app_installations.py b/tasks/tests/unit/test_backfill_existing_gh_app_installations.py index 6037788c0..5f599c4d3 100644 --- a/tasks/tests/unit/test_backfill_existing_gh_app_installations.py +++ b/tasks/tests/unit/test_backfill_existing_gh_app_installations.py @@ -6,7 +6,7 @@ ) from database.tests.factories.core import OwnerFactory, RepositoryFactory from tasks.backfill_existing_gh_app_installations import ( - BackfillExistingGHAppInstallationsTask, + BackfillExistingIndividualGHAppInstallationTask, ) @@ -51,8 +51,10 @@ def test_gh_app_with_selection_all( return_value=mock_repo_provider, ) - task = BackfillExistingGHAppInstallationsTask() - assert task.run_impl(dbsession, owner_ids=None) == { + task = BackfillExistingIndividualGHAppInstallationTask() + assert task.run_impl( + dbsession, gh_app_installation_id=gh_app_installation.id + ) == { "successful": True, "reason": "backfill task finished", } @@ -98,8 +100,10 @@ def test_gh_app_with_specific_owner_ids( return_value=mock_repo_provider, ) - task = BackfillExistingGHAppInstallationsTask() - assert task.run_impl(dbsession, owner_ids=[owner.ownerid]) == { + task = BackfillExistingIndividualGHAppInstallationTask() + assert task.run_impl( + dbsession, gh_app_installation_id=gh_app_installation.id + ) == { "successful": True, "reason": "backfill task finished", } @@ -165,8 +169,10 @@ def test_gh_app_without_all_repo_selection( return_value=mock_repo_provider, ) - task = BackfillExistingGHAppInstallationsTask() - assert task.run_impl(dbsession, owner_ids=None) == { + task = BackfillExistingIndividualGHAppInstallationTask() + assert task.run_impl( + dbsession, gh_app_installation_id=gh_app_installation.id + ) == { "successful": True, "reason": "backfill task finished", } diff --git a/tasks/tests/unit/test_backfill_owners_without_gh_app_installations.py b/tasks/tests/unit/test_backfill_owners_without_gh_app_installations.py index 3364443c5..d54db6b4c 100644 --- a/tasks/tests/unit/test_backfill_owners_without_gh_app_installations.py +++ b/tasks/tests/unit/test_backfill_owners_without_gh_app_installations.py @@ -3,7 +3,7 @@ from database.models.core import GithubAppInstallation from database.tests.factories.core import OwnerFactory, RepositoryFactory from tasks.backfill_owners_without_gh_app_installations import ( - BackfillOwnersWithoutGHAppInstallations, + BackfillOwnersWithoutGHAppInstallationIndividual, ) @@ -58,8 +58,8 @@ def test_no_previous_app_existing_repos_only( return_value=mock_repo_provider, ) - task = BackfillOwnersWithoutGHAppInstallations() - assert task.run_impl(dbsession, owner_ids=[owner.ownerid]) == { + task = BackfillOwnersWithoutGHAppInstallationIndividual() + assert task.run_impl(dbsession, ownerid=owner.ownerid) == { "successful": True, "reason": "backfill task finished", } @@ -106,8 +106,8 @@ def test_no_previous_app_some_existing_repos( return_value=mock_repo_provider, ) - task = BackfillOwnersWithoutGHAppInstallations() - assert task.run_impl(dbsession, owner_ids=None) == { + task = BackfillOwnersWithoutGHAppInstallationIndividual() + assert task.run_impl(dbsession, ownerid=owner.ownerid) == { "successful": True, "reason": "backfill task finished", } @@ -122,76 +122,3 @@ def test_no_previous_app_some_existing_repos( # Only added the service_id as long as the repository exists in our system as well assert len(new_gh_app_installation.repository_service_ids) == 1 assert new_gh_app_installation.repository_service_ids[0] == repo_service_id - - -# class TestBackfillBothTypesOfOwners(object): -# def test_backfill_with_both_types_of_owners( -# self, mocker, mock_repo_provider, dbsession: Session -# ): -# owner = OwnerFactory(service="github", integration_id=12345) -# gh_app_installation = GithubAppInstallation( -# owner=owner, -# repository_service_ids=None, -# installation_id=owner.integration_id, -# name=GITHUB_APP_INSTALLATION_DEFAULT_NAME, -# ) - -# owner_without_app = OwnerFactory(service="github", integration_id=12345) -# mock_repos = [ -# repo_obj("159089634", "pytest", "python", False, "main", True), -# repo_obj("164948070", "spack", "python", False, "develop", True), -# ] -# for repo in mock_repos: -# repo_data = repo["repo"] -# dbsession.add( -# RepositoryFactory( -# owner=owner_without_app, -# name=repo_data["name"], -# service_id=repo_data["service_id"], -# ) -# ) - -# dbsession.add_all([owner, gh_app_installation, owner_without_app]) -# dbsession.commit() - -# # Mock fn return values; 1st call "all", 2nd call "selected" -# mock_repo_provider.get_gh_app_installation.side_effect = [ -# {"repository_selection": "all"}, -# {"repository_selection": "selected"}, -# ] -# mock_repo_provider.list_repos_using_installation.return_value = mock_repos -# mocker.patch( -# f"tasks.backfill_owners_without_gh_app_installations.get_owner_provider_service", -# return_value=mock_repo_provider, -# ) - -# task = BackfillOwnersWithoutGHAppInstallations() -# assert task.run_impl(dbsession, owner_ids=None) == { -# "successful": True, -# "reason": "backfill task finished", -# } - -# gh_app_installation = ( -# dbsession.query(GithubAppInstallation) -# .filter_by(ownerid=owner.ownerid) -# .first() -# ) -# assert gh_app_installation.owner == owner -# assert gh_app_installation.repository_service_ids == None - -# new_gh_app_installation = ( -# dbsession.query(GithubAppInstallation) -# .filter_by(ownerid=owner_without_app.ownerid) -# .first() -# ) -# assert new_gh_app_installation.owner == owner_without_app -# assert ( -# new_gh_app_installation.installation_id == owner_without_app.integration_id -# ) -# assert len(new_gh_app_installation.repository_service_ids) == len(mock_repos) - -# for repo in mock_repos: -# assert ( -# repo["repo"]["service_id"] -# in new_gh_app_installation.repository_service_ids -# )