Add superuser API endpoints to export and import org data (#1394)
Fixes #890 

This PR introduces new streaming, superuser-only API endpoints to export
and import database information for an organization. New Administrator
deployment documentation covering how to manage the process and copy files
between S3 buckets as needed is also included.
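For reference, a rough sketch of how a superuser client might drive the new endpoints is below. The exact URL paths, the bearer-token header format, and the file handling are assumptions for illustration, not confirmed by this diff; consult the deployment documentation added in this PR.

# Hypothetical client sketch for the new streaming endpoints.
# The paths "/orgs/{oid}/export/json" and "/orgs/import/json" are
# assumptions; adjust to match the deployed API.
import requests

API = "https://app.example.com/api"
HEADERS = {"Authorization": "Bearer <superuser-token>"}
ORG_ID = "<org-uuid>"

# Stream the export to disk so large orgs are not buffered in memory.
with requests.get(
    f"{API}/orgs/{ORG_ID}/export/json", headers=HEADERS, stream=True
) as resp:
    resp.raise_for_status()
    with open("org-export.json", "wb") as fh:
        for chunk in resp.iter_content(chunk_size=1024 * 1024):
            fh.write(chunk)

# Re-import on the target deployment by streaming the file back up.
with open("org-export.json", "rb") as fh:
    resp = requests.post(f"{API}/orgs/import/json", headers=HEADERS, data=fh)
resp.raise_for_status()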

---------

Co-authored-by: Henry Wilkinson <[email protected]>
Co-authored-by: Ilya Kreymer <[email protected]>
3 people authored Jul 2, 2024
1 parent bdfc094 commit f076e7d
Showing 16 changed files with 866 additions and 18 deletions.
10 changes: 6 additions & 4 deletions backend/btrixcloud/basecrawls.py
@@ -49,7 +49,7 @@
 
 
 # ============================================================================
-# pylint: disable=too-many-instance-attributes
+# pylint: disable=too-many-instance-attributes, too-many-public-methods
 class BaseCrawlOps:
     """operations that apply to all crawls"""
 
@@ -137,7 +137,7 @@ async def _files_to_resources(
             return []
 
         crawl_files = [CrawlFile(**data) for data in files]
-        return await self._resolve_signed_urls(crawl_files, org, crawlid, qa_run_id)
+        return await self.resolve_signed_urls(crawl_files, org, crawlid, qa_run_id)
 
     async def get_wacz_files(self, crawl_id: str, org: Organization):
         """Return list of WACZ files associated with crawl."""
@@ -427,13 +427,15 @@ async def _resolve_crawl_refs(
 
         return crawl
 
-    async def _resolve_signed_urls(
+    async def resolve_signed_urls(
         self,
         files: List[CrawlFile],
         org: Organization,
         crawl_id: Optional[str] = None,
         qa_run_id: Optional[str] = None,
+        update_presigned_url: bool = False,
     ) -> List[CrawlFileOut]:
+        """Regenerate presigned URLs for files as necessary"""
        if not files:
            print("no files")
            return []
@@ -446,7 +448,7 @@ async def _resolve_signed_urls(
             presigned_url = file_.presignedUrl
             now = dt_now()
 
-            if not presigned_url or now >= file_.expireAt:
+            if update_presigned_url or not presigned_url or now >= file_.expireAt:
                 exp = now + delta
                 presigned_url = await self.storage_ops.get_presigned_url(
                     org, file_, self.presign_duration_seconds
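The new update_presigned_url flag forces URL regeneration even when a cached presigned URL has not yet expired, which an export can use to guarantee fresh links. A minimal standalone sketch of the refresh rule (names mirror the diff, but this is illustrative, not the real method):

# Standalone sketch of the refresh condition added above.
from datetime import datetime, timedelta, timezone

def needs_new_presign(presigned_url, expire_at, update_presigned_url=False):
    """Return True when a fresh presigned URL must be generated."""
    now = datetime.now(timezone.utc)
    # Forced refresh (e.g. for export), missing URL, or expired URL.
    return update_presigned_url or not presigned_url or now >= expire_at

# An expired URL always triggers a refresh...
expired = datetime.now(timezone.utc) - timedelta(minutes=1)
assert needs_new_presign("https://bucket/file.wacz", expired)
# ...and so does a missing one (short-circuit skips the date compare).
assert needs_new_presign(None, None)
# A still-valid URL is refreshed only when the new flag is set.
valid = datetime.now(timezone.utc) + timedelta(hours=1)
assert not needs_new_presign("https://bucket/file.wacz", valid)
assert needs_new_presign("https://bucket/file.wacz", valid, update_presigned_url=True)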
2 changes: 1 addition & 1 deletion backend/btrixcloud/crawls.py
@@ -944,7 +944,7 @@ async def get_qa_run_for_replay(
         if not org:
             raise HTTPException(status_code=400, detail="missing_org")
 
-        resources = await self._resolve_signed_urls(
+        resources = await self.resolve_signed_urls(
             qa_run.files, org, crawl.id, qa_run_id
         )
 
2 changes: 2 additions & 0 deletions backend/btrixcloud/main.py
@@ -162,6 +162,8 @@ def main():
 
     init_uploads_api(*base_crawl_init)
 
+    org_ops.set_base_crawl_ops(base_crawl_ops)
+
     user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops)
 
     background_job_ops.set_ops(base_crawl_ops, profiles)
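The late org_ops.set_base_crawl_ops(base_crawl_ops) call presumably exists because OrgOps and BaseCrawlOps each need a reference to the other, so one reference must be injected after both objects are constructed. A minimal sketch of that setter-injection pattern (constructors heavily abbreviated; the real ones take many more collaborators):

# Illustrative setter injection to break a constructor-time cycle;
# not the real OrgOps/BaseCrawlOps signatures.
class OrgOps:
    def __init__(self):
        self.base_crawl_ops = None  # filled in once crawl ops exist

    def set_base_crawl_ops(self, base_crawl_ops):
        """Late-bind crawl ops so org exports can resolve crawl files."""
        self.base_crawl_ops = base_crawl_ops

class BaseCrawlOps:
    def __init__(self, org_ops):
        self.org_ops = org_ops

org_ops = OrgOps()
base_crawl_ops = BaseCrawlOps(org_ops)
org_ops.set_base_crawl_ops(base_crawl_ops)  # mirrors the main.py wiring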
2 changes: 1 addition & 1 deletion backend/btrixcloud/main_op.py
@@ -39,7 +39,7 @@ def main():
 
     user_manager = init_user_manager(mdb, email, invite_ops)
 
-    org_ops = OrgOps(mdb, invite_ops)
+    org_ops = OrgOps(mdb, invite_ops, user_manager)
 
     event_webhook_ops = EventWebhookOps(mdb, org_ops)
 
56 changes: 56 additions & 0 deletions backend/btrixcloud/models.py
@@ -1155,6 +1155,40 @@ async def serialize_for_user(self, user: User, user_manager) -> OrgOut:
         return OrgOut.from_dict(result)
 
 
+# ============================================================================
+class OrgOutExport(Organization):
+    """Org out for export"""
+
+    # Additional field so export contains user names and emails
+    userDetails: Optional[List[Dict[str, Union[str, int, UUID]]]]
+
+    async def serialize_for_export(self, user_manager):
+        """Serialize result with users for org export"""
+
+        result = self.to_dict()
+        user_details = []
+        keys = list(self.users.keys())
+        user_list = await user_manager.get_user_names_by_ids(keys)
+
+        for org_user in user_list:
+            id_ = str(org_user["id"])
+            role = self.users.get(id_)
+            if not role:
+                continue
+
+            user_details.append(
+                {
+                    "id": id_,
+                    "role": role.value,
+                    "name": org_user.get("name", ""),
+                    "email": org_user.get("email", ""),
+                }
+            )
+
+        result["userDetails"] = user_details
+        return self.from_dict(result)
+
+
 # ============================================================================
 class OrgMetrics(BaseModel):
     """Organization API metrics model"""
@@ -1176,6 +1210,27 @@ class OrgMetrics(BaseModel):
     publicCollectionsCount: int
 
 
+# ============================================================================
+class OrgImportExportData(BaseModel):
+    """Model for org import/export data"""
+
+    dbVersion: str
+    org: Dict[str, Any]
+    profiles: List[Dict[str, Any]]
+    workflows: List[Dict[str, Any]]
+    workflowRevisions: List[Dict[str, Any]]
+    items: List[Dict[str, Any]]
+    pages: List[Dict[str, Any]]
+    collections: List[Dict[str, Any]]
+
+
+# ============================================================================
+class OrgImportExport(BaseModel):
+    """Model for org import/export"""
+
+    data: OrgImportExportData
+
+
 # ============================================================================
 
 ### PAGINATION ###
@@ -1470,6 +1525,7 @@ class WebhookNotification(BaseMongoModel):
 ### BACKGROUND JOBS ###
 
 
+# ============================================================================
 class BgJobType(str, Enum):
     """Background Job Types"""
 
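Taken together, OrgImportExport and OrgImportExportData imply a single JSON envelope for the whole export. An illustrative Python literal of that shape follows; every value is an invented placeholder, including the numeric role code:

# Invented example document matching the models above; real exports
# contain full serialized records, not empty lists.
export_doc = {
    "data": {
        "dbVersion": "1.0",  # placeholder database schema version
        "org": {
            "id": "<org-uuid>",
            "name": "Example Org",
            "userDetails": [
                {"id": "<user-uuid>", "role": 40,  # placeholder role value
                 "name": "Jane", "email": "jane@example.com"},
            ],
        },
        "profiles": [],
        "workflows": [],
        "workflowRevisions": [],
        "items": [],  # crawls and uploads
        "pages": [],
        "collections": [],
    }
}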